/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f): switch to .text, align (.align 4 is a power-of-two request on
 * AArch64 gas, i.e. 16 bytes), export f as a function symbol and place its
 * label.
 * END(f): record the size of f as the distance from its label to here. */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;


/* vmxx_f32: emit one float multiply term of a matrix column, or nothing.
 *   i    - bitmask of the terms present in the column being generated
 *   mask - the single bit that identifies this term
 *   opd  - accumulator, opa - input vector, opb - coefficient lane
 * Nothing is emitted when the bit is clear in i.  When a lower bit of i is
 * also set an earlier term has already written opd, so accumulate with fmla;
 * otherwise this is the first term and fmul initialises opd. */
.macro vmxx_f32 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1)
        fmla        \opd, \opa, \opb
    .else
        fmul        \opd, \opa, \opb
    .endif
  .endif
.endm

/* vadd_f32: emit the additive term of a float matrix column, or nothing.
 *   i, mask - as for vmxx_f32 (callers pass mask = 16)
 *   opd, opa, opb - operands of the fadd
 *   stupidsyntax1, stupidsyntax2 - destination and source spelled in the
 *       arrangement used by the plain register mov (callers pass .16b forms
 *       of opd and opb)
 * When a lower bit of i is set the multiply terms have already produced a
 * value, so add to it; otherwise the column is just a copy of the constant. */
.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
  .if (\i) & \mask
    .if (\i) & (\mask - 1)
        fadd        \opd, \opa, \opb
    .else
        mov         \stupidsyntax1, \stupidsyntax2
    .endif
  .endif
.endm

/* vmxx_s16: emit one widening 16-bit multiply term (low half of the input
 * vector), or nothing.
 *   i    - bitmask of the terms present in the column being generated
 *   mask - the single bit that identifies this term
 *   opd  - 32-bit accumulator, opa - input vector, opb - coefficient lane
 * The accumulate form (smlal) is used when a lower bit of i or bit value 16
 * is set, i.e. when something has already been placed in opd; otherwise
 * smull initialises opd. */
.macro vmxx_s16 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1 + 16)
        smlal       \opd, \opa, \opb
    .else
        smull       \opd, \opa, \opb
    .endif
  .endif
.endm

/* vmxx2_s16: same selection logic as vmxx_s16, but emits the "2" forms
 * (smlal2 / smull2), which operate on the upper half of the input vector. */
.macro vmxx2_s16 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1 + 16)
        smlal2      \opd, \opa, \opb
    .else
        smull2      \opd, \opa, \opb
    .endif
  .endif
.endm

/* x0 = dst
 * x1 = src
 * x2 = count
 * x3 = params
 * x4 = column0_fn
 * x5 = column1_fn
 * x6 = column2_fn
 * x7 = column3_fn
 * x8 = store_fn
 * x9 = load_fn
 */
/* Generate the per-column multiply routines.  Each pass of the .irp emits,
 * for every column, one routine for term mask i (0..15) and one routine
 * labelled _n<i> for the complementary mask (i ^ 31), i.e. 31..16.
 * Mask bits 0-3 select which of the four input channels contribute to the
 * column; bit value 16 selects the additive constant.
 *
 * Integer routines: inputs in v12-v15 (8 x 16-bit each), coefficients in
 * v0/v1, additive constants in the lanes of v4; v6/v7 are 32-bit scratch
 * accumulators and the column result is narrowed (shift right by 8, unsigned
 * saturating) into v8, v9, v10 or v11.
 * Float routines: two groups of four pixels, inputs v12-v15 and v20-v23,
 * coefficients in v0-v3, additive constants in v4-v7; results go to v8-v11
 * and v16-v19.
 * Every routine finishes by branching to the next stage through x5, x6, x7
 * (next column) or x8 (store). */
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

/* ---- integer, column 0 ---- */
.align 6
colormatrix_int_col0_\i:
      .if \i & 16
        dup         v6.4s, v4.s[0]
        dup         v7.4s, v4.s[0]
      .endif
        vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
        vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
        vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
        vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
        vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
        vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
        vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
        vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
        sqshrun     v8.4h, v6.4s, #8
        sqshrun2    v8.8h, v7.4s, #8
        br          x5

colormatrix_int_col0_n\i:
      .if (\i^31) & 16
        dup         v6.4s, v4.s[0]
        dup         v7.4s, v4.s[0]
      .endif
        vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
        vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
        vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
        vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
        vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
        vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
        vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
        vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
        sqshrun     v8.4h, v6.4s, #8
        sqshrun2    v8.8h, v7.4s, #8
        br          x5

/* ---- integer, column 1 ---- */
.align 6
colormatrix_int_col1_\i:
      .if \i & 16
        dup         v6.4s, v4.s[1]
        dup         v7.4s, v4.s[1]
      .endif
        vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
        vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
        vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
        vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
        vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
        vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
        vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
        vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
        sqshrun     v9.4h, v6.4s, #8
        sqshrun2    v9.8h, v7.4s, #8
        br          x6

colormatrix_int_col1_n\i:
      .if (\i^31) & 16
        dup         v6.4s, v4.s[1]
        dup         v7.4s, v4.s[1]
      .endif
        vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
        vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
        vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
        vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
        vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
        vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
        vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
        vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
        sqshrun     v9.4h, v6.4s, #8
        sqshrun2    v9.8h, v7.4s, #8
        br          x6

/* ---- integer, column 2 ---- */
.align 6
colormatrix_int_col2_\i:
      .if \i & 16
        dup         v6.4s, v4.s[2]
        dup         v7.4s, v4.s[2]
      .endif
        vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
        vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
        vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
        vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
        vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
        vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
        vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
        vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
        sqshrun     v10.4h, v6.4s, #8
        sqshrun2    v10.8h, v7.4s, #8
        br          x7

colormatrix_int_col2_n\i:
      .if (\i^31) & 16
        dup         v6.4s, v4.s[2]
        dup         v7.4s, v4.s[2]
      .endif
        vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
        vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
        vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
        vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
        vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
        vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
        vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
        vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
        sqshrun     v10.4h, v6.4s, #8
        sqshrun2    v10.8h, v7.4s, #8
        br          x7

/* ---- integer, column 3 ---- */
.align 6
colormatrix_int_col3_\i:
      .if \i & 16
        dup         v6.4s, v4.s[3]
        dup         v7.4s, v4.s[3]
      .endif
        vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
        vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
        vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
        vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
        vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
        vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
        vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
        vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
        sqshrun     v11.4h, v6.4s, #8
        sqshrun2    v11.8h, v7.4s, #8
        br          x8

colormatrix_int_col3_n\i:
      .if (\i^31) & 16
        dup         v6.4s, v4.s[3]
        dup         v7.4s, v4.s[3]
      .endif
        vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
        vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
        vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
        vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
        vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
        vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
        vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
        vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
        sqshrun     v11.4h, v6.4s, #8
        sqshrun2    v11.8h, v7.4s, #8
        br          x8

/* ---- float, column 0 ---- */
.align 5
colormatrix_float_col0_\i:
        vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
        vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
        vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
        vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
        vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
        vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
        vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
        vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
        vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
        vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
        br          x5

.align 4
colormatrix_float_col0_n\i:
        vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
        vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
        vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
        vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
        vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
        vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
        vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
        vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
        vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
        vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
        br          x5

/* ---- float, column 1 ---- */
.align 5
colormatrix_float_col1_\i:
        vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
        vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
        vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
        vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
        vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
        vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
        vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
        vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
        vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
        vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
        br          x6

.align 4
colormatrix_float_col1_n\i:
        vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
        vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
        vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
        vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
        vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
        vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
        vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
        vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
        vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
        vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
        br          x6

/* ---- float, column 2 ---- */
.align 5
colormatrix_float_col2_\i:
        vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
        vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
        vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
        vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
        vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
        vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
        vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
        vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
        vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
        vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
        br          x7

.align 4
colormatrix_float_col2_n\i:
        vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
        vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
        vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
        vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
        vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
        vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
        vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
        vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
        vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
        vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
        br          x7

/* ---- float, column 3 ---- */
.align 5
colormatrix_float_col3_\i:
        vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
        vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
        vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
        vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
        vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
        vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
        vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
        vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
        vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
        vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
        br          x8

.align 4
colormatrix_float_col3_n\i:
        vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
        vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
        vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
        vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
        vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
        vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
        vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
        vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
        vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
        vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
        br          x8

.endr

/* Main-loop load, float path, 4 x u8 channels: de-interleave 8 pixels (32
 * bytes), widen u8 -> u16 -> u32 and convert to float.  Elements 0-3 of each
 * channel end up in v12-v15 and elements 4-7 in v20-v23.  Continues at the
 * first column routine (x4). */
.align 6
colormatrix_float_ldu4:
        ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
        uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b
        uxtl        v22.8h, v22.8b
        uxtl        v23.8h, v23.8b
        uxtl        v12.4s, v20.4h
        uxtl        v13.4s, v21.4h
        uxtl        v14.4s, v22.4h
        uxtl        v15.4s, v23.4h
        uxtl2       v20.4s, v20.8h
        uxtl2       v21.4s, v21.8h
        uxtl2       v22.4s, v22.8h
        uxtl2       v23.4s, v23.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v14.4s, v14.4s
        ucvtf       v15.4s, v15.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        ucvtf       v22.4s, v22.4s
        ucvtf       v23.4s, v23.4s
        br          x4

/* Main-loop load, integer path, 4 x u8 channels: de-interleave 8 pixels into
 * v12-v15 and zero-extend each channel to 16 bits.  Continues at x4. */
.align 5
colormatrix_int_ldu4:
        ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b
        uxtl        v15.8h, v15.8b
        br          x4

/* Main-loop load, float path, 3 x u8 channels.  Pixels are still read with a
 * 4-byte stride (ld4, 32 bytes per 8 pixels); the fourth byte is loaded into
 * v23 but not converted.  Channels 0-2 are widened and converted to float in
 * v12-v14 (elements 0-3) and v20-v22 (elements 4-7).  Continues at x4. */
.align 6
colormatrix_float_ldu3:
        ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
        uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b
        uxtl        v22.8h, v22.8b
        uxtl        v12.4s, v20.4h
        uxtl        v13.4s, v21.4h
        uxtl        v14.4s, v22.4h
        uxtl2       v20.4s, v20.8h
        uxtl2       v21.4s, v21.8h
        uxtl2       v22.4s, v22.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v14.4s, v14.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        ucvtf       v22.4s, v22.4s
        br          x4

/* Main-loop load, integer path, 3 x u8 channels: read 8 pixels with a 4-byte
 * stride and zero-extend channels 0-2 to 16 bits in v12-v14 (the fourth byte
 * lands in v15 and is left as loaded).  Continues at x4. */
colormatrix_int_ldu3:
        ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b
        br          x4

/* Main-loop load, float path, 1 x u8 channel: read 8 bytes, widen and convert
 * to float; elements 0-3 go to v12 and elements 4-7 to v20.  Continues at
 * x4. */
.align 5
colormatrix_float_ldu1:
        ld1         {v20.8b}, [x1], #8
        uxtl        v20.8h, v20.8b
        uxtl        v12.4s, v20.4h
        uxtl2       v20.4s, v20.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v20.4s, v20.4s
        br          x4

/* Main-loop load, float path, 2 x u8 channels: de-interleave 8 pixels (16
 * bytes), widen and convert to float; v12/v13 hold elements 0-3 and v20/v21
 * hold elements 4-7 of the two channels.  Continues at x4. */
.align 6
colormatrix_float_ldu2:
        ld2         {v20.8b,v21.8b}, [x1], #16
        uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b
        uxtl        v12.4s, v20.4h
        uxtl        v13.4s, v21.4h
        uxtl2       v20.4s, v20.8h
        uxtl2       v21.4s, v21.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        br          x4

/* Main-loop load, integer path, 2 x u8 channels: de-interleave 8 pixels into
 * v12/v13 and zero-extend to 16 bits.  Continues at x4. */
.align 4
colormatrix_int_ldu2:
        ld2         {v12.8b,v13.8b}, [x1], #16
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        br          x4

/* Main-loop store, float path, 4 x u8 channels.  Each result is converted to
 * signed fixed point with one fractional bit (fcvtzs #1), then narrowed to 16
 * bits with a rounding, unsigned-saturating shift right by one, and finally
 * saturated to 8 bits.  Eight interleaved pixels (32 bytes) are written.
 * x2 is decremented by 8 before the store (st4 leaves the flags alone); on
 * borrow control goes to colormatrix_float_end, otherwise back to the load
 * routine in x9. */
.align 6
colormatrix_float_stu4:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v25.4s, v9.4s, #1
        fcvtzs      v26.4s, v10.4s, #1
        fcvtzs      v27.4s, v11.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        fcvtzs      v29.4s, v17.4s, #1
        fcvtzs      v30.4s, v18.4s, #1
        fcvtzs      v31.4s, v19.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun    v25.4h, v25.4s, #1
        sqrshrun    v26.4h, v26.4s, #1
        sqrshrun    v27.4h, v27.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        sqrshrun2   v25.8h, v29.4s, #1
        sqrshrun2   v26.8h, v30.4s, #1
        sqrshrun2   v27.8h, v31.4s, #1
        uqxtn       v24.8b, v24.8h
        uqxtn       v25.8b, v25.8h
        uqxtn       v26.8b, v26.8h
        uqxtn       v27.8b, v27.8h
        subs        x2, x2, #8
        st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
        blo         colormatrix_float_end
        br          x9

/* Main-loop store, integer path, 4 x u8 channels: saturate the 16-bit column
 * results v8-v11 to 8 bits and write 8 interleaved pixels.  x2 is decremented
 * by 8; on borrow go to colormatrix_int_end, otherwise loop via x9. */
.align 5
colormatrix_int_stu4:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        uqxtn       v14.8b, v10.8h
        uqxtn       v15.8b, v11.8h
        subs        x2, x2, #8
        st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
        blo         colormatrix_int_end
        br          x9

/* Main-loop store, float path, 3 x u8 channels.  Channels 0-2 are converted
 * exactly as in the 4-channel store; the fourth output byte of every pixel is
 * written as zero (v27).  x2 is decremented by 8; on borrow go to
 * colormatrix_float_end, otherwise loop via x9. */
.align 6
colormatrix_float_stu3:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v25.4s, v9.4s, #1
        fcvtzs      v26.4s, v10.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        fcvtzs      v29.4s, v17.4s, #1
        fcvtzs      v30.4s, v18.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun    v25.4h, v25.4s, #1
        sqrshrun    v26.4h, v26.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        sqrshrun2   v25.8h, v29.4s, #1
        sqrshrun2   v26.8h, v30.4s, #1
        uqxtn       v24.8b, v24.8h
        uqxtn       v25.8b, v25.8h
        uqxtn       v26.8b, v26.8h
        movi        v27.8b, #0
        subs        x2, x2, #8
        st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
        blo         colormatrix_float_end
        br          x9

/* Main-loop load, integer path, 1 x u8 channel: read 8 bytes and zero-extend
 * them to 16 bits in v12.  Continues at x4. */
.align 4
colormatrix_int_ldu1:
        ld1         {v12.8b}, [x1], #8
        uxtl        v12.8h, v12.8b
        br          x4

/* Main-loop store, integer path, 3 x u8 channels: saturate v8-v10 to 8 bits,
 * write zero as the fourth byte of each pixel, store 8 pixels.  x2 is
 * decremented by 8; on borrow go to colormatrix_int_end, otherwise loop via
 * x9. */
.align 5
colormatrix_int_stu3:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        uqxtn       v14.8b, v10.8h
        movi        v15.8b, #0
        subs        x2, x2, #8
        st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
        blo         colormatrix_int_end
        br          x9

/* Main-loop store, float path, 2 x u8 channels: convert v8/v9 (elements 0-3)
 * and v16/v17 (elements 4-7) to rounded, saturated bytes and write 8
 * interleaved pixels (16 bytes).  x2 is decremented by 8; on borrow go to
 * colormatrix_float_end, otherwise loop via x9. */
.align 6
colormatrix_float_stu2:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v25.4s, v9.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        fcvtzs      v29.4s, v17.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun    v25.4h, v25.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        sqrshrun2   v25.8h, v29.4s, #1
        uqxtn       v24.8b, v24.8h
        uqxtn       v25.8b, v25.8h
        subs        x2, x2, #8
        st2         {v24.8b,v25.8b}, [x0], #16
        blo         colormatrix_float_end
        br          x9

/* Main-loop store, integer path, 2 x u8 channels: saturate v8/v9 to 8 bits
 * and write 8 interleaved pixels.  x2 is decremented by 8; on borrow go to
 * colormatrix_int_end, otherwise loop via x9. */
.align 5
colormatrix_int_stu2:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        subs        x2, x2, #8
        st2         {v12.8b,v13.8b}, [x0], #16
        blo         colormatrix_int_end
        br          x9

/* Main-loop store, integer path, 1 x u8 channel: saturate v8 to 8 bits and
 * write 8 bytes.  x2 is decremented by 8; on borrow go to
 * colormatrix_int_end, otherwise loop via x9. */
.align 5
colormatrix_int_stu1:
        uqxtn       v12.8b, v8.8h
        subs        x2, x2, #8
        st1         {v12.8b}, [x0], #8
        blo         colormatrix_int_end
        br          x9

/* Main-loop load, float path, 3 x f32 channels: identical code to the
 * 4-channel float load (pixels are read with a four-float stride); elements
 * 0-3 go to v12-v15 and elements 4-7 to v20-v23.  Continues at x4. */
colormatrix_float_ldf3:
        ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
        ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
        br          x4

/* Main-loop store, float path, 1 x u8 channel: convert v8 (elements 0-3) and
 * v16 (elements 4-7) to rounded, saturated bytes and write 8 bytes.  x2 is
 * decremented by 8; on borrow go to colormatrix_float_end, otherwise loop via
 * x9. */
.align 6
colormatrix_float_stu1:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        uqxtn       v24.8b, v24.8h
        subs        x2, x2, #8
        st1         {v24.8b}, [x0], #8
        blo         colormatrix_float_end
        br          x9

/* Main-loop store, float path, 3 x f32 channels: the fourth float of every
 * pixel is written as zero (v11 and v19 are cleared), then 8 pixels are
 * stored in two groups of four.  x2 is decremented by 8; on borrow go to
 * colormatrix_float_end, otherwise loop via x9. */
colormatrix_float_stf3:
        movi        v11.16b, #0
        st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
        movi        v19.16b, #0
        subs        x2, x2, #8
        st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        blo         colormatrix_float_end
        br          x9

/* Main-loop store, float path, 4 x f32 channels: write 8 interleaved pixels
 * (v8-v11, then v16-v19).  x2 is decremented by 8; on borrow go to
 * colormatrix_float_end, otherwise loop via x9. */
.align 5
colormatrix_float_stf4:
        st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
        subs        x2, x2, #8
        st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        blo         colormatrix_float_end
        br          x9

/* Main-loop load, float path, 4 x f32 channels: de-interleave 8 pixels;
 * elements 0-3 go to v12-v15 and elements 4-7 to v20-v23.  Continues at
 * x4. */
colormatrix_float_ldf4:
        ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
        ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
        br          x4

/* Main-loop store, float path, 2 x f32 channels: write 8 interleaved pixels
 * (v8/v9, then v16/v17).  x2 is decremented by 8; on borrow go to
 * colormatrix_float_end, otherwise loop via x9. */
.align 5
colormatrix_float_stf2:
        st2         {v8.4s, v9.4s}, [x0], #32
        subs        x2, x2, #8
        st2         {v16.4s, v17.4s}, [x0], #32
        blo         colormatrix_float_end
        br          x9

/* Main-loop load, float path, 2 x f32 channels: de-interleave 8 pixels into
 * v12/v13 (elements 0-3) and v20/v21 (elements 4-7).  Continues at x4. */
colormatrix_float_ldf2:
        ld2         {v12.4s,v13.4s}, [x1], #32
        ld2         {v20.4s,v21.4s}, [x1], #32
        br          x4

/* Main-loop store, float path, 1 x f32 channel: write v8 then v16 (8 floats).
 * x2 is decremented by 8; on borrow go to colormatrix_float_end, otherwise
 * loop via x9. */
.align 5
colormatrix_float_stf1:
        st1         {v8.4s}, [x0], #16
        subs        x2, x2, #8
        st1         {v16.4s}, [x0], #16
        blo         colormatrix_float_end
        br          x9

/* Main-loop load, float path, 1 x f32 channel: read 8 floats into v12
 * (elements 0-3) and v20 (elements 4-7).  Continues at x4. */
colormatrix_float_ldf1:
        ld1         {v12.4s}, [x1], #16
        ld1         {v20.4s}, [x1], #16
        br          x4

/* Tail store, integer path, 1 x u8 channel.  x2 holds the number of pixels
 * left; each set bit selects a group of vector elements: bit 2 -> elements
 * 4-7, bit 1 -> elements 2-3, bit 0 -> element 1.  Then leave through the
 * integer epilogue. */
colormatrix_int_stu1_end:
        uqxtn       v12.8b, v8.8h
        tbz         x2, #2, 1f
        st1         {v12.s}[1], [x0], #4
1:      tbz         x2, #1, 1f
        st1         {v12.h}[1], [x0], #2
1:      tbz         x2, #0, 1f
        st1         {v12.b}[1], [x0], #1
1:      b           colormatrix_int_realend

/* Tail store, integer path, 2 x u8 channels.  The two channels are narrowed
 * and interleaved into one 16-byte vector (2 bytes per pixel), so the pixel
 * groups selected by bits 2, 1 and 0 of x2 are the upper doubleword, word 1
 * and halfword 1 respectively.  Then leave through the integer epilogue. */
colormatrix_int_stu2_end:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        zip1        v12.16b, v12.16b, v13.16b
        tbz         x2, #2, 1f
        st1         {v12.d}[1], [x0], #8
1:      tbz         x2, #1, 1f
        st1         {v12.s}[1], [x0], #4
1:      tbz         x2, #0, 1f
        st1         {v12.h}[1], [x0], #2
1:      b           colormatrix_int_realend

/* Tail store, integer path, 3 x u8 channels (fourth byte written as zero).
 * Pixels are stored one at a time with single-lane st4: bit 2 of x2 selects
 * elements 4-7, bit 1 elements 2-3, bit 0 element 1.  Then leave through the
 * integer epilogue. */
colormatrix_int_stu3_end:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        uqxtn       v14.8b, v10.8h
        movi        v15.8b, #0
        tbz         x2, #2, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:      tbz         x2, #1, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:      tbz         x2, #0, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:      b           colormatrix_int_realend

/* Tail store, integer path, 4 x u8 channels.  Pixels are stored one at a time
 * with single-lane st4: bit 2 of x2 selects elements 4-7, bit 1 elements 2-3,
 * bit 0 element 1.  Then leave through the integer epilogue. */
colormatrix_int_stu4_end:
        uqxtn       v12.8b, v8.8h
        uqxtn       v13.8b, v9.8h
        uqxtn       v14.8b, v10.8h
        uqxtn       v15.8b, v11.8h
        tbz         x2, #2, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:      tbz         x2, #1, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:      tbz         x2, #0, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:      b           colormatrix_int_realend


/* Tail load, integer path, 1 x u8 channel.  The remaining pixels are gathered
 * into the upper 8 bytes of v15 (bit 2 of x2 -> bytes 12-15, bit 1 -> bytes
 * 10-11, bit 0 -> byte 9) and uxtl2 then widens that upper half, so they end
 * up in elements 4-7, 2-3 and 1 of v12.  Elements that are not loaded keep
 * whatever v15 held before.  Continues at x4. */
colormatrix_int_ldu1_end:
        tbz         x2, #2, 1f
        ld1         {v15.s}[3], [x1], #4
1:      tbz         x2, #1, 1f
        ld1         {v15.h}[5], [x1], #2
1:      tbz         x2, #0, 1f
        ld1         {v15.b}[9], [x1], #1
1:      uxtl2       v12.8h, v15.16b
        br          x4

/* Tail load, integer path, 2 x u8 channels.  Interleaved pixels are gathered
 * into v15 (bit 2 of x2 -> upper doubleword, bit 1 -> word 1, bit 0 ->
 * halfword 1), de-interleaved with uzp1/uzp2 and widened to 16 bits in
 * v12/v13.  Continues at x4. */
colormatrix_int_ldu2_end:
        tbz         x2, #2, 1f
        ld1         {v15.d}[1], [x1], #8
1:      tbz         x2, #1, 1f
        ld1         {v15.s}[1], [x1], #4
1:      tbz         x2, #0, 1f
        ld1         {v15.h}[1], [x1], #2
1:      uzp1        v14.16b, v15.16b, v15.16b
        uzp2        v15.16b, v15.16b, v15.16b
        uxtl        v12.8h, v14.8b
        uxtl        v13.8h, v15.8b
        br          x4

/* Tail load, integer path, 3 x u8 channels (4-byte pixel stride).  Pixels are
 * loaded one at a time with single-lane ld4: bit 2 of x2 fills elements 4-7,
 * bit 1 elements 2-3, bit 0 element 1.  Channels 0-2 are then widened to 16
 * bits in v12-v14.  Continues at x4. */
colormatrix_int_ldu3_end:
        tbz         x2, #2, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1:      tbz         x2, #1, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1:      tbz         x2, #0, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1:      uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b
        br          x4

/* Tail load, integer path, 4 x u8 channels.  Pixels are loaded one at a time
 * with single-lane ld4: bit 2 of x2 fills elements 4-7, bit 1 elements 2-3,
 * bit 0 element 1.  All four channels are then widened to 16 bits in
 * v12-v15.  Continues at x4. */
colormatrix_int_ldu4_end:
        tbz         x2, #2, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1:      tbz         x2, #1, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
        ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1:      tbz         x2, #0, 1f
        ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1:      uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b
        uxtl        v15.8h, v15.8b
        br          x4

/* Tail store, float path, 1 x u8 channel: convert v8/v16 to rounded,
 * saturated bytes in v12, then store the element groups selected by x2
 * (bit 2 -> elements 4-7, bit 1 -> elements 2-3, bit 0 -> element 1) and
 * leave through the float epilogue. */
colormatrix_float_stu1_end:
        fcvtzs      v12.4s, v8.4s, #1
        fcvtzs      v13.4s, v16.4s, #1
        sqrshrun    v12.4h, v12.4s, #1
        sqrshrun2   v12.8h, v13.4s, #1
        uqxtn       v12.8b, v12.8h
        tbz         x2, #2, 1f
        st1         {v12.s}[1], [x0], #4
1:      tbz         x2, #1, 1f
        st1         {v12.h}[1], [x0], #2
1:      tbz         x2, #0, 1f
        st1         {v12.b}[1], [x0], #1
1:      b           colormatrix_float_realend

/* Tail store, float path, 2 x u8 channels.  Both channels are converted to
 * 16-bit, interleaved with zip1 (elements 0-3 from v8/v9, elements 4-7 from
 * v16/v17) and saturated to bytes, giving 16 interleaved bytes in v12.  Bits
 * 2, 1 and 0 of x2 then select the upper doubleword, word 1 and halfword 1.
 * Leaves through the float epilogue. */
colormatrix_float_stu2_end:
        fcvtzs      v12.4s, v8.4s, #1
        fcvtzs      v13.4s, v9.4s, #1
        fcvtzs      v14.4s, v16.4s, #1
        fcvtzs      v15.4s, v17.4s, #1
        sqrshrun    v12.4h, v12.4s, #1
        sqrshrun    v13.4h, v13.4s, #1
        sqrshrun    v14.4h, v14.4s, #1
        sqrshrun    v15.4h, v15.4s, #1
        zip1        v12.8h, v12.8h, v13.8h
        zip1        v13.8h, v14.8h, v15.8h
        uqxtn       v12.8b, v12.8h
        uqxtn2      v12.16b, v13.8h
        tbz         x2, #2, 1f
        st1         {v12.d}[1], [x0], #8
1:      tbz         x2, #1, 1f
        st1         {v12.s}[1], [x0], #4
1:      tbz         x2, #0, 1f
        st1         {v12.h}[1], [x0], #2
1:      b           colormatrix_float_realend

/* Tail store, float path, 3 x u8 channels (fourth byte written as zero).
 * Conversion matches the main-loop store; pixels are then written one at a
 * time: bit 2 of x2 selects elements 4-7, bit 1 elements 2-3, bit 0 element
 * 1.  Leaves through the float epilogue. */
colormatrix_float_stu3_end:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v25.4s, v9.4s, #1
        fcvtzs      v26.4s, v10.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        fcvtzs      v29.4s, v17.4s, #1
        fcvtzs      v30.4s, v18.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun    v25.4h, v25.4s, #1
        sqrshrun    v26.4h, v26.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        sqrshrun2   v25.8h, v29.4s, #1
        sqrshrun2   v26.8h, v30.4s, #1
        uqxtn       v12.8b, v24.8h
        uqxtn       v13.8b, v25.8h
        uqxtn       v14.8b, v26.8h
        movi        v15.8b, #0
        tbz         x2, #2, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:      tbz         x2, #1, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:      tbz         x2, #0, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:      b           colormatrix_float_realend

/* Tail store, float path, 4 x u8 channels.  Conversion matches the main-loop
 * store; pixels are then written one at a time: bit 2 of x2 selects elements
 * 4-7, bit 1 elements 2-3, bit 0 element 1.  Leaves through the float
 * epilogue. */
colormatrix_float_stu4_end:
        fcvtzs      v24.4s, v8.4s, #1
        fcvtzs      v25.4s, v9.4s, #1
        fcvtzs      v26.4s, v10.4s, #1
        fcvtzs      v27.4s, v11.4s, #1
        fcvtzs      v28.4s, v16.4s, #1
        fcvtzs      v29.4s, v17.4s, #1
        fcvtzs      v30.4s, v18.4s, #1
        fcvtzs      v31.4s, v19.4s, #1
        sqrshrun    v24.4h, v24.4s, #1
        sqrshrun    v25.4h, v25.4s, #1
        sqrshrun    v26.4h, v26.4s, #1
        sqrshrun    v27.4h, v27.4s, #1
        sqrshrun2   v24.8h, v28.4s, #1
        sqrshrun2   v25.8h, v29.4s, #1
        sqrshrun2   v26.8h, v30.4s, #1
        sqrshrun2   v27.8h, v31.4s, #1
        uqxtn       v12.8b, v24.8h
        uqxtn       v13.8b, v25.8h
        uqxtn       v14.8b, v26.8h
        uqxtn       v15.8b, v27.8h
        tbz         x2, #2, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:      tbz         x2, #1, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
        st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:      tbz         x2, #0, 1f
        st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:      b           colormatrix_float_realend

/* Tail store, float path, 1 x f32 channel.  Bit 2 of x2 stores elements 4-7
 * (all of v16), bit 1 elements 2-3 (upper half of v8), bit 0 element 1.
 * Leaves through the float epilogue. */
colormatrix_float_stf1_end:
        tbz         x2, #2, 1f
        st1         {v16.4s}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v8.d}[1], [x0], #8
1:      tbz         x2, #0, 1f
        st1         {v8.s}[1], [x0], #4
1:      b           colormatrix_float_realend

/* Tail store, float path, 2 x f32 channels.  Bit 2 of x2 stores elements 4-7
 * (v16/v17), bit 1 elements 2-3 and bit 0 element 1 of v8/v9.  Leaves through
 * the float epilogue. */
colormatrix_float_stf2_end:
        tbz         x2, #2, 1f
        st2         {v16.4s, v17.4s}, [x0], #32
1:      tbz         x2, #1, 1f
        st2         {v8.s,v9.s}[2], [x0], #8
        st2         {v8.s,v9.s}[3], [x0], #8
1:      tbz         x2, #0, 1f
        st2         {v8.s,v9.s}[1], [x0], #8
1:      b           colormatrix_float_realend

/* Tail store, float path, 3 and 4 x f32 channels.  The 3-channel entry clears
 * the fourth channel (v11, v19) and falls through into the 4-channel code.
 * Bit 2 of x2 stores elements 4-7 (v16-v19), bit 1 elements 2-3 and bit 0
 * element 1 of v8-v11.  Leaves through the float epilogue. */
colormatrix_float_stf3_end:
        movi        v11.16b, #0
        movi        v19.16b, #0
colormatrix_float_stf4_end:
        tbz         x2, #2, 1f
        st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
1:      tbz         x2, #1, 1f
        st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
        st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
1:      tbz         x2, #0, 1f
        st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
1:      b           colormatrix_float_realend

/* Tail load, float path, 1 x u8 channel.  Bytes are gathered into the low 8
 * bytes of v15 (bit 2 of x2 -> bytes 4-7, bit 1 -> bytes 2-3, bit 0 -> byte
 * 1), widened and converted to float: elements 0-3 in v12, elements 4-7 in
 * v20.  Continues at x4. */
colormatrix_float_ldu1_end:
        tbz         x2, #2, 1f
        ld1         {v15.s}[1], [x1], #4
1:      tbz         x2, #1, 1f
        ld1         {v15.h}[1], [x1], #2
1:      tbz         x2, #0, 1f
        ld1         {v15.b}[1], [x1], #1
1:      uxtl        v15.8h, v15.8b
        uxtl        v12.4s, v15.4h
        uxtl2       v20.4s, v15.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v20.4s, v20.4s
        br          x4

/* Tail load, float path, 2 x u8 channels.  Interleaved pixels are gathered
 * into v15 (bit 2 of x2 -> upper doubleword, bit 1 -> word 1, bit 0 ->
 * halfword 1), widened to 16 bits, de-interleaved with uzp1/uzp2, widened to
 * 32 bits and converted to float: v12/v13 hold elements 0-3, v20/v21 hold
 * elements 4-7.  Continues at x4. */
colormatrix_float_ldu2_end:
        tbz         x2, #2, 1f
        ld1         {v15.d}[1], [x1], #8
1:      tbz         x2, #1, 1f
        ld1         {v15.s}[1], [x1], #4
1:      tbz         x2, #0, 1f
        ld1         {v15.h}[1], [x1], #2
1:      uxtl        v14.8h, v15.8b
        uxtl2       v15.8h, v15.16b
        uzp1        v12.8h, v14.8h, v14.8h
        uzp2        v13.8h, v14.8h, v14.8h
        uzp1        v20.8h, v15.8h, v15.8h
        uzp2        v21.8h, v15.8h, v15.8h
        uxtl        v12.4s, v12.4h
        uxtl        v13.4s, v13.4h
        uxtl        v20.4s, v20.4h
        uxtl        v21.4s, v21.4h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        br          x4

/* Tail load, float path, 3 x u8 channels (4-byte pixel stride).  Pixels are
 * loaded one at a time with single-lane ld4 (bit 2 of x2 -> elements 4-7,
 * bit 1 -> elements 2-3, bit 0 -> element 1); channels 0-2 are then widened
 * and converted to float in v12-v14 and v20-v22.  Continues at x4. */
colormatrix_float_ldu3_end:
        tbz         x2, #2, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:      tbz         x2, #1, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:      tbz         x2, #0, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1:      uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b
        uxtl        v22.8h, v22.8b
        uxtl        v12.4s, v20.4h
        uxtl        v13.4s, v21.4h
        uxtl        v14.4s, v22.4h
        uxtl2       v20.4s, v20.8h
        uxtl2       v21.4s, v21.8h
        uxtl2       v22.4s, v22.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v14.4s, v14.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        ucvtf       v22.4s, v22.4s
        br          x4

/* Tail load, float path, 4 x u8 channels.  Pixels are loaded one at a time
 * with single-lane ld4 (bit 2 of x2 -> elements 4-7, bit 1 -> elements 2-3,
 * bit 0 -> element 1); all four channels are then widened and converted to
 * float in v12-v15 and v20-v23.  Continues at x4. */
colormatrix_float_ldu4_end:
        tbz         x2, #2, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:      tbz         x2, #1, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
        ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:      tbz         x2, #0, 1f
        ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1:      uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b
        uxtl        v22.8h, v22.8b
        uxtl        v23.8h, v23.8b
        uxtl        v12.4s, v20.4h
        uxtl        v13.4s, v21.4h
        uxtl        v14.4s, v22.4h
        uxtl        v15.4s, v23.4h
        uxtl2       v20.4s, v20.8h
        uxtl2       v21.4s, v21.8h
        uxtl2       v22.4s, v22.8h
        uxtl2       v23.4s, v23.8h
        ucvtf       v12.4s, v12.4s
        ucvtf       v13.4s, v13.4s
        ucvtf       v14.4s, v14.4s
        ucvtf       v15.4s, v15.4s
        ucvtf       v20.4s, v20.4s
        ucvtf       v21.4s, v21.4s
        ucvtf       v22.4s, v22.4s
        ucvtf       v23.4s, v23.4s
        br          x4

/* Tail load, float path, 1 x f32 channel.  Bit 2 of x2 loads elements 4-7
 * (all of v20), bit 1 elements 2-3 (upper half of v12), bit 0 element 1.
 * Continues at x4. */
colormatrix_float_ldf1_end:
        tbz         x2, #2, 1f
        ld1         {v20.4s}, [x1], #16
1:      tbz         x2, #1, 1f
        ld1         {v12.d}[1], [x1], #8
1:      tbz         x2, #0, 1f
        ld1         {v12.s}[1], [x1], #4
1:      br          x4

/* Tail load, float path, 2 x f32 channels.  Bit 2 of x2 loads elements 4-7
 * (v20/v21), bit 1 elements 2-3 and bit 0 element 1 of v12/v13.  Continues
 * at x4. */
colormatrix_float_ldf2_end:
        tbz         x2, #2, 1f
        ld2         {v20.4s,v21.4s}, [x1], #32
1:      tbz         x2, #1, 1f
        ld2         {v12.s,v13.s}[2], [x1], #8
        ld2         {v12.s,v13.s}[3], [x1], #8
1:      tbz         x2, #0, 1f
        ld2         {v12.s,v13.s}[1], [x1], #8
1:      br          x4

/* Tail load, float path, 3 and 4 x f32 channels (one shared body, pixels are
 * read with a four-float stride in both cases).  Bit 2 of x2 loads elements
 * 4-7 (v20-v23), bit 1 elements 2-3 and bit 0 element 1 of v12-v15.
 * Continues at x4. */
colormatrix_float_ldf3_end:
colormatrix_float_ldf4_end:
        tbz         x2, #2, 1f
        ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
1:      tbz         x2, #1, 1f
        ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
        ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
1:      tbz         x2, #0, 1f
        ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
1:      br          x4

/* void rsdIntrinsicColorMatrix_int_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          int16_t const *mult,    // x4
 *          int32_t const *add);    // x5
 *
 * Integer colour-matrix driver.  Sixteen 16-bit multipliers are loaded into
 * v0/v1 and four 32-bit addends into v4.  Eight consecutive pointers are read
 * from fns: column 0-3 routines (x4-x7), main-loop store (x8) and load (x9),
 * and later the tail store and load.  The work itself is done by the chain
 * load -> column 0..3 -> store, each routine branching to the next through
 * those registers.
 *
 * While the main loop runs x2 holds (pixels remaining - 8).
 * The low 64 bits of v8-v15 are callee-saved and are spilled to a 64-byte
 * stack frame for the duration of the call.
 */
ENTRY(rsdIntrinsicColorMatrix_int_K)
        sub         x7, sp, #32                 /* upper half of the save area */
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [sp]
        st1         {v12.1d-v15.1d}, [x7]

        ld1         {v0.8h,v1.8h}, [x4], #32
        ld1         {v4.4s}, [x5], #16

        ldp         x4,x5, [x3],#16
        ldp         x6,x7, [x3],#16
        ldp         x8,x9, [x3],#16

        /* Preload every output column with its narrowed addend, so a column
         * whose routine is skipped still has a defined value. */
        dup         v12.4s, v4.s[0]
        dup         v13.4s, v4.s[1]
        dup         v14.4s, v4.s[2]
        dup         v15.4s, v4.s[3]
        sqshrun     v8.4h, v12.4s, #8
        sqshrun2    v8.8h, v12.4s, #8
        sqshrun     v9.4h, v13.4s, #8
        sqshrun2    v9.8h, v13.4s, #8
        sqshrun     v10.4h, v14.4s, #8
        sqshrun2    v10.8h, v14.4s, #8
        sqshrun     v11.4h, v15.4s, #8
        sqshrun2    v11.8h, v15.4s, #8

        subs        x2, x2, #8
        blo         colormatrix_int_end
        br          x9

/* Fewer than 8 pixels remain.  Restore the true remainder; if it is zero we
 * are done.  Otherwise fetch the tail store/load routines and redirect any
 * chain register that pointed at the main-loop store (x16) to the tail
 * store, then run the chain one last time. */
colormatrix_int_end:
        adds        x2, x2, #8
        bls         colormatrix_int_realend
        mov         x16, x8
        ldp         x8, x9, [x3], #16
        cmp         x4, x16
        csel        x4, x8, x4, eq
        cmp         x5, x16
        csel        x5, x8, x5, eq
        cmp         x6, x16
        csel        x6, x8, x6, eq
        cmp         x7, x16
        csel        x7, x8, x7, eq
        br          x9

colormatrix_int_realend:
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
        ret
END(rsdIntrinsicColorMatrix_int_K)

/* void rsdIntrinsicColorMatrixSetup_int_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight-pointer table at fns used by the integer driver:
 *   [0..3]  column 0-3 routines
 *   [4],[5] main-loop store and load routines
 *   [6],[7] tail store and load routines
 * dt and st index the store and load offset tables below (two halfwords per
 * entry, ordered for 1 to 4 channels); no range check is performed.
 * mask carries five bits per column, column 0 in the least significant bits.
 *
 * dt and st are 32-bit arguments, so only w2/w3 are defined on entry; they
 * are zero-extended when forming the table index rather than trusting the
 * upper halves of x2/x3.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
        adr         x7, 2f
        add         x4, x7, w2, UXTW #2
        ldrsh       x2, [x4], #2
        ldrsh       x4, [x4]
        add         x2, x2, x7
        add         x4, x4, x7
        adr         x7, 3f
        add         x5, x7, w3, UXTW #2
        ldrsh       x3, [x5], #2
        ldrsh       x5, [x5]
        add         x3, x3, x7
        add         x5, x5, x7
        stp         x2, x3, [x0, #32]
        stp         x4, x5, [x0, #48]

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

        mov         x3, #4
        adr         x7, 4f
1:      ands        x2, x1, #15
        beq         9f
        and         x2, x1, #31
        lsl         x2, x2, #3                  /* 8 bytes per table row */
        ldrsh       x2, [x7, x2]
        add         x2, x2, x7
9:      str         x2, [x0], #8
        lsr         x1, x1, #5
        add         x7, x7, #2                  /* next column of the row */
        subs        x3, x3, #1
        bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

        ldr         x2, [x0]
        mov         x3, #4
1:      ldr         x1, [x0, #-8]!
        cmp         x1, #0
        csel        x2, x1, x2, ne
        str         x2, [x0]
        subs        x3, x3, #1
        bne         1b
        ret

/* Offset tables.  Entries are signed 16-bit distances from the table base;
 * the column table stores each distance relative to the address of its own
 * column within the row (hence the -2/-4/-6), matching the x7 increment in
 * the loop above.  Row index equals the 5-bit column mask. */
        .align 4
2:      .hword      colormatrix_int_stu1-2b
        .hword      colormatrix_int_stu1_end-2b
        .hword      colormatrix_int_stu2-2b
        .hword      colormatrix_int_stu2_end-2b
        .hword      colormatrix_int_stu3-2b
        .hword      colormatrix_int_stu3_end-2b
        .hword      colormatrix_int_stu4-2b
        .hword      colormatrix_int_stu4_end-2b
3:      .hword      colormatrix_int_ldu1-3b
        .hword      colormatrix_int_ldu1_end-3b
        .hword      colormatrix_int_ldu2-3b
        .hword      colormatrix_int_ldu2_end-3b
        .hword      colormatrix_int_ldu3-3b
        .hword      colormatrix_int_ldu3_end-3b
        .hword      colormatrix_int_ldu4-3b
        .hword      colormatrix_int_ldu4_end-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        .hword      colormatrix_int_col0_\i-4b
        .hword      colormatrix_int_col1_\i-4b-2
        .hword      colormatrix_int_col2_\i-4b-4
        .hword      colormatrix_int_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
        .hword      colormatrix_int_col0_n\i-4b
        .hword      colormatrix_int_col1_n\i-4b-2
        .hword      colormatrix_int_col2_n\i-4b-4
        .hword      colormatrix_int_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_int_K)


/* void rsdIntrinsicColorMatrix_float_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          float const *mult,      // x4
 *          float const *add);      // x5
 *
 * Entry point of the float colour-matrix kernel.  It saves the callee-saved
 * halves of v8-v15, loads the coefficients and the function table, and then
 * dispatches through the table.
 *
 * Register set-up performed here:
 *   v0-v3          16 floats read from mult
 *   v4-v7          add[0]..add[3], each broadcast to all four lanes
 *   v8-v11         copies of v4-v7
 *   v16-v19        copies of v4-v7
 *   x4-x7          fns[0..3]  (column functions)
 *   x8, x9         fns[4], fns[5]  (store function, load function)
 *   x3             left pointing at fns[6]
 *
 * fns[6] and fns[7] are a second store/load pair that is only fetched on the
 * colormatrix_float_end path (fewer than 8 items left).
 *
 * NOTE(review): the routines reached through x4-x9 live outside this block;
 * they are expected to branch back to colormatrix_float_end or
 * colormatrix_float_realend -- confirm against their definitions.
 */
ENTRY(rsdIntrinsicColorMatrix_float_K)
        /* 64-byte frame: v8-v11 at [sp], v12-v15 at [sp + 32]. */
        sub         x7, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [sp]
        st1         {v12.1d-v15.1d}, [x7]

        /* Multiplier coefficients, then the four broadcast addends. */
        ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
        ld1r        {v4.4s}, [x5], #4
        ld1r        {v5.4s}, [x5], #4
        ld1r        {v6.4s}, [x5], #4
        ld1r        {v7.4s}, [x5], #4

        /* x4/x5 are free again now that mult/add have been consumed. */
        ldp         x4,x5, [x3], #16
        ldp         x6,x7, [x3], #16
        ldp         x8,x9, [x3], #16

        /* Duplicate the addends into v8-v11 and v16-v19.
         * Presumably these seed the accumulators used by the column
         * functions -- TODO confirm against those routines. */
        mov         v8.16b, v4.16b
        mov         v9.16b, v5.16b
        mov         v10.16b, v6.16b
        mov         v11.16b, v7.16b

        mov         v16.16b, v4.16b
        mov         v17.16b, v5.16b
        mov         v18.16b, v6.16b
        mov         v19.16b, v7.16b

        /* x2 is kept biased by -8; fewer than 8 items goes straight to the
         * tail handling below, otherwise start with the load function. */
        subs        x2, x2, #8
        blo         colormatrix_float_end
        br          x9

colormatrix_float_end:
        /* Remove the -8 bias.  bls is taken when the result is zero (or the
         * add did not carry), i.e. there is nothing left to process. */
        adds        x2, x2, #8
        /* Leave through this function's own epilogue so the restore always
         * matches the frame built in the prologue above. */
        bls         colormatrix_float_realend

        /* Switch to the second store/load pair (fns[6], fns[7]).  Any column
         * slot that was aliased to the previous store function is re-pointed
         * at the new one so the call chain still ends in the right store. */
        mov         x16, x8
        ldp         x8,x9, [x3], #16
        cmp         x4, x16
        csel        x4, x8, x4, eq
        cmp         x5, x16
        csel        x5, x8, x5, eq
        cmp         x6, x16
        csel        x6, x8, x6, eq
        cmp         x7, x16
        csel        x7, x8, x7, eq
        br          x9

colormatrix_float_realend:
        /* Restore v8-v15 and release the 64-byte frame. */
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
        ret
END(rsdIntrinsicColorMatrix_float_K)
1170
/* void rsdIntrinsicColorMatrixSetup_float_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight-pointer function table at fns:
 *   fns[0..3]  column functions chosen from mask (5 bits per column)
 *   fns[4]     store function        (table 2, entry 2*dt)
 *   fns[5]     load function         (table 3, entry 2*st)
 *   fns[6]     store "_end" function (table 2, entry 2*dt + 1)
 *   fns[7]     load "_end" function  (table 3, entry 2*st + 1)
 *
 * Tables 2 and 3 list the variants in the order u1, u2, u3, u4, f1, f2, f3,
 * f4, so dt and st are expected to be in the range 0..7.  No range check is
 * performed.
 *
 * All tables hold signed 16-bit offsets relative to the table's own label,
 * which keeps the code position independent.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
        /* dt and st are 32-bit ints: the upper halves of x2/x3 are not
         * defined by the procedure call standard, so index with the
         * sign-extended w registers instead of the raw x registers. */
        adr         x7, 2f
        add         x4, x7, w2, sxtw #2     /* &table2[2 * dt] */
        ldrsh       x2, [x4], #2            /* store function offset */
        ldrsh       x4, [x4]                /* store "_end" offset */
        add         x2, x2, x7
        add         x4, x4, x7
        adr         x7, 3f
        add         x5, x7, w3, sxtw #2     /* &table3[2 * st] */
        ldrsh       x3, [x5], #2            /* load function offset */
        ldrsh       x5, [x5]                /* load "_end" offset */
        add         x3, x3, x7
        add         x5, x5, x7
        stp         x2, x3, [x0, #32]       /* fns[4], fns[5] */
        stp         x4, x5, [x0, #48]       /* fns[6], fns[7] */

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that.
 *
 * Each column consumes five mask bits.  When the low four are all clear the
 * slot is NULL (x2 is already zero from the ands).  Otherwise the full 5-bit
 * value selects one of 32 rows in table 4; each row is four halfwords, one
 * per column.  x7 advances by one halfword per column, which is why the
 * table entries for column N are biased by -2*N. */

        mov         x3, #4
        adr         x7, 4f
1:      ands        x2, x1, #15
        beq         9f
        and         x2, x1, #31
        lsl         x2, x2, #3              /* row index * 8 bytes */
        ldrsh       x2, [x7, x2]
        add         x2, x2, x7
9:      str         x2, [x0], #8
        lsr         x1, x1, #5
        add         x7, x7, #2
        subs        x3, x3, #1
        bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function.
 *
 * x0 now points at fns[4] (the store function).  Walk backwards over
 * fns[3]..fns[0], carrying the most recent non-NULL pointer in x2. */

        ldr         x2, [x0]
        mov         x3, #4
1:      ldr         x1, [x0, #-8]!
        cmp         x1, #0
        csel        x2, x1, x2, ne
        str         x2, [x0]
        subs        x3, x3, #1
        bne         1b
        ret

        /* Table 2: store functions, as (function, function_end) pairs. */
        .align 4
2:      .hword      colormatrix_float_stu1-2b
        .hword      colormatrix_float_stu1_end-2b
        .hword      colormatrix_float_stu2-2b
        .hword      colormatrix_float_stu2_end-2b
        .hword      colormatrix_float_stu3-2b
        .hword      colormatrix_float_stu3_end-2b
        .hword      colormatrix_float_stu4-2b
        .hword      colormatrix_float_stu4_end-2b
        .hword      colormatrix_float_stf1-2b
        .hword      colormatrix_float_stf1_end-2b
        .hword      colormatrix_float_stf2-2b
        .hword      colormatrix_float_stf2_end-2b
        .hword      colormatrix_float_stf3-2b
        .hword      colormatrix_float_stf3_end-2b
        .hword      colormatrix_float_stf4-2b
        .hword      colormatrix_float_stf4_end-2b
        /* Table 3: load functions, as (function, function_end) pairs. */
3:      .hword      colormatrix_float_ldu1-3b
        .hword      colormatrix_float_ldu1_end-3b
        .hword      colormatrix_float_ldu2-3b
        .hword      colormatrix_float_ldu2_end-3b
        .hword      colormatrix_float_ldu3-3b
        .hword      colormatrix_float_ldu3_end-3b
        .hword      colormatrix_float_ldu4-3b
        .hword      colormatrix_float_ldu4_end-3b
        .hword      colormatrix_float_ldf1-3b
        .hword      colormatrix_float_ldf1_end-3b
        .hword      colormatrix_float_ldf2-3b
        .hword      colormatrix_float_ldf2_end-3b
        .hword      colormatrix_float_ldf3-3b
        .hword      colormatrix_float_ldf3_end-3b
        .hword      colormatrix_float_ldf4-3b
        .hword      colormatrix_float_ldf4_end-3b
        /* Table 4: column functions.  Rows 0..15 are the plain variants,
         * rows 16..31 are the "n" variants in descending order (row 16 is
         * n15, row 31 is n0). */
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        .hword      colormatrix_float_col0_\i-4b
        .hword      colormatrix_float_col1_\i-4b-2
        .hword      colormatrix_float_col2_\i-4b-4
        .hword      colormatrix_float_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
        .hword      colormatrix_float_col0_n\i-4b
        .hword      colormatrix_float_col1_n\i-4b-2
        .hword      colormatrix_float_col2_n\i-4b-4
        .hword      colormatrix_float_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_float_K)