/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register. This macro will be called from within several different wrapper
 * variants for different data layouts. Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively. U and V are in
 * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7
 * are pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
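
/* For orientation: the constants used below appear to correspond to the
 * usual fixed-point YUV-to-RGB equations with every coefficient halved
 * (298->149, 409->204.5, 100->50, 208->104, 516->258), i.e. roughly:
 *
 *     R = satu8((298 * (Y - 16)                   + 409 * (V - 128) + 128) >> 8)
 *     G = satu8((298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128) >> 8)
 *     B = satu8((298 * (Y - 16) + 516 * (U - 128)                   + 128) >> 8)
 *
 * Halving keeps each u8 * coefficient product within 16 bits; the halving
 * adds (uhadd), the (v >> 1) term standing in for the .5 of 204.5, and the
 * (u << 2) term completing 258 = 254 + 4 recover the lost bits, and the
 * final narrowing shifts are one bit smaller to compensate. (This is a
 * sketch inferred from the constants, not text from the original source.)
 */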
.macro yuvkern, regu=v10, regv=v11
        /* v0  out R_lo / even R_lo accumulator
         * v1  out G_lo / even G_lo accumulator
         * v2  out B_lo / even B_lo accumulator
         * v3  out A_lo / const 0xff alpha
         * v4  out R_hi / even R_hi accumulator
         * v5  out G_hi / even G_hi accumulator
         * v6  out B_hi / even B_hi accumulator
         * v7  out A_hi / const 0xff alpha
         * v8  even Y / G_lo chroma tmp
         * v9  odd Y  / G_hi chroma tmp
         * \regu in U
         * \regv in V
         * v12 R_lo chroma tmp
         * v13 B_lo chroma tmp
         * v14 R_hi chroma tmp
         * v15 B_hi chroma tmp
         * v16 odd R_lo accumulator
         * v17 odd G_lo accumulator
         * v18 odd B_lo accumulator
         * v19 multiplier extra bits low
         * v20 odd R_hi accumulator
         * v21 odd G_hi accumulator
         * v22 odd B_hi accumulator
         * v23 multiplier extra bits high
         * v24 constant 149
         * v25 constant 50
         * v26 constant 104
         * v27 constant 204
         * v28 constant 254
         * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */
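        /* Evaluated, the three bias constants come to v29 = 28560 >> 1 =
         * 14280, v30 = 17328, and v31 = 35408 >> 1 = 17704 (plain arithmetic
         * on the expressions above, shown for convenience). */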

        umull v1.8h, v8.8b, v24.8b          // g0 = y0 * 149
        umull v17.8h, v9.8b, v24.8b         // g1 = y1 * 149
        umull2 v5.8h, v8.16b, v24.16b       // g0_hi = y0_hi * 149
        umull2 v21.8h, v9.16b, v24.16b      // g1_hi = y1_hi * 149

        umull v8.8h, \regu\().8b, v25.8b    // g2 = u * 50 + v * 104
        umlal v8.8h, \regv\().8b, v26.8b
        umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104
        umlal2 v9.8h, \regv\().16b, v26.16b

        ushr v19.16b, \regv\().16b, #1
        uaddw v0.8h, v1.8h, v19.8b          // r0 = g0 + (v >> 1)
        uaddw v16.8h, v17.8h, v19.8b        // r1 = g1 + (v >> 1)

        uaddw2 v4.8h, v5.8h, v19.16b        // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2 v20.8h, v21.8h, v19.16b      // r1_hi = g1_hi + (v_hi >> 1)

        ushll v19.8h, \regu\().8b, #2
        ushll2 v23.8h, \regu\().16b, #2
        add v2.8h, v1.8h, v19.8h            // b0 = g0 + (u << 2)
        add v18.8h, v17.8h, v19.8h          // b1 = g1 + (u << 2)

        add v6.8h, v5.8h, v23.8h            // b0_hi = g0_hi + (u_hi << 2)
        add v22.8h, v21.8h, v23.8h          // b1_hi = g1_hi + (u_hi << 2)

        umull v12.8h, \regv\().8b, v27.8b   // r2 = v * 204
        umull v13.8h, \regu\().8b, v28.8b   // b2 = u * 254

        umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204
        umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254

        uhadd v0.8h, v0.8h, v12.8h          // r0 = (r0 + r2) >> 1
        uhadd v16.8h, v16.8h, v12.8h        // r1 = (r1 + r2) >> 1
        uqadd v1.8h, v1.8h, v30.8h          // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd v17.8h, v17.8h, v30.8h        // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd v2.8h, v2.8h, v13.8h          // b0 = (b0 + b2) >> 1
        uhadd v18.8h, v18.8h, v13.8h        // b1 = (b1 + b2) >> 1

        uhadd v4.8h, v4.8h, v14.8h          // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd v20.8h, v20.8h, v14.8h        // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd v5.8h, v5.8h, v30.8h          // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd v21.8h, v21.8h, v30.8h        // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd v6.8h, v6.8h, v15.8h          // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd v22.8h, v22.8h, v15.8h        // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub v0.8h, v0.8h, v29.8h          // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v16.8h, v16.8h, v29.8h        // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v1.8h, v1.8h, v8.8h           // g0 = satu16(g0 - g2)
        uqsub v17.8h, v17.8h, v8.8h         // g1 = satu16(g1 - g2)
        uqsub v2.8h, v2.8h, v31.8h          // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub v18.8h, v18.8h, v31.8h        // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub v4.8h, v4.8h, v29.8h          // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v20.8h, v20.8h, v29.8h        // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub v5.8h, v5.8h, v9.8h           // g0_hi = satu16(g0_hi - g2_hi)
        uqsub v21.8h, v21.8h, v9.8h         // g1_hi = satu16(g1_hi - g2_hi)
        uqsub v6.8h, v6.8h, v31.8h          // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub v22.8h, v22.8h, v31.8h        // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Narrow to 8 bits with rounding; G shifts by 7 while R and B shift
         * by 6 because they were already halved once by uhadd above. */
        uqrshrn v0.8b, v0.8h, #6
        uqrshrn v16.8b, v16.8h, #6
        uqrshrn v1.8b, v1.8h, #7
        uqrshrn v17.8b, v17.8h, #7
        uqrshrn v2.8b, v2.8h, #6
        uqrshrn v18.8b, v18.8h, #6

        uqrshrn v4.8b, v4.8h, #6
        uqrshrn v20.8b, v20.8h, #6
        uqrshrn v5.8b, v5.8h, #7
        uqrshrn v21.8b, v21.8h, #7
        uqrshrn v6.8b, v6.8h, #6
        uqrshrn v22.8b, v22.8h, #6

        /* Re-interleave the even and odd pixels. */
        zip1 v0.16b, v0.16b, v16.16b
        zip1 v1.16b, v1.16b, v17.16b
        zip1 v2.16b, v2.16b, v18.16b

        zip1 v4.16b, v4.16b, v20.16b
        zip1 v5.16b, v5.16b, v21.16b
        zip1 v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Some sections of code are switched out depending on the data packing
 * being handled.
 */
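/* Roughly, the control flow below corresponds to this C-like sketch (names
 * are descriptive only; x2 holds the remaining pixel count):
 *
 *     n -= 32;
 *     while (n did not borrow) {       // main loop, 32 pixels per pass
 *         load 32 Y bytes and 32 chroma bytes; convert; store 128 RGBA bytes;
 *         n -= 32;
 *     }
 *     n += 32;
 *     if (n != 0)                      // 1..31 pixels left over
 *         load, convert and store the tail in power-of-two chunks;
 */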
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi v24.16b, #149
        movi v25.16b, #50
        movi v26.16b, #104
        movi v27.16b, #204
        movi v28.16b, #254
        mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup v29.8h, w5
        mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup v30.8h, w5
        mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup v31.8h, w5

        movi v3.16b, #0xff                  // constant alpha channel
        movi v7.16b, #0xff

        subs x2, x2, #32                    // enough for a full 32-pixel pass?
        bhs 1f
        b 2f

        .align 4
1:      ld2 {v8.16b,v9.16b}, [x1], #32      // 32 luma bytes, even/odd split
        .if \interleaved
        ld2 {v10.16b,v11.16b}, [x3], #32    // 16 interleaved chroma pairs
        .else
        ld1 {v10.16b}, [x3], #16
        ld1 {v11.16b}, [x4], #16
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        subs x2, x2, #32

        st4 {v0.16b - v3.16b}, [x0], #64    // 32 interleaved RGBA pixels
        st4 {v4.16b - v7.16b}, [x0], #64

        bhs 1b
193
Simon Hosiee8814f72014-06-19 13:18:05 -07001942: adds x2, x2, #32
Simon Hosieccd7a462014-02-01 01:35:11 -0800195 beq 2f
196
Simon Hosiee8814f72014-06-19 13:18:05 -0700197 /* To handle the tail portion of the data (something less than 32
Simon Hosieccd7a462014-02-01 01:35:11 -0800198 * bytes) load small power-of-two chunks into working registers. It
199 * doesn't matter where they end up in the register; the same process
200 * will store them back out using the same positions and the
201 * interaction between neighbouring pixels is constrained to odd
202 * boundaries where the load operations don't interfere.
203 */
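        /* For example, a remainder of 22 pixels (0b10110) takes the 16-, 4-
         * and 2-pixel branches below and skips the others. */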
        movi v8.8b, #0
        movi v9.8b, #0
        movi v10.8b, #0
        movi v11.8b, #0

        tbz x2, #4, 1f
        ld1 {v9.16b}, [x1], #16
        .if \interleaved
        ld1 {v11.16b}, [x3], #16
        .else
        ld1 {v10.d}[1], [x3], #8
        ld1 {v11.d}[1], [x4], #8
        .endif
1:      tbz x2, #3, 1f
        ld1 {v8.d}[1], [x1], #8
        .if \interleaved
        ld1 {v10.d}[1], [x3], #8
        .else
        ld1 {v10.s}[1], [x3], #4
        ld1 {v11.s}[1], [x4], #4
        .endif
1:      tbz x2, #2, 1f
        ld1 {v8.s}[1], [x1], #4
        .if \interleaved
        ld1 {v10.s}[1], [x3], #4
        .else
        ld1 {v10.h}[1], [x3], #2
        ld1 {v11.h}[1], [x4], #2
        .endif
1:      tbz x2, #1, 1f
        ld1 {v8.h}[1], [x1], #2
        .if \interleaved
        ld1 {v10.h}[1], [x3], #2
        .else
        ld1 {v10.b}[1], [x3], #1
        ld1 {v11.b}[1], [x4], #1
        .endif
1:      tbz x2, #0, 1f
        ld1 {v8.b}[1], [x1], #1
        .if \interleaved
        ld1 {v10.h}[0], [x3], #2
        .else
        ld1 {v10.b}[0], [x3], #1
        ld1 {v11.b}[0], [x4], #1
        .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
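        /* uzp1/uzp2 below pick the even and odd bytes respectively of the
         * concatenated register pair, e.g. [y0 y1 y2 y3 ...] becomes
         * [y0 y2 ...] and [y1 y3 ...]. */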
1:      mov v12.16b, v8.16b
        uzp1 v8.16b, v12.16b, v9.16b
        uzp2 v9.16b, v12.16b, v9.16b
        .if \interleaved
        mov v12.16b, v10.16b
        uzp1 v10.16b, v12.16b, v11.16b
        uzp2 v11.16b, v12.16b, v11.16b
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        /* As above, but with the output; structured stores for partial
         * vectors aren't available, so the data is re-packed first and stored
         * linearly.
         */
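        /* The zip1/zip2 sequence below interleaves the planar R, G, B and A
         * vectors of the low half into linear RGBA byte order, matching what
         * st4 would have produced. */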
        zip1 v16.16b, v0.16b, v2.16b
        zip2 v18.16b, v0.16b, v2.16b
        zip1 v17.16b, v1.16b, v3.16b
        zip2 v19.16b, v1.16b, v3.16b
        zip1 v0.16b, v16.16b, v17.16b
        zip2 v1.16b, v16.16b, v17.16b
        zip1 v2.16b, v18.16b, v19.16b
        zip2 v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because the complete set of
         * four registers can be stored using st4. */

        tbz x2, #4, 1f
        st4 {v4.16b - v7.16b}, [x0], #64
1:      tbz x2, #3, 1f
        st1 {v2.16b,v3.16b}, [x0], #32
1:      tbz x2, #2, 1f
        st1 {v1.16b}, [x0], #16
1:      tbz x2, #1, 1f
        st1 {v0.d}[1], [x0], #8
1:      tbz x2, #0, 2f
        st1 {v0.s}[1], [x0], #4
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
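/* Planar-chroma variant. The chroma planes are subsampled by two, so the
 * prologue advances uin and vin by xstart/2 while yin advances by xstart and
 * out by 4*xstart; x3 and x4 end up as the U and V pointers that wrap_line
 * expects, and x2 becomes the pixel count (xend minus xstart rounded down to
 * even).
 */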
ENTRY(rsdIntrinsicYuv2_K)
        lsr x6, x4, #1
        add x0, x0, x4, LSL #2
        add x1, x1, x4
        add x4, x3, x6
        add x3, x2, x6
        sub x2, x5, x6, LSL #1

        sub x6, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]          // spill callee-saved d8-d15
        st1 {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1 {v8.1d - v11.1d}, [sp], #32     // restore d8-d15
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
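/* Interleaved-chroma variant. xstart is rounded down to even (bic) so the
 * chroma byte pairs stay aligned; wrap_line is instantiated with
 * interleaved=1, swapuv=1, which reads the chroma plane as V-then-U pairs.
 */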
ENTRY(rsdIntrinsicYuv_K)
        bic x5, x3, #1
        add x0, x0, x5, LSL #2
        add x1, x1, x5
        add x3, x2, x5
        sub x2, x4, x5

        sub x5, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]
        st1 {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1 {v8.1d - v11.1d}, [sp], #32
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
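/* As rsdIntrinsicYuv_K, but instantiated without swapuv, i.e. for the
 * opposite (U-then-V) chroma byte order.
 */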
ENTRY(rsdIntrinsicYuvR_K)
        bic x5, x3, #1
        add x0, x0, x5, LSL #2
        add x1, x1, x5
        add x3, x2, x5
        sub x2, x4, x5

        sub x5, sp, #32
        sub sp, sp, #64
        st1 {v8.1d - v11.1d}, [sp]
        st1 {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1 {v8.1d - v11.1d}, [sp], #32
        ld1 {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)