/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register. This macro will be called from within several different wrapper
 * variants for different data layouts. Y data starts in q8, but with the even
 * and odd bytes split into d16 and d17 respectively. U and V are in d20
 * and d21. Working constants are pre-loaded into q13-q15, and q3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
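/* For reference, the conversion this kernel appears to implement is the usual
 * BT.601 video-range mapping (the standard is not named anywhere in this
 * file, so treat that label as an inference from the constants):
 *
 *      R = 1.164 * (Y - 16)                      + 1.596 * (V - 128)
 *      G = 1.164 * (Y - 16) - 0.391 * (U - 128)  - 0.813 * (V - 128)
 *      B = 1.164 * (Y - 16) + 2.018 * (U - 128)
 *
 * The 8-bit multipliers below are these factors scaled by 128 and split so
 * that every product fits an unsigned 8x8-bit multiply: 149 ~ 1.164 * 128;
 * 204 * v plus (v >> 1), later halved, gives ~1.598 * 128 for red; 50 and
 * 104 give 0.391 * 128 and 0.813 * 128 for green; (u << 2) + 254 * u, later
 * halved, gives ~2.016 * 128 for blue. The constants in q13-q15 fold in the
 * -16 and -128 offsets, and the final vqrshrn shifts (#6 where a halving
 * step already happened, #7 otherwise) divide back down by 128 with rounding
 * and saturation.
 */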
.macro yuvkern
        vmov.i8     d15, #149

        vmull.u8    q1, d16, d15        // g0 = y0 * 149
        vmull.u8    q5, d17, d15        // g1 = y1 * 149

        vmov.i8     d14, #50
        vmov.i8     d15, #104
        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
        vmlal.u8    q8, d21, d15

        vshr.u8     d14, d21, #1
        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)

        vshll.u8    q7, d20, #2
        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)

        vmov.i8     d14, #204
        vmov.i8     d15, #254
        vmull.u8    q11, d21, d14       // r2 = v * 204
        vmull.u8    q12, d20, d15       // b2 = u * 254

        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1

        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        vqrshrn.u16 d0, q0, #6
        vqrshrn.u16 d1, q1, #7
        vqrshrn.u16 d2, q4, #6
        vqrshrn.u16 d3, q5, #7
        vqrshrn.u16 d4, q2, #6
        vqrshrn.u16 d5, q6, #6

        vzip.u8     q0, q1
        vzip.u8     d4, d5
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Some sections of code are switched out depending on the data packing
 * being handled.
 */
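/* As a rough C-style sketch of the control flow implemented below (names are
 * illustrative only; in the code r2 holds the remaining pixel count):
 *
 *      n -= 16;
 *      while (n >= 0) {                    // whole 16-pixel vectors
 *          load Y and U/V; yuvkern; store 16 RGBA pixels; n -= 16;
 *      }
 *      n += 16;
 *      if (n != 0) {                       // 1..15 pixels left over
 *          load the tail in 8/4/2/1-pixel chunks; yuvkern;
 *          store the tail in 8/4/2/1-pixel chunks;
 *      }
 */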
.macro wrap_line kernel, interleaved=0, swapuv=0

        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vdup.i16    q13, r5
        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vdup.i16    q14, r5
        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vdup.i16    q15, r5

        vmov.i8     q3, #0xff

        subs        r2, #16
        bhs         1f
        b           2f

        .align 4
1:      vld2.u8     {d16,d17}, [r1]!
        pld         [r1, #256]
        .if \interleaved
        vld2.u8     {d20,d21}, [r3]!
        .if \swapuv
        vswp        d20, d21
        .endif
        pld         [r3, #256]
        .else
        vld1.u8     d20, [r3]!
        vld1.u8     d21, [r4]!
        pld         [r3, #128]
        pld         [r4, #128]
        .endif

        \kernel

        subs        r2, #16

        vst4.u8     {d0,d2,d4,d6}, [r0]!
        vst4.u8     {d1,d3,d5,d7}, [r0]!

        bhs         1b

2:      adds        r2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers. It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
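        /* Example (illustrative only): a remainder of 13 pixels is handled as
         * an 8-pixel chunk, a 4-pixel chunk and a 1-pixel chunk; the 2-pixel
         * test is skipped because bit 1 of r2 is clear. Each chunk has a fixed
         * home in the working registers, and the store sequence at the end of
         * the macro writes those same positions back out, so the chunks never
         * need to be recombined into one contiguous run.
         */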
        vmov.i8     q8, #0
        vmov.i8     q10, #0

        tst         r2, #8
        beq         1f
        vld1.u8     d17, [r1]!
        .if \interleaved
        vld1.u8     d21, [r3]!
        .else
        vld1.u32    d20[1], [r3]!
        vld1.u32    d21[1], [r4]!
        .endif

1:      tst         r2, #4
        beq         1f
        vld1.u32    d16[1], [r1]!
        .if \interleaved
        vld1.u32    d20[1], [r3]!
        .else
        vld1.u16    d20[1], [r3]!
        vld1.u16    d21[1], [r4]!
        .endif
1:      tst         r2, #2
        beq         1f
        vld1.u16    d16[1], [r1]!
        .if \interleaved
        vld1.u16    d20[1], [r3]!
        .else
        vld1.u8     d20[1], [r3]!
        vld1.u8     d21[1], [r4]!
        .endif
1:      tst         r2, #1
        beq         1f
        vld1.u8     d16[1], [r1]!
        .if \interleaved
        vld1.u16    d20[0], [r3]!
        .else
        vld1.u8     d20[0], [r3]!
        vld1.u8     d21[0], [r4]!
        .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
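        /* For example, the vuzp.8 below redistributes the linearly-loaded Y
         * bytes so that even-numbered bytes land in d16 and odd-numbered bytes
         * in d17, the same split layout that vld2.u8 produces on the main path
         * and that yuvkern expects.
         */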
1:      vuzp.8      d16, d17
        .if \interleaved
        vuzp.8      d20, d21
        .if \swapuv
        vswp        d20, d21
        .endif
        .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
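        /* The four vzip steps below take the planar result of yuvkern (red in
         * q0, green in q1, blue in q2, constant alpha in q3) and interleave it
         * into R,G,B,A byte order spread across q0-q3, so that the plain vst1
         * stores that follow leave the same memory layout as the vst4 stores
         * on the main path.
         */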
        vzip.8      q0, q2
        vzip.8      q1, q3
        vzip.8      q0, q1
        vzip.8      q2, q3

1:      tst         r2, #8
        beq         1f
        vst1.u8     {d4,d5,d6,d7}, [r0]!

1:      tst         r2, #4
        beq         1f
        vst1.u8     {d2,d3}, [r0]!
1:      tst         r2, #2
        beq         1f
        vst1.u8     d1, [r0]!
1:      tst         r2, #1
        beq         2f
        vst1.u32    d0[1], [r0]!
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uin,    // r2
 *          void const *vin,    // r3
 *          size_t xstart,      // [sp]
 *          size_t xend);       // [sp+#4]
 */
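/* A summary of the setup below (describing only what the instructions do):
 * the arguments are shuffled so that the U and V pointers move to r3 and r4
 * and r2 is free to hold the loop count. Each pointer is then advanced to
 * xstart, i.e. the RGBA output by xstart * 4 bytes, the Y plane by xstart
 * bytes and the half-width U and V planes by xstart / 2 bytes, and r2 becomes
 * xend - xstart, the number of pixels to convert.
 */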
ENTRY(rsdIntrinsicYuv2_K)
        push        {r4,r5}
        ldr         r5, [sp, #8]
        mov         r4, r3
        mov         r3, r2
        ldr         r2, [sp, #12]

        add         r0, r5, LSL #2
        add         r1, r5
        add         r3, r5, LSR #1
        add         r4, r5, LSR #1
        sub         r2, r5

        vpush       {d8-d15}

        wrap_line yuvkern, 0

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
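/* Here xstart is first rounded down to an even pixel (bic r4, r3, #1) so that
 * the chroma offset lands on the start of an interleaved byte pair; the UV
 * pointer then advances by xstart bytes, since the half-width interleaved
 * plane carries one chroma byte per output pixel. wrap_line is invoked with
 * swapuv=1, so the first byte of each pair is treated as V and the second as
 * U, which corresponds to NV21-style ordering (that name is an inference from
 * the register usage, not something stated in this file).
 */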
ENTRY(rsdIntrinsicYuv_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
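/* Identical setup to rsdIntrinsicYuv_K above, but wrap_line is invoked
 * without swapuv, so the first byte of each interleaved chroma pair is used
 * as U (NV12-style ordering, again inferred from the register usage rather
 * than stated here).
 */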
ENTRY(rsdIntrinsicYuvR_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuvR_K)