    .arch armv7-a
    .text
    .global csc_ARGB8888_to_YUV420SP_NEON
    .type csc_ARGB8888_to_YUV420SP_NEON, %function
csc_ARGB8888_to_YUV420SP_NEON:
    .fnstart

    @r0     pDstY
    @r1     pDstUV
    @r2     pSrcRGB
    @r3     nWidth
    @r4     pDstY2 = pDstY + nWidth
    @r5     pSrcRGB2 = pSrcRGB + nWidthx4
    @r6     temp7, mask constant (tail loop)
    @r7     temp6, accumulator
    @r8     temp5, nWidthTmp
    @r9     temp4, raw ARGB word
    @r10    temp3, r,g,b
    @r11    temp2, immediate operand
    @r12    temp1, nHeight
    @r14    temp0, debugging pointer

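    @ Overview (inferred from the code below): converts an nWidth x nHeight
    @ ARGB8888 image to YUV420 semi-planar output: a full-resolution Y plane
    @ (pDstY) and a half-resolution interleaved UV plane (pDstUV, U first).
    @ Each outer-loop iteration consumes two source rows; chroma is taken
    @ from the even pixels of the first row of each pair.
    @ Expected C prototype (assumed):
    @   void csc_ARGB8888_to_YUV420SP_NEON(uint8_t *pDstY, uint8_t *pDstUV,
    @                                      uint32_t *pSrcRGB, int nWidth, int nHeight);
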
    .equ CACHE_LINE_SIZE, 32
    .equ PRE_LOAD_OFFSET, 6

    stmfd   sp!, {r4-r12,r14}       @ backup registers
    ldr     r12, [sp, #40]          @ load nHeight
    @ldr    r14, [sp, #44]          @ load pTest
    add     r4, r0, r3              @ r4: pDstY2 = pDstY + nWidth
    add     r5, r2, r3, lsl #2      @ r5: pSrcRGB2 = pSrcRGB + nWidthx4
    sub     r8, r3, #16             @ r8: nWidthTmp = nWidth - 16

    @q0,q1: temp, raw de-interleaved ARGB byte lanes (pixels 0-7)
    @q2,q3: raw de-interleaved ARGB byte lanes (pixels 8-15)
    @q4: R
    @q5: G
    @q6: B
    @q7: temp, Y/V accumulator
    @q8: temp, U accumulator / output

    vmov.u16 q6, #66                @ coefficient assignment
    vmov.u16 q7, #129
    vmov.u16 q8, #25
    vmov.u16 q9, #0x8080            @ 128<<8 + 128 (U/V bias + rounding)

    vmov.u16 q10, #0x1000
    vorr.u16 q10, #0x0080           @ q10 = 0x1080 = 16<<8 + 128 (Y bias + rounding)
    vmov.u16 q11, #38               @ #-38
    vmov.u16 q12, #74               @ #-74
    vmov.u16 q13, #112
    vmov.u16 q14, #94               @ #-94
    vmov.u16 q15, #18               @ #-18

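    @ The coefficients above implement the usual fixed-point BT.601-style
    @ conversion; the per-pixel formulas computed by the loops below are:
    @   Y = ( 66*R + 129*G +  25*B + 0x1080) >> 8    @ 0x1080 = (16<<8)  + 128
    @   U = (-38*R -  74*G + 112*B + 0x8080) >> 8    @ 0x8080 = (128<<8) + 128
    @   V = (112*R -  94*G -  18*B + 0x8080) >> 8
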
LOOP_NHEIGHT2:
    stmfd   sp!, {r12}              @ backup nHeight counter

LOOP_NWIDTH16:
    pld     [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    @------------------------------------------- YUV ------------------------------------------
    vmov.u16 q14, #94               @ #94 (U/V pass reuses q14/q15)
    vmov.u16 q15, #18               @ #18
    vld4.8  {d0,d1,d2,d3}, [r2]!    @ load 8 ARGB pixels, de-interleaving byte lanes
    vld4.8  {d4,d5,d6,d7}, [r2]!    @ load next 8 ARGB pixels, de-interleaving byte lanes


    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vand.u16 q4, #0x00FF            @R
    vand.u16 q5, #0x00FF            @G
    vand.u16 q6, #0x00FF            @B
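    @ Note (describing the block above): after vld4, d2/d6 hold the R byte of
    @ each pixel, d1/d5 the G byte, d0/d4 the B byte. Copying them into q4-q6
    @ and masking each u16 lane with 0x00FF keeps the even-numbered pixels;
    @ the odd-numbered pixels are recovered later with a >>8 on the same data.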

    vmov.u16 q8, q9                 @ CalcU(): q8 = 128<<8 + 128
    vmla.u16 q8, q6, q13            @ + 112 * B[k]
    vmls.u16 q8, q4, q11            @ -  38 * R[k]
    vmls.u16 q8, q5, q12            @ -  74 * G[k]
    vshr.u16 q8, q8, #8             @ U = (128<<8 + 128 + u) >> 8

    vmov.u16 q7, q9                 @ CalcV(): q7 = 128<<8 + 128
    vmla.u16 q7, q4, q13            @ + 112 * R[k]
    vmls.u16 q7, q5, q14            @ -  94 * G[k]
    vmls.u16 q7, q6, q15            @ -  18 * B[k]
    vshr.u16 q7, q7, #8             @ V = (128<<8 + 128 + v) >> 8

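    @ vtrn.8 swaps the odd bytes of q8 with the even bytes of q7; since both
    @ results fit in the low byte of each u16 lane, q8 ends up holding the
    @ interleaved chroma row U0,V0,U1,V1,... for the semi-planar UV plane.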
    vtrn.8  q8, q7
    vst1.8  {q8}, [r1]!             @ write 8 interleaved UV pairs to pDstUV

    @------------------------------------------- Y ------------------------------------------

    vmov.u16 q14, #66               @ reload Y coefficients
    vmov.u16 q15, #129
    vmov.u16 q8, #25

    @ CalcY() for the even pixels (q4-q6 still hold their R,G,B)

    vmul.u16 q7, q4, q14            @ q7  =  66 * R[k]
    vmla.u16 q7, q5, q15            @ q7 += 129 * G[k]
    vmla.u16 q7, q6, q8             @ q7 +=  25 * B[k]

    vadd.u16 q7, q7, q10            @ + (16<<8 + 128)
    vshr.u16 q7, q7, #8

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4, q4, #8             @ R of odd pixels
    vshr.u16 q5, q5, #8             @ G of odd pixels
    vshr.u16 q6, q6, #8             @ B of odd pixels

    vmul.u16 q0, q4, q14            @ q0  =  66 * R[k]
    vmla.u16 q0, q5, q15            @ q0 += 129 * G[k]
    vmla.u16 q0, q6, q8             @ q0 +=  25 * B[k]
    vadd.u16 q0, q0, q10            @ + (16<<8 + 128)
    vshr.u16 q0, q0, #8

    vtrn.8  q7, q0                  @ re-interleave even/odd Y values
    vst1.8  {q7}, [r0]!             @ write 16 Y bytes of row 1 to pDstY

    @------------------------------------------- Y ------------------------------------------

    @--------------------------------------- Y1 (second row) --------------------------------

    pld     [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld4.8  {d0,d1,d2,d3}, [r5]!    @ load 8 ARGB pixels of row 2, de-interleaving byte lanes
    vld4.8  {d4,d5,d6,d7}, [r5]!    @ load next 8 ARGB pixels of row 2, de-interleaving byte lanes

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vand.u16 q4, #0x00FF            @ R of even pixels
    vand.u16 q5, #0x00FF            @ G of even pixels
    vand.u16 q6, #0x00FF            @ B of even pixels

    vmul.u16 q7, q4, q14            @ q7  =  66 * R[k]
    vmla.u16 q7, q5, q15            @ q7 += 129 * G[k]
    vmla.u16 q7, q6, q8             @ q7 +=  25 * B[k]
    vadd.u16 q7, q7, q10            @ + (16<<8 + 128)
    vshr.u16 q7, q7, #8

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4, q4, #8             @ R of odd pixels
    vshr.u16 q5, q5, #8             @ G of odd pixels
    vshr.u16 q6, q6, #8             @ B of odd pixels

    vmul.u16 q0, q4, q14            @ q0  =  66 * R[k]
    vmla.u16 q0, q5, q15            @ q0 += 129 * G[k]
    vmla.u16 q0, q6, q8             @ q0 +=  25 * B[k]
    vadd.u16 q0, q0, q10            @ + (16<<8 + 128)
    vshr.u16 q0, q0, #8

    vtrn.8  q7, q0                  @ re-interleave even/odd Y values
    vst1.8  {q7}, [r4]!             @ write 16 Y bytes of row 2 to pDstY2

    subs    r8, r8, #16             @ nWidth16 -= 16
    BPL     LOOP_NWIDTH16           @ loop while nWidth16 >= 0
    @----------------------------------- unaligned tail ---------------------------------------

    adds    r8, r8, #16             @ restore remainder (0..15 pixels left)
    BEQ     NO_UNALIGNED            @ skip if nWidth is a multiple of 16
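
    @ Tail loop: handles the remaining (nWidth % 16) pixels two at a time with
    @ scalar ARM code. Each Y pass packs the two pixels' sums into one
    @ register (first pixel in the low halfword, second pixel in the high
    @ halfword), so a single packed bias 0x10801080 is added before the two
    @ strb stores. UV is computed from the first pixel of each pair only.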
LOOP_NWIDTH2:
    @---------------------------------- row 1 -- Y ------------------------------------------
    @stmfd sp!, {r14}               @ backup r14

    ldr     r9, [r2], #4            @ load pixel 0 (ARGB word)
    ldr     r12, [r2], #4           @ load pixel 1 (ARGB word)

    mov     r10, r9, lsr #16        @ pixel 0: move R into bits 0-7
    mov     r14, r12                @ pixel 1: R already in bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ R of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ R of pixel 1 (kept in the high halfword)
    add     r10, r10, r14           @ pack both R values into one register

    mov     r11, #66                @ accumulator = R * 66
    mul     r7, r10, r11

    mov     r10, r9, lsr #8         @ pixel 0: move G into bits 0-7
    mov     r14, r12, lsl #8        @ pixel 1: move G into bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ G of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ G of pixel 1
    add     r10, r10, r14

    mov     r11, #129               @ accumulator += G * 129
    mla     r7, r10, r11, r7

    mov     r10, r9                 @ pixel 0: B already in bits 0-7
    mov     r14, r12, lsl #16       @ pixel 1: move B into bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ B of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ B of pixel 1
    add     r10, r10, r14

    mov     r11, #25                @ accumulator += B * 25
    mla     r7, r10, r11, r7

    ldr     r6, =0x10801080
    add     r7, r6                  @ add (16<<8 + 128) to both packed sums

    lsr     r7, #8
    strb    r7, [r0], #1            @ Y of pixel 0
    lsr     r7, #16
    strb    r7, [r0], #1            @ Y of pixel 1
    @ldmfd sp!, {r14}               @ load r14

    @---------------------------------- UV (from row-1 pixel 0) -----------------------------

    mov     r10, r9                 @ pixel 0
    ldr     r7, =0x00008080         @ r7:  U accumulator = 128<<8 + 128
    mov     r12, r7                 @ r12: V accumulator = 128<<8 + 128

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ B

    mov     r11, #112               @ U accumulator += B * 112
    mla     r7, r10, r11, r7

    mov     r11, #18                @ V accumulator -= B * 18
    mul     r11, r10, r11
    sub     r12, r12, r11

    mov     r10, r9, lsr #16
    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ R

    mov     r11, #38                @ U accumulator -= R * 38
    mul     r11, r10, r11
    sub     r7, r7, r11

    mov     r11, #112               @ V accumulator += R * 112
    mla     r12, r10, r11, r12

    mov     r10, r9, lsr #8
    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ G

    mov     r11, #74                @ U accumulator -= G * 74
    mul     r11, r10, r11
    sub     r7, r7, r11

    mov     r11, #94                @ V accumulator -= G * 94
    mul     r11, r10, r11
    sub     r12, r12, r11

    lsr     r7, #8                  @ U >>= 8
    strb    r7, [r1], #1            @ store U
    lsr     r12, #8                 @ V >>= 8
    strb    r12, [r1], #1           @ store V

    @---------------------------------- row 2 -- Y ------------------------------------------
    @stmfd sp!, {r14}               @ backup r14

    ldr     r9, [r5], #4            @ load pixel 0 of row 2 (ARGB word)
    ldr     r12, [r5], #4           @ load pixel 1 of row 2 (ARGB word)

    mov     r10, r9, lsr #16        @ pixel 0: move R into bits 0-7
    mov     r14, r12                @ pixel 1: R already in bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ R of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ R of pixel 1
    add     r10, r10, r14

    mov     r11, #66                @ accumulator = R * 66
    mul     r7, r10, r11

    mov     r10, r9, lsr #8         @ pixel 0: move G into bits 0-7
    mov     r14, r12, lsl #8        @ pixel 1: move G into bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ G of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ G of pixel 1
    add     r10, r10, r14

    mov     r11, #129               @ accumulator += G * 129
    mla     r7, r10, r11, r7

    mov     r10, r9                 @ pixel 0: B already in bits 0-7
    mov     r14, r12, lsl #16       @ pixel 1: move B into bits 16-23

    ldr     r6, =0x000000FF
    and     r10, r10, r6            @ B of pixel 0
    ldr     r6, =0x00FF0000
    and     r14, r14, r6            @ B of pixel 1
    add     r10, r10, r14

    mov     r11, #25                @ accumulator += B * 25
    mla     r7, r10, r11, r7

    ldr     r6, =0x10801080
    add     r7, r6                  @ add (16<<8 + 128) to both packed sums
    lsr     r7, #8

    strb    r7, [r4], #1            @ Y of pixel 0
    lsr     r7, #16
    strb    r7, [r4], #1            @ Y of pixel 1
    @ldmfd sp!, {r14}               @ load r14

    subs    r8, r8, #2              @ nWidth2 -= 2
    BGT     LOOP_NWIDTH2            @ loop while nWidth2 > 0

NO_UNALIGNED:                       @ reached directly when nWidth is a multiple of 16

    @-----------------------------------------------------------------------------
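    @ End of a two-row pass. Each pointer below has already advanced by one
    @ row inside the loops above, so adding one more row's worth skips the
    @ row that its partner pointer (pDstY2 / pSrcRGB2) just processed.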
    sub     r8, r3, #16             @ r8: nWidthTmp = nWidth - 16
    add     r0, r0, r3              @ pDstY    += nWidth
    add     r2, r2, r3, lsl #2      @ pSrcRGB  += nWidth*4
    add     r4, r4, r3              @ pDstY2   += nWidth
    add     r5, r5, r3, lsl #2      @ pSrcRGB2 += nWidth*4

    ldmfd   sp!, {r12}              @ reload nHeight counter
    subs    r12, r12, #2            @ nHeight -= 2
    BGT     LOOP_NHEIGHT2           @ loop while nHeight > 0

    ldmfd   sp!, {r4-r12,pc}        @ restore registers and return
    .fnend