/*
 *
 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_tiled_to_linear_uv_deinterleave_neon.s
 * @brief   SEC_OMX specific define. It supports the MFC 6.x tiled format.
 * @author  ShinWon Lee (shinwon.lee@samsung.com)
 * @version 1.0
 * @history
 *   2012.02.01 : Create
 */

/*
 * Converts and deinterleaves tiled chroma data to linear for MFC 6.x:
 * the interleaved UV plane of NV12T becomes the separate U and V planes
 * of YUV420P.
 *
 * @param u_dst
 *   U plane address of YUV420P [out]
 *
 * @param v_dst
 *   V plane address of YUV420P [out]
 *
 * @param uv_src
 *   Interleaved UV plane address of NV12T [in]
 *
 * @param yuv420_width
 *   Real width of the YUV420 chroma plane pair [in]. It should be even.
 *
 * @param yuv420_height
 *   Real height of the YUV420 chroma plane pair [in]. It should be even.
 */

47 .arch armv7-a
48 .text
49 .global csc_tiled_to_linear_uv_deinterleave_neon
50 .type csc_tiled_to_linear_uv_deinterleave_neon, %function
51csc_tiled_to_linear_uv_deinterleave_neon:
52 .fnstart
53
54 .equ CACHE_LINE_SIZE, 64
55 .equ PRE_LOAD_OFFSET, 6
56
57 @r0 u_dst
58 @r1 v_dst
59 @r2 uv_src
60 @r3 width
61 @r4 height
62 @r5 i
63 @r6 j
64 @r7 dst_offset
65 @r8 src_offset
66 @r9 aligned_height
67 @r10 aligned_width
68 @r11 tiled_width
69 @r12 temp1
70 @r14 temp2
71
72 stmfd sp!, {r4-r12,r14} @ backup registers
73 ldr r4, [sp, #40] @ r4 = height
74
75 bic r9, r4, #0x7 @ aligned_height = height & (~0x7)
76 bic r10, r3, #0xF @ aligned_width = width & (~0xF)
77 add r11, r3, #15 @ tiled_width = ((width + 15) >> 4) << 4
78 mov r11, r11, asr #4
79 mov r11, r11, lsl #4
80
81 mov r5, #0
82LOOP_MAIN_ALIGNED_HEIGHT:
83 mul r8, r11, r5 @ src_offset = tiled_width * i
84 mov r6, #0
85 add r8, r2, r8 @ src_offset = uv_src + src_offset
86LOOP_MAIN_ALIGNED_WIDTH:
87 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
88 mul r12, r12, r5
89
90 pld [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
91 vld2.8 {q0, q1}, [r8]!
92 add r12, r12, r6, asr #1
93 vld2.8 {q2, q3}, [r8]!
94 add r7, r0, r12 @ dst_offset = u_dst + temp1
95 pld [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
96 vld2.8 {q4, q5}, [r8]!
97 mov r14, r3, asr #1 @ temp2 = width / 2
98 vld2.8 {q6, q7}, [r8]!
99
100 vst1.8 {d0}, [r7], r14
101 vst1.8 {d1}, [r7], r14
102 vst1.8 {d4}, [r7], r14
103 vst1.8 {d5}, [r7], r14
104 vst1.8 {d8}, [r7], r14
105 vst1.8 {d9}, [r7], r14
106 vst1.8 {d12}, [r7], r14
107 vst1.8 {d13}, [r7], r14
108
109 add r7, r1, r12 @ dst_offset = v_dst + temp1
110
111 vst1.8 {d2}, [r7], r14
112 vst1.8 {d3}, [r7], r14
113 vst1.8 {d6}, [r7], r14
114 vst1.8 {d7}, [r7], r14
115 vst1.8 {d10}, [r7], r14
116 vst1.8 {d11}, [r7], r14
117 add r6, r6, #16
118 vst1.8 {d14}, [r7], r14
119 cmp r6, r10
120 vst1.8 {d15}, [r7], r14
121 blt LOOP_MAIN_ALIGNED_WIDTH
122
123MAIN_REMAIN_WIDTH_START:
124 cmp r10, r3 @ if (aligned_width != width) {
125 beq MAIN_REMAIN_WIDTH_END
126 stmfd sp!, {r0-r2,r4} @ backup registers
127 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
128 add r8, r8, r6, lsl #3
129 add r8, r2, r8 @ r8 = uv_src + src_offset
130 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
131 mul r12, r12, r5
132 add r12, r12, r6, asr #1
133 add r7, r0, r12 @ r7 = u_dst + temp1
134 add r12, r1, r12 @ r12 = v_dst + temp1
135 sub r14, r3, r6 @ r14 = (width - j) / 2
136 mov r14, r14, asr #1
137
138 mov r4, #0
139LOOP_MAIN_REMAIN_HEIGHT:
140 mov r0, #0 @ r0 is index in de-interleave
141LOOP_MAIN_REMAIN_WIDTH:
142 ldrb r1, [r8], #1
143 ldrb r2, [r8], #1
144 strb r1, [r7], #1
145 strb r2, [r12], #1
146 add r0, #1
147 cmp r0, r14
148 blt LOOP_MAIN_REMAIN_WIDTH
149
150 sub r8, r8, r14, lsl #1
151 sub r7, r7, r14
152 sub r12, r12, r14
153 add r8, r8, #16
154 add r7, r7, r3, asr #1
155 add r12, r12, r3, asr #1
156
157 add r4, #1
158 cmp r4, #8
159 blt LOOP_MAIN_REMAIN_HEIGHT
160 ldmfd sp!, {r0-r2,r4} @ restore registers
161MAIN_REMAIN_WIDTH_END:
162
163 add r5, r5, #8
164 cmp r5, r9
165 blt LOOP_MAIN_ALIGNED_HEIGHT
166
167REMAIN_HEIGHT_START:
168 cmp r9, r4 @ if (aligned_height != height) {
169 beq REMAIN_HEIGHT_END
170
171 mov r6, #0
172LOOP_REMAIN_HEIGHT_WIDTH16:
173 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
174 add r8, r8, r6, lsl #3
175 add r8, r2, r8 @ src_offset = uv_src + src_offset
176
177 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
178 mul r12, r12, r5
179 add r12, r12, r6, asr #1
180 add r7, r0, r12 @ r7 = u_dst + temp1
181 add r12, r1, r12 @ r12 = v_dst + temp1
182 mov r14, r3, asr #1 @ temp2 = width / 2
183
184 stmfd sp!, {r0-r1} @ backup registers
185 mov r0, #0
186 sub r1, r4, r9
187LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1:
188 vld2.8 {d0, d1}, [r8]!
189 vst1.8 {d0}, [r7], r14
190 vst1.8 {d1}, [r12], r14
191
192 add r0, r0, #1
193 cmp r0, r1
194 blt LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1
195 ldmfd sp!, {r0-r1} @ restore registers
196
197 add r6, r6, #16
198 cmp r6, r10
199 blt LOOP_REMAIN_HEIGHT_WIDTH16
200
201REMAIN_HEIGHT_REMAIN_WIDTH_START:
202 cmp r10, r3
203 beq REMAIN_HEIGHT_REMAIN_WIDTH_END
204 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
205 add r8, r8, r6, lsl #3
206 add r8, r2, r8 @ src_offset = uv_src + src_offset
207
208 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
209 mul r12, r12, r5
210 add r12, r12, r6, asr #1
211 add r7, r0, r12 @ r7 = u_dst + temp1
212 add r12, r1, r12 @ r12 = v_dst + temp1
213 sub r14, r3, r6 @ r14 = (width - j) /2
214 mov r14, r14, asr #1
215
216 stmfd sp!, {r0-r2,r4-r5} @ backup registers
217 mov r0, #0
218 sub r1, r4, r9
219LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1:
220
221 mov r4, #0
222LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx:
223 ldrb r2, [r8], #1
224 ldrb r5, [r8], #1
225 strb r2, [r7], #1
226 strb r5, [r12], #1
227 add r4, #1
228 cmp r4, r14
229 blt LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx
230
231 sub r8, r8, r14, lsl #1
232 sub r7, r7, r14
233 sub r12, r12, r14
234 add r8, r8, #16
235 add r7, r7, r3, asr #1
236 add r12, r12, r3, asr #1
237
238 add r0, r0, #1
239 cmp r0, r1
240 blt LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1
241 ldmfd sp!, {r0-r2,r4-r5} @ restore registers
242
243REMAIN_HEIGHT_REMAIN_WIDTH_END:
244
245REMAIN_HEIGHT_END:
246
247RESTORE_REG:
248 ldmfd sp!, {r4-r12,r15} @ restore registers
249
250 .fnend