/*
 * Copyright (C) 2012,2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
        x0 = dst
        x1 = y0 base pointer
        x2 = y1 base pointer
        x3 = y2 base pointer
        x4 = coeffs
        x5 = length / 2
*/

/* ENTRY(f): switch to .text, align to 4 bytes (2^2), export f as a function symbol and open its label. */
#define ENTRY(f) .text; .p2align 2; .globl f; .type f,%function; f:
/* END(f): record the size of f as the distance from its label to this point. */
#define END(f) .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution across three source rows. Every loop iteration reads
 * 16 bytes from each row, advances each row pointer by 8 bytes and writes
 * 8 bytes (two 4-byte pixels) to dst.
 *
 * In:
 *      x0 = dst
 *      x1 = y0 base pointer
 *      x2 = y1 base pointer
 *      x3 = y2 base pointer
 *      x4 = coeffs: signed 16-bit values; 16 halfwords are loaded,
 *           of which h[0..8] are used
 *      x5 = length / 2 (number of loop iterations); 0 returns immediately
 *
 * Clobbers: x0-x5, v0-v7, v16-v20 and the condition flags.  Only
 * caller-saved vector registers are used, so nothing is spilled and sp
 * is left untouched.
 *
 * NOTE(review): x5 is consumed as a full 64-bit counter.  If the C
 * prototype declares the count as a 32-bit type, the upper half of x5 is
 * not guaranteed to be zero on entry - confirm against the caller.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /* The loop below tests its counter only after the first pass,
           so a zero count has to be rejected up front. */
        cbz         x5, 2f

        /* v0.h[0..7] and v1.h[0] hold the nine coefficients */
        ld1         {v0.8h, v1.8h}, [x4]

        /* x4 is free from here on: reuse it as the 8-byte source stride */
        mov         x4, #8

1:
        /* Fetch 16 bytes per row, post-incrementing each pointer by x4 = 8 */
        ld1         {v18.16b}, [x1], x4
        ld1         {v19.16b}, [x2], x4
        ld1         {v20.16b}, [x3], x4

        /* TODO: test prefetching the data used by the iteration after the next:
                prfm PLDL1KEEP,[x1, x4]
                prfm PLDL1KEEP,[x2, x4]
                prfm PLDL1KEEP,[x3, x4]
        */

        /* Zero-extend the 8-bit channels to 16 bits */
        uxtl        v2.8h, v18.8b
        uxtl2       v3.8h, v18.16b
        uxtl        v4.8h, v19.8b
        uxtl2       v5.8h, v19.16b
        uxtl        v6.8h, v20.8b
        uxtl2       v7.8h, v20.16b

/*
        Each row now holds four consecutive pixels, one per 64-bit half:
                v2lo, v2hi, v3lo, v3hi
                v4lo, v4hi, v5lo, v5hi
                v6lo, v6hi, v7lo, v7hi
        v16 accumulates the first output pixel (taps start at the lo half),
        v17 the second one (taps start one pixel later).
*/

        /* Row y0: coefficients h[0..2] */
        smull       v16.4s, v2.4h, v0.h[0]
        smull2      v17.4s, v2.8h, v0.h[0]
        smlal2      v16.4s, v2.8h, v0.h[1]
        smlal       v17.4s, v3.4h, v0.h[1]
        smlal       v16.4s, v3.4h, v0.h[2]
        smlal2      v17.4s, v3.8h, v0.h[2]

        /* Row y1: coefficients h[3..5] */
        smlal       v16.4s, v4.4h, v0.h[3]
        smlal2      v17.4s, v4.8h, v0.h[3]
        smlal2      v16.4s, v4.8h, v0.h[4]
        smlal       v17.4s, v5.4h, v0.h[4]
        smlal       v16.4s, v5.4h, v0.h[5]
        smlal2      v17.4s, v5.8h, v0.h[5]

        /* Row y2: coefficients h[6..8] */
        smlal       v16.4s, v6.4h, v0.h[6]
        smlal2      v17.4s, v6.8h, v0.h[6]
        smlal2      v16.4s, v6.8h, v0.h[7]
        smlal       v17.4s, v7.4h, v0.h[7]
        smlal       v16.4s, v7.4h, v1.h[0]
        smlal2      v17.4s, v7.8h, v1.h[0]

        /* 32 -> 16 bit: shift right by 8 without rounding */
        shrn        v16.4h, v16.4s, #8
        shrn2       v16.8h, v17.4s, #8

        /* 16 -> 8 bit with unsigned saturation, then store both pixels */
        sqxtun      v16.8b, v16.8h
        st1         {v16.8b}, [x0], #8

        /* Are we done yet? */
        subs        x5, x5, #1
        bne         1b

2:
        /* We're done, bye! */
        ret
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
        x0 = dst
        x1 = y0 base pointer
        x2 = y1 base pointer
        x3 = y2 base pointer
        x4 = y3 base pointer
        x5 = y4 base pointer
        x6 = coeffs
        x7 = length
*/
/*
 * rsdIntrinsicConvolve5x5_K
 *
 * 5x5 convolution across five source rows. Every loop iteration reads
 * 24 bytes from each row, advances each row pointer by 8 bytes and writes
 * 8 bytes (two 4-byte pixels) to dst.
 *
 * In:
 *      x0 = dst
 *      x1 = y0 base pointer    ( y - 2 )
 *      x2 = y1 base pointer    ( y - 1 )
 *      x3 = y2 base pointer    ( y )
 *      x4 = y3 base pointer    ( y + 1 )
 *      x5 = y4 base pointer    ( y + 2 )
 *      x6 = coeffs: signed 16-bit values; 28 halfwords are loaded,
 *           of which the first 25 are used
 *      x7 = length, consumed as the number of loop iterations;
 *           0 returns immediately
 *
 * Clobbers: x0-x7, v0-v5, v16-v22 and the condition flags.  Only
 * caller-saved vector registers are used, so nothing is spilled and sp
 * is left untouched.
 *
 * NOTE(review): x7 is consumed as a full 64-bit counter.  If the C
 * prototype declares the count as a 32-bit type, the upper half of x7 is
 * not guaranteed to be zero on entry - confirm against the caller.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* The loop below tests its counter only after the first pass,
           so a zero count has to be rejected up front. */
        cbz         x7, 2f

        /* Coefficients: v0.h[0..7], v1.h[0..7], v2.h[0..7], v3.h[0] */
        ld1         {v0.8h-v2.8h}, [x6], #48
        ld1         {v3.4h}, [x6]

        /* Bias added to every 32-bit sum before the narrowing shift.
           NOTE(review): rshrn below also rounds on its own, so the total
           bias is 0x7f + 0x80 - confirm that this is intended. */
        movi        v22.4s, #0x7f

        /* x6 is free from here on: reuse it as the 8-byte source stride */
        mov         x6, #8

1:
        /* Rows y0 and y1: fetch 24 bytes each into three D registers,
           post-incrementing the pointers by x6 = 8 */
        ld1         {v16.8b-v18.8b}, [x1], x6       // y0 ( y - 2 )
        ld1         {v19.8b-v21.8b}, [x2], x6       // y1 ( y - 1 )

        /* TODO: test prefetching the data used by the iteration after the next:
                prfm PLDL1KEEP,[x1, x6]
                prfm PLDL1KEEP,[x2, x6]
        */

        /* Zero-extend the 8-bit channels to 16 bits */
        uxtl        v16.8h, v16.8b
        uxtl        v17.8h, v17.8b
        uxtl        v18.8h, v18.8b
        uxtl        v19.8h, v19.8b
        uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b

/*
        Each row now holds six consecutive pixels, one per 64-bit half:
                v16lo, v16hi, v17lo, v17hi, v18lo, v18hi
                v19lo, v19hi, v20lo, v20hi, v21lo, v21hi
        v4 accumulates the first output pixel (taps start at the lo half),
        v5 the second one (taps start one pixel later).
*/
        /* Row y0: coefficients 0..4 */
        smull       v4.4s, v16.4h, v0.h[0]
        smull2      v5.4s, v16.8h, v0.h[0]
        smlal2      v4.4s, v16.8h, v0.h[1]
        smlal       v5.4s, v17.4h, v0.h[1]
        smlal       v4.4s, v17.4h, v0.h[2]
        smlal2      v5.4s, v17.8h, v0.h[2]
        smlal2      v4.4s, v17.8h, v0.h[3]
        smlal       v5.4s, v18.4h, v0.h[3]
        smlal       v4.4s, v18.4h, v0.h[4]
        smlal2      v5.4s, v18.8h, v0.h[4]

        /* Row y1: coefficients 5..9 */
        smlal       v4.4s, v19.4h, v0.h[5]
        smlal2      v5.4s, v19.8h, v0.h[5]
        smlal2      v4.4s, v19.8h, v0.h[6]
        smlal       v5.4s, v20.4h, v0.h[6]
        smlal       v4.4s, v20.4h, v0.h[7]
        smlal2      v5.4s, v20.8h, v0.h[7]
        smlal2      v4.4s, v20.8h, v1.h[0]
        smlal       v5.4s, v21.4h, v1.h[0]
        smlal       v4.4s, v21.4h, v1.h[1]
        smlal2      v5.4s, v21.8h, v1.h[1]

        /* Next 2 rows, same register layout as above */
        ld1         {v16.8b-v18.8b}, [x3], x6       // y2 ( y )
        ld1         {v19.8b-v21.8b}, [x4], x6       // y3 ( y + 1 )

        /* TODO: test prefetching the data used by the iteration after the next:
                prfm PLDL1KEEP,[x3, x6]
                prfm PLDL1KEEP,[x4, x6]
        */

        /* Zero-extend the 8-bit channels to 16 bits */
        uxtl        v16.8h, v16.8b
        uxtl        v17.8h, v17.8b
        uxtl        v18.8h, v18.8b
        uxtl        v19.8h, v19.8b
        uxtl        v20.8h, v20.8b
        uxtl        v21.8h, v21.8b

        /* Row y2: coefficients 10..14 */
        smlal       v4.4s, v16.4h, v1.h[2]
        smlal2      v5.4s, v16.8h, v1.h[2]
        smlal2      v4.4s, v16.8h, v1.h[3]
        smlal       v5.4s, v17.4h, v1.h[3]
        smlal       v4.4s, v17.4h, v1.h[4]
        smlal2      v5.4s, v17.8h, v1.h[4]
        smlal2      v4.4s, v17.8h, v1.h[5]
        smlal       v5.4s, v18.4h, v1.h[5]
        smlal       v4.4s, v18.4h, v1.h[6]
        smlal2      v5.4s, v18.8h, v1.h[6]

        /* Row y3: coefficients 15..19 */
        smlal       v4.4s, v19.4h, v1.h[7]
        smlal2      v5.4s, v19.8h, v1.h[7]
        smlal2      v4.4s, v19.8h, v2.h[0]
        smlal       v5.4s, v20.4h, v2.h[0]
        smlal       v4.4s, v20.4h, v2.h[1]
        smlal2      v5.4s, v20.8h, v2.h[1]
        smlal2      v4.4s, v20.8h, v2.h[2]
        smlal       v5.4s, v21.4h, v2.h[2]
        smlal       v4.4s, v21.4h, v2.h[3]
        smlal2      v5.4s, v21.8h, v2.h[3]

        /* Last row: only v16-v18 are reloaded */
        ld1         {v16.8b-v18.8b}, [x5], x6       // y4 ( y + 2 )

        /* TODO: test prefetching the data used by the iteration after the next:
                prfm PLDL1KEEP,[x5, x6]
        */

        /* Zero-extend the 8-bit channels to 16 bits */
        uxtl        v16.8h, v16.8b
        uxtl        v17.8h, v17.8b
        uxtl        v18.8h, v18.8b

        /* Row y4: coefficients 20..24 */
        smlal       v4.4s, v16.4h, v2.h[4]
        smlal2      v5.4s, v16.8h, v2.h[4]
        smlal2      v4.4s, v16.8h, v2.h[5]
        smlal       v5.4s, v17.4h, v2.h[5]
        smlal       v4.4s, v17.4h, v2.h[6]
        smlal2      v5.4s, v17.8h, v2.h[6]
        smlal2      v4.4s, v17.8h, v2.h[7]
        smlal       v5.4s, v18.4h, v2.h[7]
        smlal       v4.4s, v18.4h, v3.h[0]
        smlal2      v5.4s, v18.8h, v3.h[0]

        /* Apply the bias to both accumulators */
        add         v4.4s, v4.4s, v22.4s
        add         v5.4s, v5.4s, v22.4s

        /* Narrow 32 -> 16 bit with a rounding shift right by 8 */
        rshrn       v4.4h, v4.4s, #8
        rshrn2      v4.8h, v5.4s, #8

        /* Pack 16 -> 8 bit with unsigned saturation: two pixels in one D reg */
        sqxtun      v4.8b, v4.8h

        /* Store the output and advance x0 by 8 bytes */
        st1         {v4.8b}, [x0], #8

        /* Are we done? */
        subs        x7, x7, #1
        bne         1b

2:
        /* Yup, bye */
        ret

END(rsdIntrinsicConvolve5x5_K)