blob: 8dbf72777bf4f33edd741a0716e163e088ddb4bb [file] [log] [blame]
/*

Copyright (c) 2013 STMicroelectronics
Written by Christophe Lyon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

*/
Christophe Lyon01af0a52013-01-17 17:23:11 +010025
Christophe Lyon1775be02014-07-10 13:46:54 +020026#if defined(__arm__) || defined(__aarch64__)
Christophe Lyon01af0a52013-01-17 17:23:11 +010027#include <arm_neon.h>
28#else
29#include "stm-arm-neon.h"
30#endif
31#include "stm-arm-neon-ref.h"
32
/* Initialization helpers; 4 slices are needed for vld2, vld3 and
   vld4.

   MY_INIT_TABk(T,W,N) forwards to the initializer-list macro obtained
   from xNAME(INIT_TABk,N), instantiated for element type T##W##_t
   (e.g. int, 8 -> int8_t).  xNAME is defined elsewhere; it is expected
   to join its two arguments into a single identifier such as
   INIT_TAB2_4 -- TODO confirm against its definition.  */
#define MY_INIT_TAB(T,W,N)  xNAME(INIT_TAB,N)(T##W##_t)
#define MY_INIT_TAB2(T,W,N) xNAME(INIT_TAB2,N)(T##W##_t)
#define MY_INIT_TAB3(T,W,N) xNAME(INIT_TAB3,N)(T##W##_t)
#define MY_INIT_TAB4(T,W,N) xNAME(INIT_TAB4,N)(T##W##_t)
39
/* Initialized input buffers: define the array named by
   VECT_VAR_DECL(V,T,W,N) and fill it with the N-element sample
   vector.  The expansion already ends with ';', so a ';' written at
   the call site only adds an empty declaration.  */
#define VECT_VAR_DECL_INIT(V, T, W, N)				\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) };

/* Specialized initializer with 4 entries, as used by vldX_dup and
   vdup tests, which iterate 4 times on input buffers.  N is used
   only to name the variable; the contents always come from the
   4-element sample vector.  */
#define VECT_VAR_DECL_INIT4(V, T, W, N)				\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,4) };
48
/* Initializers for arrays of vectors.  VECT_ARRAY_INITk defines the
   array named by VECT_ARRAY_VAR(V,T,W,N,k), of element type T##W##_t,
   holding the first k consecutive N-element sample slices.  As with
   VECT_VAR_DECL_INIT, the expansion already ends with ';'.  */
#define VECT_ARRAY_INIT2(V, T, W, N)				\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,2)[] =			\
    { MY_INIT_TAB(T,W,N)					\
      MY_INIT_TAB2(T,W,N) };

#define VECT_ARRAY_INIT3(V, T, W, N)				\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,3)[] =			\
    { MY_INIT_TAB(T,W,N)					\
      MY_INIT_TAB2(T,W,N)					\
      MY_INIT_TAB3(T,W,N) };

#define VECT_ARRAY_INIT4(V, T, W, N)				\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,4)[] =			\
    { MY_INIT_TAB(T,W,N)					\
      MY_INIT_TAB2(T,W,N)					\
      MY_INIT_TAB3(T,W,N)					\
      MY_INIT_TAB4(T,W,N) };
67
/* Sample initialization vectors.  Each INIT_TABk_N(T) expands to N
   comma-terminated values cast to T; slices 1..4 continue the same
   ascending sequence, which starts at -16.  */

/* 1-element slices. */
#define INIT_TAB_1(T)  (T)-16,
#define INIT_TAB2_1(T) (T)-15,
#define INIT_TAB3_1(T) (T)-14,
#define INIT_TAB4_1(T) (T)-13,
77
/* 2-element slices: -16..-9 spread over the four slices. */
#define INIT_TAB_2(T)  (T)-16, (T)-15,
#define INIT_TAB2_2(T) (T)-14, (T)-13,
#define INIT_TAB3_2(T) (T)-12, (T)-11,
#define INIT_TAB4_2(T) (T)-10, (T)-9,
86
/* Initializer for vld3_lane tests: a single 3-element slice (there
   are no INIT_TAB2_3..INIT_TAB4_3 counterparts here).  */
#define INIT_TAB_3(T) (T)-16, (T)-15, (T)-14,
90
/* 4-element slices: -16..-1 spread over the four slices. */
#define INIT_TAB_4(T)  (T)-16, (T)-15, (T)-14, (T)-13,
#define INIT_TAB2_4(T) (T)-12, (T)-11, (T)-10, (T)-9,
#define INIT_TAB3_4(T) (T)-8, (T)-7, (T)-6, (T)-5,
#define INIT_TAB4_4(T) (T)-4, (T)-3, (T)-2, (T)-1,
99
/* 8-element slices: -16..15 spread over the four slices. */
#define INIT_TAB_8(T)							\
  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,
#define INIT_TAB2_8(T)							\
  (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1,
#define INIT_TAB3_8(T)							\
  (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7,
#define INIT_TAB4_8(T)							\
  (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15,
108
/* 16-element slices: -16..47 spread over the four slices. */
#define INIT_TAB_16(T)							\
  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,	\
  (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1,
#define INIT_TAB2_16(T)							\
  (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7,			\
  (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15,
#define INIT_TAB3_16(T)							\
  (T)16, (T)17, (T)18, (T)19, (T)20, (T)21, (T)22, (T)23,		\
  (T)24, (T)25, (T)26, (T)27, (T)28, (T)29, (T)30, (T)31,
#define INIT_TAB4_16(T)							\
  (T)32, (T)33, (T)34, (T)35, (T)36, (T)37, (T)38, (T)39,		\
  (T)40, (T)41, (T)42, (T)43, (T)44, (T)45, (T)46, (T)47,
121
122/* Input buffers, one of each size. */
123/* Insert some padding to try to exhibit out of bounds accesses. */
124VECT_VAR_DECL_INIT(buffer, int, 8, 8);
125PAD(buffer_pad, int, 8, 8);
126VECT_VAR_DECL_INIT(buffer, int, 16, 4);
127PAD(buffer_pad, int, 16, 4);
128VECT_VAR_DECL_INIT(buffer, int, 32, 2);
129PAD(buffer_pad, int, 32, 2);
130VECT_VAR_DECL_INIT(buffer, int, 64, 1);
131PAD(buffer_pad, int, 64, 1);
132VECT_VAR_DECL_INIT(buffer, uint, 8, 8);
133PAD(buffer_pad, uint, 8, 8);
Christophe Lyon80902f62013-03-29 16:26:42 +0100134VECT_VAR_DECL_INIT(buffer, poly, 8, 8);
135PAD(buffer_pad, poly, 8, 8);
136VECT_VAR_DECL_INIT(buffer, poly, 16, 4);
137PAD(buffer_pad, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100138VECT_VAR_DECL_INIT(buffer, uint, 16, 4);
139PAD(buffer_pad, uint, 16, 4);
140VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
141PAD(buffer_pad, uint, 32, 2);
142VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
143PAD(buffer_pad, uint, 64, 1);
144VECT_VAR_DECL_INIT(buffer, float, 32, 2);
145PAD(buffer_pad, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200146#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200147/* We need a different initialization for ARMCC, because the compiler
148 performs the conversion to half-precision internal
149 representation. */
150#ifdef __ARMCC_VERSION
151__fp16 buffer_float16x4[4] = {-16, -15, -14, -13};
152#else
153VECT_VAR_DECL(buffer, float, 16, 4) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
154 0xcb00 /* -14 */, 0xca80 /* -13 */};
155#endif
156PAD(buffer_pad, float, 16, 4);
157#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100158VECT_VAR_DECL_INIT(buffer, int, 8, 16);
159PAD(buffer_pad, int, 8, 16);
160VECT_VAR_DECL_INIT(buffer, int, 16, 8);
161PAD(buffer_pad, int, 16, 8);
162VECT_VAR_DECL_INIT(buffer, int, 32, 4);
163PAD(buffer_pad, int, 32, 4);
164VECT_VAR_DECL_INIT(buffer, int, 64, 2);
165PAD(buffer_pad, int, 64, 2);
166VECT_VAR_DECL_INIT(buffer, uint, 8, 16);
167PAD(buffer_pad, uint, 8, 16);
168VECT_VAR_DECL_INIT(buffer, uint, 16, 8);
169PAD(buffer_pad, uint, 16, 8);
170VECT_VAR_DECL_INIT(buffer, uint, 32, 4);
171PAD(buffer_pad, uint, 32, 4);
172VECT_VAR_DECL_INIT(buffer, uint, 64, 2);
173PAD(buffer_pad, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100174VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
175PAD(buffer_pad, poly, 8, 16);
176VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
177PAD(buffer_pad, poly, 16, 8);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100178VECT_VAR_DECL_INIT(buffer, float, 32, 4);
179PAD(buffer_pad, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200180#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200181#ifdef __ARMCC_VERSION
182__fp16 buffer_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
183#else
184VECT_VAR_DECL(buffer, float, 16, 8) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
185 0xcb00 /* -14 */, 0xca80 /* -13 */,
186 0xca00 /* -12 */, 0xc980 /* -11 */,
187 0xc900 /* -10 */, 0xc880 /* -9 */};
188#endif
189PAD(buffer_pad, float, 16, 8);
190#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100191
192/* The tests for vld1_dup and vdup expect at least 4 entries in the
193 input buffer, so force 1- and 2-elements initializers to have 4
194 entries. */
195VECT_VAR_DECL_INIT(buffer_dup, int, 8, 8);
196VECT_VAR_DECL(buffer_dup_pad, int, 8, 8);
197VECT_VAR_DECL_INIT(buffer_dup, int, 16, 4);
198VECT_VAR_DECL(buffer_dup_pad, int, 16, 4);
199VECT_VAR_DECL_INIT4(buffer_dup, int, 32, 2);
200VECT_VAR_DECL(buffer_dup_pad, int, 32, 2);
201VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 1);
202VECT_VAR_DECL(buffer_dup_pad, int, 64, 1);
203VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 8);
204VECT_VAR_DECL(buffer_dup_pad, uint, 8, 8);
205VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 4);
206VECT_VAR_DECL(buffer_dup_pad, uint, 16, 4);
207VECT_VAR_DECL_INIT4(buffer_dup, uint, 32, 2);
208VECT_VAR_DECL(buffer_dup_pad, uint, 32, 2);
209VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 1);
210VECT_VAR_DECL(buffer_dup_pad, uint, 64, 1);
Christophe Lyon80902f62013-03-29 16:26:42 +0100211VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
212VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
213VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
214VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100215VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2);
216VECT_VAR_DECL(buffer_dup_pad, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200217#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200218#ifdef __ARMCC_VERSION
219__fp16 buffer_dup_float16x4[4] = {-16, -15, -14, -13};
220#else
221VECT_VAR_DECL(buffer_dup, float, 16, 4)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
222 0xcb00 /* -14 */, 0xca80 /* -13 */};
223#endif
224PAD(buffer_dup_pad, float, 16, 4);
225#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100226VECT_VAR_DECL_INIT(buffer_dup, int, 8, 16);
227VECT_VAR_DECL(buffer_dup_pad, int, 8, 16);
228VECT_VAR_DECL_INIT(buffer_dup, int, 16, 8);
229VECT_VAR_DECL(buffer_dup_pad, int, 16, 8);
230VECT_VAR_DECL_INIT(buffer_dup, int, 32, 4);
231VECT_VAR_DECL(buffer_dup_pad, int, 32, 4);
232VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 2);
233VECT_VAR_DECL(buffer_dup_pad, int, 64, 2);
234VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 16);
235VECT_VAR_DECL(buffer_dup_pad, uint, 8, 16);
236VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 8);
237VECT_VAR_DECL(buffer_dup_pad, uint, 16, 8);
238VECT_VAR_DECL_INIT(buffer_dup, uint, 32, 4);
239VECT_VAR_DECL(buffer_dup_pad, uint, 32, 4);
240VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 2);
241VECT_VAR_DECL(buffer_dup_pad, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100242VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
243VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
244VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
245VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100246VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4);
247VECT_VAR_DECL(buffer_dup_pad, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200248#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200249#ifdef __ARMCC_VERSION
250__fp16 buffer_dup_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
251#else
252VECT_VAR_DECL(buffer_dup, float, 16, 8)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
253 0xcb00 /* -14 */, 0xca80 /* -13 */,
254 0xca00 /* -12 */, 0xc980 /* -11 */,
255 0xc900 /* -10 */, 0xc880 /* -9 */};
256#endif
257PAD(buffer_dup_pad, float, 16, 8);
258#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100259
260/* Input buffers for vld2, 1 of each size */
261VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8);
262PAD(buffer_vld2_pad, int, 8, 8);
263VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4);
264PAD(buffer_vld2_pad, int, 16, 4);
265VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2);
266PAD(buffer_vld2_pad, int, 32, 2);
267VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1);
268PAD(buffer_vld2_pad, int, 64, 1);
269VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8);
270PAD(buffer_vld2_pad, uint, 8, 8);
271VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4);
272PAD(buffer_vld2_pad, uint, 16, 4);
273VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2);
274PAD(buffer_vld2_pad, uint, 32, 2);
275VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1);
276PAD(buffer_vld2_pad, uint, 64, 1);
Christophe Lyon80902f62013-03-29 16:26:42 +0100277VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8);
278PAD(buffer_vld2_pad, poly, 8, 8);
279VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
280PAD(buffer_vld2_pad, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100281VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
282PAD(buffer_vld2_pad, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200283#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200284#ifdef __ARMCC_VERSION
285__fp16 buffer_vld2_float16x4x2[4*2] = {-16, -15, -14, -13, -12, -11, -10, -9};
286#else
287float16_t buffer_vld2_float16x4x2[4*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
288 0xcb00 /* -14 */, 0xca80 /* -13 */,
289 0xca00 /* -12 */, 0xc980 /* -11 */,
290 0xc900 /* -10 */, 0xc880 /* -9 */};
291#endif
292PAD(buffer_vld2_pad, float, 16, 4);
293#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100294VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16);
295PAD(buffer_vld2_pad, int, 8, 16);
296VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8);
297PAD(buffer_vld2_pad, int, 16, 8);
298VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4);
299PAD(buffer_vld2_pad, int, 32, 4);
300VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2);
301PAD(buffer_vld2_pad, int, 64, 2);
302VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16);
303PAD(buffer_vld2_pad, uint, 8, 16);
304VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8);
305PAD(buffer_vld2_pad, uint, 16, 8);
306VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4);
307PAD(buffer_vld2_pad, uint, 32, 4);
308VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2);
309PAD(buffer_vld2_pad, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100310VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16);
311PAD(buffer_vld2_pad, poly, 8, 16);
312VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
313PAD(buffer_vld2_pad, poly, 16, 8);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100314VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
315PAD(buffer_vld2_pad, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200316#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200317#ifdef __ARMCC_VERSION
318__fp16 buffer_vld2_float16x8x2[8*2] = {-16, -15, -14, -13, -12, -11, -10, -9,
319 -8, -7, -6, -5, -4, -3, -2, -1};
320#else
321float16_t buffer_vld2_float16x8x2[8*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
322 0xcb00 /* -14 */, 0xca80 /* -13 */,
323 0xca00 /* -12 */, 0xc980 /* -11 */,
324 0xc900 /* -10 */, 0xc880 /* -9 */,
325 0xc800 /* -8 */, 0xc700 /* -7 */,
326 0xc600 /* -6 */, 0xc500 /* -5 */,
327 0xc400 /* -4 */, 0xc200 /* -3 */,
328 0xc000 /* -2 */, 0xbc00 /* -1 */};
329#endif
330PAD(buffer_vld2_pad, float, 16, 8);
331#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100332
333/* Input buffers for vld3, 1 of each size */
334VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8);
335PAD(buffer_vld3_pad, int, 8, 8);
336VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4);
337PAD(buffer_vld3_pad, int, 16, 4);
338VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2);
339PAD(buffer_vld3_pad, int, 32, 2);
340VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1);
341PAD(buffer_vld3_pad, int, 64, 1);
342VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8);
343PAD(buffer_vld3_pad, uint, 8, 8);
344VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4);
345PAD(buffer_vld3_pad, uint, 16, 4);
346VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2);
347PAD(buffer_vld3_pad, uint, 32, 2);
348VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1);
349PAD(buffer_vld3_pad, uint, 64, 1);
Christophe Lyon80902f62013-03-29 16:26:42 +0100350VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8);
351PAD(buffer_vld3_pad, poly, 8, 8);
352VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
353PAD(buffer_vld3_pad, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100354VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
355PAD(buffer_vld3_pad, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200356#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200357#ifdef __ARMCC_VERSION
358__fp16 buffer_vld3_float16x4x3[4*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
359 -8, -7, -6, -5};
360#else
361float16_t buffer_vld3_float16x4x3[4*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
362 0xcb00 /* -14 */, 0xca80 /* -13 */,
363 0xca00 /* -12 */, 0xc980 /* -11 */,
364 0xc900 /* -10 */, 0xc880 /* -9 */,
365 0xc800 /* -8 */, 0xc700 /* -7 */,
366 0xc600 /* -6 */, 0xc500 /* -5 */};
367#endif
368PAD(buffer_vld3_pad, float, 16, 4);
369#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100370VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16);
371PAD(buffer_vld3_pad, int, 8, 16);
372VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8);
373PAD(buffer_vld3_pad, int, 16, 8);
374VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4);
375PAD(buffer_vld3_pad, int, 32, 4);
376VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2);
377PAD(buffer_vld3_pad, int, 64, 2);
378VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16);
379PAD(buffer_vld3_pad, uint, 8, 16);
380VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8);
381PAD(buffer_vld3_pad, uint, 16, 8);
382VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4);
383PAD(buffer_vld3_pad, uint, 32, 4);
384VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2);
385PAD(buffer_vld3_pad, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100386VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16);
387PAD(buffer_vld3_pad, poly, 8, 16);
388VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
389PAD(buffer_vld3_pad, poly, 16, 8);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100390VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
391PAD(buffer_vld3_pad, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200392#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200393#ifdef __ARMCC_VERSION
394__fp16 buffer_vld3_float16x8x3[8*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
395 -8, -7, -6, -5, -4, -3, -2, -1,
396 0, 1, 2, 3, 4, 5, 6, 7};
397#else
398float16_t buffer_vld3_float16x8x3[8*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
399 0xcb00 /* -14 */, 0xca80 /* -13 */,
400 0xca00 /* -12 */, 0xc980 /* -11 */,
401 0xc900 /* -10 */, 0xc880 /* -9 */,
402 0xc800 /* -8 */, 0xc700 /* -7 */,
403 0xc600 /* -6 */, 0xc500 /* -6 */,
404 0xc400 /* -4 */, 0xc200 /* -3 */,
405 0xc000 /* -2 */, 0xbc00 /* -1 */,
406 0, 0x3c00 /* 1 */,
407 0x4000 /* 2 */, 0x4200 /* 3 */,
408 0x4400 /* 4 */, 0x4500 /* 5 */,
409 0x4600 /* 6 */, 0x4700 /* 7 */};
410#endif
411PAD(buffer_vld3_pad, float, 16, 8);
412#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100413
414/* Input buffers for vld4, 1 of each size */
415VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8);
416PAD(buffer_vld4_pad, int, 8, 8);
417VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4);
418PAD(buffer_vld4_pad, int, 16, 4);
419VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2);
420PAD(buffer_vld4_pad, int, 32, 2);
421VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1);
422PAD(buffer_vld4_pad, int, 64, 1);
423VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8);
424PAD(buffer_vld4_pad, uint, 8, 8);
425VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4);
426PAD(buffer_vld4_pad, uint, 16, 4);
427VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2);
428PAD(buffer_vld4_pad, uint, 32, 2);
429VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1);
430PAD(buffer_vld4_pad, uint, 64, 1);
Christophe Lyon80902f62013-03-29 16:26:42 +0100431VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8);
432PAD(buffer_vld4_pad, poly, 8, 8);
433VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
434PAD(buffer_vld4_pad, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100435VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
436PAD(buffer_vld4_pad, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200437#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200438#ifdef __ARMCC_VERSION
439__fp16 buffer_vld4_float16x4x4[4*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
440 -8, -7, -6, -5, -4, -3, -2, -1};
441#else
442float16_t buffer_vld4_float16x4x4[4*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
443 0xcb00 /* -14 */, 0xca80 /* -13 */,
444 0xca00 /* -12 */, 0xc980 /* -11 */,
445 0xc900 /* -10 */, 0xc880 /* -9 */,
446 0xc800 /* -8 */, 0xc700 /* -7 */,
447 0xc600 /* -6 */, 0xc500 /* -5 */,
448 0xc400 /* -4 */, 0xc200 /* -3 */,
449 0xc000 /* -2 */, 0xbc00 /* -1 */};
450#endif
451PAD(buffer_vld4_pad, float, 16, 4);
452#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100453VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16);
454PAD(buffer_vld4_pad, int, 8, 16);
455VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8);
456PAD(buffer_vld4_pad, int, 16, 8);
457VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4);
458PAD(buffer_vld4_pad, int, 32, 4);
459VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2);
460PAD(buffer_vld4_pad, int, 64, 2);
461VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16);
462PAD(buffer_vld4_pad, uint, 8, 16);
463VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8);
464PAD(buffer_vld4_pad, uint, 16, 8);
465VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4);
466PAD(buffer_vld4_pad, uint, 32, 4);
467VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2);
468PAD(buffer_vld4_pad, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100469VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16);
470PAD(buffer_vld4_pad, poly, 8, 16);
471VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
472PAD(buffer_vld4_pad, poly, 16, 8);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100473VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
474PAD(buffer_vld4_pad, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200475#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200476#ifdef __ARMCC_VERSION
477__fp16 buffer_vld4_float16x8x4[8*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
478 -8, -7, -6, -5, -4, -3, -2, -1,
479 0, 1, 2, 3, 4, 5, 6, 7,
480 8, 9, 10, 11, 12, 13, 14, 15};
481#else
482float16_t buffer_vld4_float16x8x4[8*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
483 0xcb00 /* -14 */, 0xca80 /* -13 */,
484 0xca00 /* -12 */, 0xc980 /* -11 */,
485 0xc900 /* -10 */, 0xc880 /* -9 */,
486 0xc800 /* -8 */, 0xc700 /* -7 */,
487 0xc600 /* -6 */, 0xc500 /* -6 */,
488 0xc400 /* -4 */, 0xc200 /* -3 */,
489 0xc000 /* -2 */, 0xbc00 /* -1 */,
490 0, 0x3c00 /* 1 */,
491 0x4000 /* 2 */, 0x4200 /* 3 */,
492 0x4400 /* 4 */, 0x4500 /* 5 */,
493 0x4600 /* 6 */, 0x4700 /* 7 */,
494 0x4800 /* 8 */, 0x4880 /* 9 */,
495 0x4900 /* 10 */, 0x4980 /* 11 */,
496 0x4a00 /* 12 */, 0x4a80 /* 13 */,
497 0x4b00 /* 14 */, 0x04b80 /* 15 */};
498#endif
499PAD(buffer_vld4_pad, float, 16, 8);
500#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100501
502/* Input buffers for vld2_lane */
503VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2);
504VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2);
505VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2);
506VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2);
507VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2);
508VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2);
509VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
510VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
Christophe Lyon80902f62013-03-29 16:26:42 +0100511VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
512VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100513VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
Christophe Lyond98beba2016-08-24 18:02:41 +0200514#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200515#ifdef __ARMCC_VERSION
516__fp16 buffer_vld2_lane_float16x2[2] = {-16, -15};
517#else
518VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2) [] = {0xcc00 /* -16 */,
519 0xcb80 /* -15 */};
520#endif
521#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100522
523/* Input buffers for vld3_lane */
524VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3);
525VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3);
526VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3);
527VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3);
528VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3);
529VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3);
530VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
531VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
Christophe Lyon80902f62013-03-29 16:26:42 +0100532VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
533VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100534VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
Christophe Lyond98beba2016-08-24 18:02:41 +0200535#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200536#ifdef __ARMCC_VERSION
537__fp16 buffer_vld3_lane_float16x3[3] = {-16, -15, -14};
538#else
539VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3) [] = {0xcc00 /* -16 */,
540 0xcb80 /* -15 */,
541 0xcb00 /* -14 */};
542#endif
543#endif
Christophe Lyon01af0a52013-01-17 17:23:11 +0100544
545/* Input buffers for vld4_lane */
546VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4);
547VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4);
548VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4);
549VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4);
550VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4);
551VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4);
552VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
553VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
Christophe Lyon80902f62013-03-29 16:26:42 +0100554VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
555VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
Christophe Lyon01af0a52013-01-17 17:23:11 +0100556VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
Christophe Lyond98beba2016-08-24 18:02:41 +0200557#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
Christophe Lyon34adaf62013-04-11 15:05:18 +0200558#ifdef __ARMCC_VERSION
559__fp16 buffer_vld4_lane_float16x4[4] = {-16, -15, -14, -13};
560#else
561VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4) [] = {0xcc00 /* -16 */,
562 0xcb80 /* -15 */,
563 0xcb00 /* -14 */,
564 0xca80 /* -13 */};
565#endif
566#endif