/*---------------------------------------------------------------*/
/*--- begin host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}

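/* Illustrative self-test sketch (hypothetical helper, not part of the
   original file, hence guarded out): packing with mk32x2 and
   re-selecting the lanes must round-trip. */
#if 0
static void example_mk32x2 ( void )
{
   ULong v = mk32x2(0xDEADBEEFU, 0xCAFEBABEU);
   vassert(sel32x2_1(v) == 0xDEADBEEFU);
   vassert(sel32x2_0(v) == 0xCAFEBABEU);
}
#endif
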
/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

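/* Illustrative sketch (hypothetical, guarded out): index8x8 treats the
   64-bit value as a little-endian array of 8 bytes, so index 0 is the
   least significant byte, and only the low 3 bits of the index are
   used. */
#if 0
static void example_index8x8 ( void )
{
   ULong v = 0x0706050403020100ULL;   /* byte i holds the value i */
   vassert(index8x8(v, 0) == 0x00);
   vassert(index8x8(v, 7) == 0x07);
   vassert(index8x8(v, 9) == 0x01);   /* 9 & 7 == 1 */
}
#endif
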
/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

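/* Worked example (illustrative, guarded out): saturating adds clamp to
   the lane's representable range instead of wrapping around. */
#if 0
static void example_qadd ( void )
{
   vassert(qadd16S(30000, 10000)   == 32767);   /* clamps, not -25536 */
   vassert(qadd16S(-30000, -10000) == -32768);
   vassert(qadd8U(200, 100)        == 255);     /* clamps, not 44 */
}
#endif
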
static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

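/* Worked example (illustrative, guarded out): the mulhi helpers return
   the high 16 bits of the full 32-bit product. */
#if 0
static void example_mulhi ( void )
{
   /* 0x4000 * 0x4000 == 0x10000000; the high half is 0x1000. */
   vassert(mulhi16S(0x4000, 0x4000) == 0x1000);
   vassert(mulhi16U(0xFFFF, 0xFFFF) == 0xFFFE);  /* 0xFFFE0001 >> 16 */
}
#endif
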
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

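/* Worked example (illustrative, guarded out): the qnarrow helpers
   saturate while narrowing, and the signed-to-unsigned variant clamps
   negative inputs to zero; the plain narrow helpers just truncate. */
#if 0
static void example_narrowing ( void )
{
   vassert(qnarrow32Sto16S(0x00012345) == 0x7FFF);  /* too big: clamp */
   vassert(qnarrow16Sto8U(0xFF80) == 0);            /* -128: clamp to 0 */
   vassert(narrow32to16(0x00012345) == 0x2345);     /* plain truncation */
}
#endif
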
/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

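/* Worked example (illustrative, guarded out): the averaging helpers
   compute (x + y + 1) >> 1, i.e. halves round upwards, matching the
   rounding rule of x86 PAVGB/PAVGW. */
#if 0
static void example_avg ( void )
{
   vassert(avg8U(1, 2) == 2);      /* (1+2+1)>>1 */
   vassert(avg8U(0, 0) == 0);
   vassert(avg16U(0xFFFF, 0xFFFF) == 0xFFFF);
}
#endif
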
static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

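/* Worked example (illustrative, guarded out): each byte of bb picks a
   byte of aa, so a descending index vector reverses the bytes of aa. */
#if 0
static void example_perm8x8 ( void )
{
   ULong aa = 0x1122334455667788ULL;
   ULong bb = 0x0001020304050607ULL;  /* lane i selects aa byte 7-i */
   vassert(h_generic_calc_Perm8x8(aa, bb) == 0x8877665544332211ULL);
}
#endif
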
/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

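/* Worked example (illustrative, guarded out): lane-wise shifts do not
   leak bits between lanes, unlike a plain 64-bit shift. */
#if 0
static void example_shln16x4 ( void )
{
   /* Each 16-bit lane is shifted independently; the top bit of a lane
      is discarded rather than carried into its neighbour. */
   vassert(h_generic_calc_ShlN16x4(0x0001000100010001ULL, 4)
           == 0x0010001000100010ULL);
   vassert(h_generic_calc_ShlN16x4(0x8000800080008000ULL, 1) == 0);
}
#endif
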
ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}

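/* Worked example (illustrative, guarded out): bit i of the result is
   the most significant bit of byte i, in the style of x86 PMOVMSKB. */
#if 0
static void example_getmsbs8x8 ( void )
{
   vassert(h_generic_calc_GetMSBs8x8(0xFF00FF00FF00FF00ULL) == 0xAA);
   vassert(h_generic_calc_GetMSBs8x8(0x0000000000000080ULL) == 0x01);
}
#endif
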
/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

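/* Worked example (illustrative, guarded out): the sum of absolute
   byte-wise differences over the four lanes. */
#if 0
static void example_sad8ux4 ( void )
{
   /* |0x10-0x00| + |0x20-0x30| + |0x30-0x20| + |0x40-0x10|
      == 0x10 + 0x10 + 0x10 + 0x30 == 0x60 */
   vassert(h_generic_calc_Sad8Ux4(0x10203040, 0x00302010) == 0x60);
}
#endif
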
UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 10-bit Densely Packed Decimal (DPD) chunk to three
      4-bit BCD digits. */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3-digit BCD value to a 10-bit Densely Packed Decimal
      (DPD) value.  The boolean equations for each of the DPD bits are
      given in Appendix B of Book 1: Power ISA User Instruction Set.
      The bits of the BCD value are [abcdefghijkm]; the bits of the DPD
      number are [pqrstuvwxy].  The boolean logic equations, in pseudo
      C code, are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

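/* Worked example (illustrative, guarded out): three small BCD digits
   (all less than 8) encode as "aaa bbb 0 ccc" in DPD, so BCD 0x123
   becomes DPD 0x0A3; the two conversions are mutual inverses. */
#if 0
static void example_dpd ( void )
{
   vassert(bcd_to_dpb(0x123) == 0x0A3);
   vassert(dpb_to_bcd(0x0A3) == 0x123);
}
#endif
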
ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT


/* ----------------------------------------------------- */
/* Signed and unsigned integer division that behaves like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

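/* Worked example (illustrative, guarded out): the two edge cases that
   would trap or be undefined in plain C get defined, ARM-style
   results here. */
#if 0
static void example_arm_div ( void )
{
   vassert(h_calc_sdiv32_w_arm_semantics(10, 0) == 0);
   vassert(h_calc_sdiv32_w_arm_semantics((Int)0x80000000, -1)
           == (Int)0x80000000);
   vassert(h_calc_udiv32_w_arm_semantics(7, 2) == 3);
}
#endif
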

/*---------------------------------------------------------------*/
/*--- end host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/