blob: fdc9eed54e8cc16ff0c24f4f8489a7acfaf0cf17 [file] [log] [blame]
sewardj38a3f862005-01-13 15:06:51 +00001
2/*---------------------------------------------------------------*/
sewardj752f9062010-05-03 21:38:49 +00003/*--- begin host_generic_simd64.c ---*/
sewardj38a3f862005-01-13 15:06:51 +00004/*---------------------------------------------------------------*/
5
6/*
sewardj752f9062010-05-03 21:38:49 +00007 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
sewardj38a3f862005-01-13 15:06:51 +00009
sewardj25e54732012-08-05 15:36:51 +000010 Copyright (C) 2004-2012 OpenWorks LLP
sewardj752f9062010-05-03 21:38:49 +000011 info@open-works.net
sewardj38a3f862005-01-13 15:06:51 +000012
sewardj752f9062010-05-03 21:38:49 +000013 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
sewardj38a3f862005-01-13 15:06:51 +000017
sewardj752f9062010-05-03 21:38:49 +000018 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
sewardj7bd6ffe2005-08-03 16:07:36 +000026 02110-1301, USA.
27
sewardj752f9062010-05-03 21:38:49 +000028 The GNU General Public License is contained in the file COPYING.
sewardj38a3f862005-01-13 15:06:51 +000029
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
sewardj38a3f862005-01-13 15:06:51 +000034*/
35
36/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
37 where the instruction selectors cannot generate code in-line.
38 These are purely back-end entities and cannot be seen/referenced
39 from IR. */
40
41#include "libvex_basictypes.h"
sewardjcef7d3e2009-07-02 12:21:59 +000042#include "host_generic_simd64.h"
sewardj38a3f862005-01-13 15:06:51 +000043
44
45
46/* Tuple/select functions for 32x2 vectors. */
47
48static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
49 return (((ULong)w1) << 32) | ((ULong)w0);
50}
51
52static inline UInt sel32x2_1 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +000053 return 0xFFFFFFFF & toUInt(w64 >> 32);
sewardj38a3f862005-01-13 15:06:51 +000054}
55static inline UInt sel32x2_0 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +000056 return 0xFFFFFFFF & toUInt(w64);
sewardj38a3f862005-01-13 15:06:51 +000057}
58
59
60/* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless
61 with 64-bit shifts so we give it a hand. */
62
63static inline ULong mk16x4 ( UShort w3, UShort w2,
64 UShort w1, UShort w0 ) {
65 UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
66 UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
67 return mk32x2(hi32, lo32);
68}
69
70static inline UShort sel16x4_3 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +000071 UInt hi32 = toUInt(w64 >> 32);
72 return toUShort(0xFFFF & (hi32 >> 16));
sewardj38a3f862005-01-13 15:06:51 +000073}
74static inline UShort sel16x4_2 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +000075 UInt hi32 = toUInt(w64 >> 32);
76 return toUShort(0xFFFF & hi32);
sewardj38a3f862005-01-13 15:06:51 +000077}
78static inline UShort sel16x4_1 ( ULong w64 ) {
79 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +000080 return toUShort(0xFFFF & (lo32 >> 16));
sewardj38a3f862005-01-13 15:06:51 +000081}
82static inline UShort sel16x4_0 ( ULong w64 ) {
83 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +000084 return toUShort(0xFFFF & lo32);
sewardj38a3f862005-01-13 15:06:51 +000085}
86
87
88/* Tuple/select functions for 8x8 vectors. */
89
90static inline ULong mk8x8 ( UChar w7, UChar w6,
91 UChar w5, UChar w4,
92 UChar w3, UChar w2,
sewardje2ea1762010-09-22 00:56:37 +000093 UChar w1, UChar w0 ) {
sewardj38a3f862005-01-13 15:06:51 +000094 UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
95 | (((UInt)w5) << 8) | (((UInt)w4) << 0);
96 UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
97 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
98 return mk32x2(hi32, lo32);
99}
100
101static inline UChar sel8x8_7 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +0000102 UInt hi32 = toUInt(w64 >> 32);
103 return toUChar(0xFF & (hi32 >> 24));
sewardj38a3f862005-01-13 15:06:51 +0000104}
105static inline UChar sel8x8_6 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +0000106 UInt hi32 = toUInt(w64 >> 32);
107 return toUChar(0xFF & (hi32 >> 16));
sewardj38a3f862005-01-13 15:06:51 +0000108}
109static inline UChar sel8x8_5 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +0000110 UInt hi32 = toUInt(w64 >> 32);
111 return toUChar(0xFF & (hi32 >> 8));
sewardj38a3f862005-01-13 15:06:51 +0000112}
113static inline UChar sel8x8_4 ( ULong w64 ) {
sewardjd19fc162005-02-26 02:16:39 +0000114 UInt hi32 = toUInt(w64 >> 32);
115 return toUChar(0xFF & (hi32 >> 0));
sewardj38a3f862005-01-13 15:06:51 +0000116}
117static inline UChar sel8x8_3 ( ULong w64 ) {
118 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +0000119 return toUChar(0xFF & (lo32 >> 24));
sewardj38a3f862005-01-13 15:06:51 +0000120}
121static inline UChar sel8x8_2 ( ULong w64 ) {
122 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +0000123 return toUChar(0xFF & (lo32 >> 16));
sewardj38a3f862005-01-13 15:06:51 +0000124}
125static inline UChar sel8x8_1 ( ULong w64 ) {
126 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +0000127 return toUChar(0xFF & (lo32 >> 8));
sewardj38a3f862005-01-13 15:06:51 +0000128}
129static inline UChar sel8x8_0 ( ULong w64 ) {
130 UInt lo32 = (UInt)w64;
sewardjd19fc162005-02-26 02:16:39 +0000131 return toUChar(0xFF & (lo32 >> 0));
sewardj38a3f862005-01-13 15:06:51 +0000132}
133
sewardjd166e282008-02-06 11:42:45 +0000134static inline UChar index8x8 ( ULong w64, UChar ix ) {
135 ix &= 7;
136 return toUChar((w64 >> (8*ix)) & 0xFF);
137}
138
sewardj38a3f862005-01-13 15:06:51 +0000139
140/* Scalar helpers. */
141
sewardj44ce46d2012-07-11 13:19:10 +0000142static inline Int qadd32S ( Int xx, Int yy )
143{
144 Long t = ((Long)xx) + ((Long)yy);
145 const Long loLim = -0x80000000LL;
146 const Long hiLim = 0x7FFFFFFFLL;
147 if (t < loLim) t = loLim;
148 if (t > hiLim) t = hiLim;
149 return (Int)t;
150}
151
sewardj38a3f862005-01-13 15:06:51 +0000152static inline Short qadd16S ( Short xx, Short yy )
153{
154 Int t = ((Int)xx) + ((Int)yy);
155 if (t < -32768) t = -32768;
156 if (t > 32767) t = 32767;
157 return (Short)t;
158}
159
160static inline Char qadd8S ( Char xx, Char yy )
161{
162 Int t = ((Int)xx) + ((Int)yy);
163 if (t < -128) t = -128;
164 if (t > 127) t = 127;
165 return (Char)t;
166}
167
168static inline UShort qadd16U ( UShort xx, UShort yy )
169{
170 UInt t = ((UInt)xx) + ((UInt)yy);
171 if (t > 0xFFFF) t = 0xFFFF;
172 return (UShort)t;
173}
174
175static inline UChar qadd8U ( UChar xx, UChar yy )
176{
177 UInt t = ((UInt)xx) + ((UInt)yy);
178 if (t > 0xFF) t = 0xFF;
179 return (UChar)t;
180}
181
sewardj44ce46d2012-07-11 13:19:10 +0000182static inline Int qsub32S ( Int xx, Int yy )
183{
184 Long t = ((Long)xx) - ((Long)yy);
185 const Long loLim = -0x80000000LL;
186 const Long hiLim = 0x7FFFFFFFLL;
187 if (t < loLim) t = loLim;
188 if (t > hiLim) t = hiLim;
189 return (Int)t;
190}
191
sewardj38a3f862005-01-13 15:06:51 +0000192static inline Short qsub16S ( Short xx, Short yy )
193{
194 Int t = ((Int)xx) - ((Int)yy);
195 if (t < -32768) t = -32768;
196 if (t > 32767) t = 32767;
197 return (Short)t;
198}
199
200static inline Char qsub8S ( Char xx, Char yy )
201{
202 Int t = ((Int)xx) - ((Int)yy);
203 if (t < -128) t = -128;
204 if (t > 127) t = 127;
205 return (Char)t;
206}
207
208static inline UShort qsub16U ( UShort xx, UShort yy )
209{
210 Int t = ((Int)xx) - ((Int)yy);
211 if (t < 0) t = 0;
212 if (t > 0xFFFF) t = 0xFFFF;
213 return (UShort)t;
214}
215
216static inline UChar qsub8U ( UChar xx, UChar yy )
217{
218 Int t = ((Int)xx) - ((Int)yy);
219 if (t < 0) t = 0;
220 if (t > 0xFF) t = 0xFF;
221 return (UChar)t;
222}
223
224static inline Short mul16 ( Short xx, Short yy )
225{
226 Int t = ((Int)xx) * ((Int)yy);
227 return (Short)t;
228}
229
sewardjd166e282008-02-06 11:42:45 +0000230static inline Int mul32 ( Int xx, Int yy )
231{
232 Int t = ((Int)xx) * ((Int)yy);
233 return (Int)t;
234}
235
sewardj38a3f862005-01-13 15:06:51 +0000236static inline Short mulhi16S ( Short xx, Short yy )
237{
238 Int t = ((Int)xx) * ((Int)yy);
239 t >>=/*s*/ 16;
240 return (Short)t;
241}
242
243static inline UShort mulhi16U ( UShort xx, UShort yy )
244{
245 UInt t = ((UInt)xx) * ((UInt)yy);
246 t >>=/*u*/ 16;
247 return (UShort)t;
248}
249
250static inline UInt cmpeq32 ( UInt xx, UInt yy )
251{
252 return xx==yy ? 0xFFFFFFFF : 0;
253}
254
255static inline UShort cmpeq16 ( UShort xx, UShort yy )
256{
sewardjd19fc162005-02-26 02:16:39 +0000257 return toUShort(xx==yy ? 0xFFFF : 0);
sewardj38a3f862005-01-13 15:06:51 +0000258}
259
260static inline UChar cmpeq8 ( UChar xx, UChar yy )
261{
sewardjd19fc162005-02-26 02:16:39 +0000262 return toUChar(xx==yy ? 0xFF : 0);
sewardj38a3f862005-01-13 15:06:51 +0000263}
264
265static inline UInt cmpgt32S ( Int xx, Int yy )
266{
267 return xx>yy ? 0xFFFFFFFF : 0;
268}
269
270static inline UShort cmpgt16S ( Short xx, Short yy )
271{
sewardjd19fc162005-02-26 02:16:39 +0000272 return toUShort(xx>yy ? 0xFFFF : 0);
sewardj38a3f862005-01-13 15:06:51 +0000273}
274
275static inline UChar cmpgt8S ( Char xx, Char yy )
276{
sewardjd19fc162005-02-26 02:16:39 +0000277 return toUChar(xx>yy ? 0xFF : 0);
sewardj38a3f862005-01-13 15:06:51 +0000278}
279
sewardj18069182005-01-13 19:16:04 +0000280static inline UInt cmpnez32 ( UInt xx )
281{
282 return xx==0 ? 0 : 0xFFFFFFFF;
283}
284
285static inline UShort cmpnez16 ( UShort xx )
286{
sewardjd19fc162005-02-26 02:16:39 +0000287 return toUShort(xx==0 ? 0 : 0xFFFF);
sewardj18069182005-01-13 19:16:04 +0000288}
289
290static inline UChar cmpnez8 ( UChar xx )
291{
sewardjd19fc162005-02-26 02:16:39 +0000292 return toUChar(xx==0 ? 0 : 0xFF);
sewardj18069182005-01-13 19:16:04 +0000293}
294
sewardjc9bff7d2011-06-15 15:09:37 +0000295static inline Short qnarrow32Sto16S ( UInt xx0 )
sewardj38a3f862005-01-13 15:06:51 +0000296{
297 Int xx = (Int)xx0;
298 if (xx < -32768) xx = -32768;
299 if (xx > 32767) xx = 32767;
300 return (Short)xx;
301}
302
sewardjc9bff7d2011-06-15 15:09:37 +0000303static inline Char qnarrow16Sto8S ( UShort xx0 )
sewardj38a3f862005-01-13 15:06:51 +0000304{
305 Short xx = (Short)xx0;
306 if (xx < -128) xx = -128;
307 if (xx > 127) xx = 127;
308 return (Char)xx;
309}
310
sewardjc9bff7d2011-06-15 15:09:37 +0000311static inline UChar qnarrow16Sto8U ( UShort xx0 )
sewardj38a3f862005-01-13 15:06:51 +0000312{
313 Short xx = (Short)xx0;
314 if (xx < 0) xx = 0;
315 if (xx > 255) xx = 255;
316 return (UChar)xx;
317}
318
sewardjad2c9ea2011-10-22 09:32:16 +0000319static inline UShort narrow32to16 ( UInt xx )
320{
321 return (UShort)xx;
322}
323
324static inline UChar narrow16to8 ( UShort xx )
325{
326 return (UChar)xx;
327}
328
sewardj38a3f862005-01-13 15:06:51 +0000329/* shifts: we don't care about out-of-range ones, since
330 that is dealt with at a higher level. */
331
sewardjd166e282008-02-06 11:42:45 +0000332static inline UChar shl8 ( UChar v, UInt n )
333{
334 return toUChar(v << n);
335}
336
sewardjd71ba832006-12-27 01:15:29 +0000337static inline UChar sar8 ( UChar v, UInt n )
338{
339 return toUChar(((Char)v) >> n);
340}
341
sewardj38a3f862005-01-13 15:06:51 +0000342static inline UShort shl16 ( UShort v, UInt n )
343{
sewardjd19fc162005-02-26 02:16:39 +0000344 return toUShort(v << n);
sewardj38a3f862005-01-13 15:06:51 +0000345}
346
347static inline UShort shr16 ( UShort v, UInt n )
348{
sewardjd19fc162005-02-26 02:16:39 +0000349 return toUShort((((UShort)v) >> n));
sewardj38a3f862005-01-13 15:06:51 +0000350}
351
352static inline UShort sar16 ( UShort v, UInt n )
353{
sewardjd19fc162005-02-26 02:16:39 +0000354 return toUShort(((Short)v) >> n);
sewardj38a3f862005-01-13 15:06:51 +0000355}
356
357static inline UInt shl32 ( UInt v, UInt n )
358{
359 return v << n;
360}
361
362static inline UInt shr32 ( UInt v, UInt n )
363{
364 return (((UInt)v) >> n);
365}
366
367static inline UInt sar32 ( UInt v, UInt n )
368{
369 return ((Int)v) >> n;
370}
371
372static inline UChar avg8U ( UChar xx, UChar yy )
373{
374 UInt xxi = (UInt)xx;
375 UInt yyi = (UInt)yy;
376 UInt r = (xxi + yyi + 1) >> 1;
377 return (UChar)r;
378}
379
380static inline UShort avg16U ( UShort xx, UShort yy )
381{
382 UInt xxi = (UInt)xx;
383 UInt yyi = (UInt)yy;
384 UInt r = (xxi + yyi + 1) >> 1;
385 return (UShort)r;
386}
387
388static inline Short max16S ( Short xx, Short yy )
389{
sewardjd19fc162005-02-26 02:16:39 +0000390 return toUShort((xx > yy) ? xx : yy);
sewardj38a3f862005-01-13 15:06:51 +0000391}
392
393static inline UChar max8U ( UChar xx, UChar yy )
394{
sewardjd19fc162005-02-26 02:16:39 +0000395 return toUChar((xx > yy) ? xx : yy);
sewardj38a3f862005-01-13 15:06:51 +0000396}
397
398static inline Short min16S ( Short xx, Short yy )
399{
sewardjd19fc162005-02-26 02:16:39 +0000400 return toUShort((xx < yy) ? xx : yy);
sewardj38a3f862005-01-13 15:06:51 +0000401}
402
403static inline UChar min8U ( UChar xx, UChar yy )
404{
sewardjd19fc162005-02-26 02:16:39 +0000405 return toUChar((xx < yy) ? xx : yy);
sewardj38a3f862005-01-13 15:06:51 +0000406}
407
sewardje2ea1762010-09-22 00:56:37 +0000408static inline UShort hadd16U ( UShort xx, UShort yy )
409{
410 UInt xxi = (UInt)xx;
411 UInt yyi = (UInt)yy;
412 UInt r = (xxi + yyi) >> 1;
413 return (UShort)r;
414}
415
416static inline Short hadd16S ( Short xx, Short yy )
417{
418 Int xxi = (Int)xx;
419 Int yyi = (Int)yy;
420 Int r = (xxi + yyi) >> 1;
421 return (Short)r;
422}
423
424static inline UShort hsub16U ( UShort xx, UShort yy )
425{
426 UInt xxi = (UInt)xx;
427 UInt yyi = (UInt)yy;
428 UInt r = (xxi - yyi) >> 1;
429 return (UShort)r;
430}
431
432static inline Short hsub16S ( Short xx, Short yy )
433{
434 Int xxi = (Int)xx;
435 Int yyi = (Int)yy;
436 Int r = (xxi - yyi) >> 1;
437 return (Short)r;
438}
439
440static inline UChar hadd8U ( UChar xx, UChar yy )
441{
442 UInt xxi = (UInt)xx;
443 UInt yyi = (UInt)yy;
444 UInt r = (xxi + yyi) >> 1;
445 return (UChar)r;
446}
447
448static inline Char hadd8S ( Char xx, Char yy )
449{
450 Int xxi = (Int)xx;
451 Int yyi = (Int)yy;
452 Int r = (xxi + yyi) >> 1;
453 return (Char)r;
454}
455
456static inline UChar hsub8U ( UChar xx, UChar yy )
457{
458 UInt xxi = (UInt)xx;
459 UInt yyi = (UInt)yy;
460 UInt r = (xxi - yyi) >> 1;
461 return (UChar)r;
462}
463
464static inline Char hsub8S ( Char xx, Char yy )
465{
466 Int xxi = (Int)xx;
467 Int yyi = (Int)yy;
468 Int r = (xxi - yyi) >> 1;
469 return (Char)r;
470}
471
sewardj310d6b22010-10-18 16:29:40 +0000472static inline UInt absdiff8U ( UChar xx, UChar yy )
473{
474 UInt xxu = (UChar)xx;
475 UInt yyu = (UChar)yy;
476 return xxu >= yyu ? xxu - yyu : yyu - xxu;
477}
sewardje2ea1762010-09-22 00:56:37 +0000478
sewardj38a3f862005-01-13 15:06:51 +0000479/* ----------------------------------------------------- */
480/* Start of the externally visible functions. These simply
481 implement the corresponding IR primops. */
482/* ----------------------------------------------------- */
483
484/* ------------ Normal addition ------------ */
485
486ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
487{
488 return mk32x2(
489 sel32x2_1(xx) + sel32x2_1(yy),
490 sel32x2_0(xx) + sel32x2_0(yy)
491 );
492}
493
494ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
495{
496 return mk16x4(
sewardjd19fc162005-02-26 02:16:39 +0000497 toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
498 toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
499 toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
500 toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
sewardj38a3f862005-01-13 15:06:51 +0000501 );
502}
503
504ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
505{
506 return mk8x8(
sewardjd19fc162005-02-26 02:16:39 +0000507 toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
508 toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
509 toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
510 toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
511 toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
512 toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
513 toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
514 toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
sewardj38a3f862005-01-13 15:06:51 +0000515 );
516}
517
518/* ------------ Saturating addition ------------ */
519
520ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
521{
522 return mk16x4(
523 qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
524 qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
525 qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
526 qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
527 );
528}
529
530ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
531{
532 return mk8x8(
533 qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
534 qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
535 qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
536 qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
537 qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
538 qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
539 qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
540 qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
541 );
542}
543
544ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
545{
546 return mk16x4(
547 qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
548 qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
549 qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
550 qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
551 );
552}
553
554ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
555{
556 return mk8x8(
557 qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
558 qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
559 qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
560 qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
561 qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
562 qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
563 qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
564 qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
565 );
566}
567
568/* ------------ Normal subtraction ------------ */
569
570ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
571{
572 return mk32x2(
573 sel32x2_1(xx) - sel32x2_1(yy),
574 sel32x2_0(xx) - sel32x2_0(yy)
575 );
576}
577
578ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
579{
580 return mk16x4(
sewardjd19fc162005-02-26 02:16:39 +0000581 toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
582 toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
583 toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
584 toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
sewardj38a3f862005-01-13 15:06:51 +0000585 );
586}
587
588ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
589{
590 return mk8x8(
sewardjd19fc162005-02-26 02:16:39 +0000591 toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
592 toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
593 toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
594 toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
595 toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
596 toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
597 toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
598 toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
sewardj38a3f862005-01-13 15:06:51 +0000599 );
600}
601
602/* ------------ Saturating subtraction ------------ */
603
604ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
605{
606 return mk16x4(
607 qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
608 qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
609 qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
610 qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
611 );
612}
613
614ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
615{
616 return mk8x8(
617 qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
618 qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
619 qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
620 qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
621 qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
622 qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
623 qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
624 qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
625 );
626}
627
628ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
629{
630 return mk16x4(
631 qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
632 qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
633 qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
634 qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
635 );
636}
637
638ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
639{
640 return mk8x8(
641 qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
642 qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
643 qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
644 qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
645 qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
646 qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
647 qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
648 qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
649 );
650}
651
652/* ------------ Multiplication ------------ */
653
654ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
655{
656 return mk16x4(
657 mul16( sel16x4_3(xx), sel16x4_3(yy) ),
658 mul16( sel16x4_2(xx), sel16x4_2(yy) ),
659 mul16( sel16x4_1(xx), sel16x4_1(yy) ),
660 mul16( sel16x4_0(xx), sel16x4_0(yy) )
661 );
662}
663
sewardjd166e282008-02-06 11:42:45 +0000664ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
665{
666 return mk32x2(
667 mul32( sel32x2_1(xx), sel32x2_1(yy) ),
668 mul32( sel32x2_0(xx), sel32x2_0(yy) )
669 );
670}
671
sewardj38a3f862005-01-13 15:06:51 +0000672ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
673{
674 return mk16x4(
675 mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
676 mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
677 mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
678 mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
679 );
680}
681
682ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
683{
684 return mk16x4(
685 mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
686 mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
687 mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
688 mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
689 );
690}
691
692/* ------------ Comparison ------------ */
693
694ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
695{
696 return mk32x2(
697 cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
698 cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
699 );
700}
701
702ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
703{
704 return mk16x4(
705 cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
706 cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
707 cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
708 cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
709 );
710}
711
712ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
713{
714 return mk8x8(
715 cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
716 cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
717 cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
718 cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
719 cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
720 cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
721 cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
722 cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
723 );
724}
725
726ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
727{
728 return mk32x2(
729 cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
730 cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
731 );
732}
733
734ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
735{
736 return mk16x4(
737 cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
738 cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
739 cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
740 cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
741 );
742}
743
744ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
745{
746 return mk8x8(
747 cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
748 cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
749 cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
750 cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
751 cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
752 cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
753 cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
754 cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
755 );
756}
757
sewardj18069182005-01-13 19:16:04 +0000758ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
759{
760 return mk32x2(
761 cmpnez32( sel32x2_1(xx) ),
762 cmpnez32( sel32x2_0(xx) )
763 );
764}
765
766ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
767{
768 return mk16x4(
769 cmpnez16( sel16x4_3(xx) ),
770 cmpnez16( sel16x4_2(xx) ),
771 cmpnez16( sel16x4_1(xx) ),
772 cmpnez16( sel16x4_0(xx) )
773 );
774}
775
776ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
777{
778 return mk8x8(
779 cmpnez8( sel8x8_7(xx) ),
780 cmpnez8( sel8x8_6(xx) ),
781 cmpnez8( sel8x8_5(xx) ),
782 cmpnez8( sel8x8_4(xx) ),
783 cmpnez8( sel8x8_3(xx) ),
784 cmpnez8( sel8x8_2(xx) ),
785 cmpnez8( sel8x8_1(xx) ),
786 cmpnez8( sel8x8_0(xx) )
787 );
788}
789
sewardj38a3f862005-01-13 15:06:51 +0000790/* ------------ Saturating narrowing ------------ */
791
sewardj5f438dd2011-06-16 11:36:23 +0000792ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
sewardj38a3f862005-01-13 15:06:51 +0000793{
794 UInt d = sel32x2_1(aa);
795 UInt c = sel32x2_0(aa);
796 UInt b = sel32x2_1(bb);
797 UInt a = sel32x2_0(bb);
798 return mk16x4(
sewardjc9bff7d2011-06-15 15:09:37 +0000799 qnarrow32Sto16S(d),
800 qnarrow32Sto16S(c),
801 qnarrow32Sto16S(b),
802 qnarrow32Sto16S(a)
sewardj38a3f862005-01-13 15:06:51 +0000803 );
804}
805
sewardj5f438dd2011-06-16 11:36:23 +0000806ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
sewardj38a3f862005-01-13 15:06:51 +0000807{
808 UShort h = sel16x4_3(aa);
809 UShort g = sel16x4_2(aa);
810 UShort f = sel16x4_1(aa);
811 UShort e = sel16x4_0(aa);
812 UShort d = sel16x4_3(bb);
813 UShort c = sel16x4_2(bb);
814 UShort b = sel16x4_1(bb);
815 UShort a = sel16x4_0(bb);
816 return mk8x8(
sewardjc9bff7d2011-06-15 15:09:37 +0000817 qnarrow16Sto8S(h),
818 qnarrow16Sto8S(g),
819 qnarrow16Sto8S(f),
820 qnarrow16Sto8S(e),
821 qnarrow16Sto8S(d),
822 qnarrow16Sto8S(c),
823 qnarrow16Sto8S(b),
824 qnarrow16Sto8S(a)
sewardj38a3f862005-01-13 15:06:51 +0000825 );
826}
827
sewardj5f438dd2011-06-16 11:36:23 +0000828ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
sewardj38a3f862005-01-13 15:06:51 +0000829{
830 UShort h = sel16x4_3(aa);
831 UShort g = sel16x4_2(aa);
832 UShort f = sel16x4_1(aa);
833 UShort e = sel16x4_0(aa);
834 UShort d = sel16x4_3(bb);
835 UShort c = sel16x4_2(bb);
836 UShort b = sel16x4_1(bb);
837 UShort a = sel16x4_0(bb);
838 return mk8x8(
sewardjc9bff7d2011-06-15 15:09:37 +0000839 qnarrow16Sto8U(h),
840 qnarrow16Sto8U(g),
841 qnarrow16Sto8U(f),
842 qnarrow16Sto8U(e),
843 qnarrow16Sto8U(d),
844 qnarrow16Sto8U(c),
845 qnarrow16Sto8U(b),
846 qnarrow16Sto8U(a)
sewardj38a3f862005-01-13 15:06:51 +0000847 );
848}
849
sewardjad2c9ea2011-10-22 09:32:16 +0000850/* ------------ Truncating narrowing ------------ */
851
852ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
853{
854 UInt d = sel32x2_1(aa);
855 UInt c = sel32x2_0(aa);
856 UInt b = sel32x2_1(bb);
857 UInt a = sel32x2_0(bb);
858 return mk16x4(
859 narrow32to16(d),
860 narrow32to16(c),
861 narrow32to16(b),
862 narrow32to16(a)
863 );
864}
865
866ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
867{
868 UShort h = sel16x4_3(aa);
869 UShort g = sel16x4_2(aa);
870 UShort f = sel16x4_1(aa);
871 UShort e = sel16x4_0(aa);
872 UShort d = sel16x4_3(bb);
873 UShort c = sel16x4_2(bb);
874 UShort b = sel16x4_1(bb);
875 UShort a = sel16x4_0(bb);
876 return mk8x8(
877 narrow16to8(h),
878 narrow16to8(g),
879 narrow16to8(f),
880 narrow16to8(e),
881 narrow16to8(d),
882 narrow16to8(c),
883 narrow16to8(b),
884 narrow16to8(a)
885 );
886}
887
sewardj38a3f862005-01-13 15:06:51 +0000888/* ------------ Interleaving ------------ */
889
890ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
891{
892 return mk8x8(
893 sel8x8_7(aa),
894 sel8x8_7(bb),
895 sel8x8_6(aa),
896 sel8x8_6(bb),
897 sel8x8_5(aa),
898 sel8x8_5(bb),
899 sel8x8_4(aa),
900 sel8x8_4(bb)
901 );
902}
903
904ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
905{
906 return mk8x8(
907 sel8x8_3(aa),
908 sel8x8_3(bb),
909 sel8x8_2(aa),
910 sel8x8_2(bb),
911 sel8x8_1(aa),
912 sel8x8_1(bb),
913 sel8x8_0(aa),
914 sel8x8_0(bb)
915 );
916}
917
918ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
919{
920 return mk16x4(
921 sel16x4_3(aa),
922 sel16x4_3(bb),
923 sel16x4_2(aa),
924 sel16x4_2(bb)
925 );
926}
927
928ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
929{
930 return mk16x4(
931 sel16x4_1(aa),
932 sel16x4_1(bb),
933 sel16x4_0(aa),
934 sel16x4_0(bb)
935 );
936}
937
938ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
939{
940 return mk32x2(
941 sel32x2_1(aa),
942 sel32x2_1(bb)
943 );
944}
945
946ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
947{
948 return mk32x2(
949 sel32x2_0(aa),
950 sel32x2_0(bb)
951 );
952}
953
sewardjd166e282008-02-06 11:42:45 +0000954/* ------------ Concatenation ------------ */
955
956ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
957{
958 return mk16x4(
959 sel16x4_3(aa),
960 sel16x4_1(aa),
961 sel16x4_3(bb),
962 sel16x4_1(bb)
963 );
964}
965
966ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
967{
968 return mk16x4(
969 sel16x4_2(aa),
970 sel16x4_0(aa),
971 sel16x4_2(bb),
972 sel16x4_0(bb)
973 );
974}
975
976/* misc hack looking for a proper home */
977ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
978{
979 return mk8x8(
980 index8x8(aa, sel8x8_7(bb)),
981 index8x8(aa, sel8x8_6(bb)),
982 index8x8(aa, sel8x8_5(bb)),
983 index8x8(aa, sel8x8_4(bb)),
984 index8x8(aa, sel8x8_3(bb)),
985 index8x8(aa, sel8x8_2(bb)),
986 index8x8(aa, sel8x8_1(bb)),
987 index8x8(aa, sel8x8_0(bb))
988 );
989}
sewardj38a3f862005-01-13 15:06:51 +0000990
991/* ------------ Shifting ------------ */
992/* Note that because these primops are undefined if the shift amount
993 equals or exceeds the lane width, the shift amount is masked so
994 that the scalar shifts are always in range. In fact, given the
995 semantics of these primops (ShlN16x4, etc) it is an error if in
996 fact we are ever given an out-of-range shift amount.
997*/
998ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
999{
1000 /* vassert(nn < 32); */
1001 nn &= 31;
1002 return mk32x2(
1003 shl32( sel32x2_1(xx), nn ),
1004 shl32( sel32x2_0(xx), nn )
1005 );
1006}
1007
1008ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
1009{
1010 /* vassert(nn < 16); */
1011 nn &= 15;
1012 return mk16x4(
1013 shl16( sel16x4_3(xx), nn ),
1014 shl16( sel16x4_2(xx), nn ),
1015 shl16( sel16x4_1(xx), nn ),
1016 shl16( sel16x4_0(xx), nn )
1017 );
1018}
1019
sewardjd166e282008-02-06 11:42:45 +00001020ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
1021{
1022 /* vassert(nn < 8); */
1023 nn &= 7;
1024 return mk8x8(
1025 shl8( sel8x8_7(xx), nn ),
1026 shl8( sel8x8_6(xx), nn ),
1027 shl8( sel8x8_5(xx), nn ),
1028 shl8( sel8x8_4(xx), nn ),
1029 shl8( sel8x8_3(xx), nn ),
1030 shl8( sel8x8_2(xx), nn ),
1031 shl8( sel8x8_1(xx), nn ),
1032 shl8( sel8x8_0(xx), nn )
1033 );
1034}
1035
sewardj38a3f862005-01-13 15:06:51 +00001036ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
1037{
1038 /* vassert(nn < 32); */
1039 nn &= 31;
1040 return mk32x2(
1041 shr32( sel32x2_1(xx), nn ),
1042 shr32( sel32x2_0(xx), nn )
1043 );
1044}
1045
1046ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
1047{
1048 /* vassert(nn < 16); */
1049 nn &= 15;
1050 return mk16x4(
1051 shr16( sel16x4_3(xx), nn ),
1052 shr16( sel16x4_2(xx), nn ),
1053 shr16( sel16x4_1(xx), nn ),
1054 shr16( sel16x4_0(xx), nn )
1055 );
1056}
1057
1058ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
1059{
1060 /* vassert(nn < 32); */
1061 nn &= 31;
1062 return mk32x2(
1063 sar32( sel32x2_1(xx), nn ),
1064 sar32( sel32x2_0(xx), nn )
1065 );
1066}
1067
1068ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
1069{
1070 /* vassert(nn < 16); */
1071 nn &= 15;
1072 return mk16x4(
1073 sar16( sel16x4_3(xx), nn ),
1074 sar16( sel16x4_2(xx), nn ),
1075 sar16( sel16x4_1(xx), nn ),
1076 sar16( sel16x4_0(xx), nn )
1077 );
1078}
1079
sewardjd71ba832006-12-27 01:15:29 +00001080ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
1081{
1082 /* vassert(nn < 8); */
1083 nn &= 7;
1084 return mk8x8(
1085 sar8( sel8x8_7(xx), nn ),
1086 sar8( sel8x8_6(xx), nn ),
1087 sar8( sel8x8_5(xx), nn ),
1088 sar8( sel8x8_4(xx), nn ),
1089 sar8( sel8x8_3(xx), nn ),
1090 sar8( sel8x8_2(xx), nn ),
1091 sar8( sel8x8_1(xx), nn ),
1092 sar8( sel8x8_0(xx), nn )
1093 );
1094}
1095
sewardj38a3f862005-01-13 15:06:51 +00001096/* ------------ Averaging ------------ */
1097
1098ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
1099{
1100 return mk8x8(
1101 avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
1102 avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
1103 avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
1104 avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
1105 avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
1106 avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
1107 avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
1108 avg8U( sel8x8_0(xx), sel8x8_0(yy) )
1109 );
1110}
1111
1112ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
1113{
1114 return mk16x4(
1115 avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
1116 avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
1117 avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
1118 avg16U( sel16x4_0(xx), sel16x4_0(yy) )
1119 );
1120}
1121
1122/* ------------ max/min ------------ */
1123
1124ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
1125{
1126 return mk16x4(
1127 max16S( sel16x4_3(xx), sel16x4_3(yy) ),
1128 max16S( sel16x4_2(xx), sel16x4_2(yy) ),
1129 max16S( sel16x4_1(xx), sel16x4_1(yy) ),
1130 max16S( sel16x4_0(xx), sel16x4_0(yy) )
1131 );
1132}
1133
1134ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
1135{
1136 return mk8x8(
1137 max8U( sel8x8_7(xx), sel8x8_7(yy) ),
1138 max8U( sel8x8_6(xx), sel8x8_6(yy) ),
1139 max8U( sel8x8_5(xx), sel8x8_5(yy) ),
1140 max8U( sel8x8_4(xx), sel8x8_4(yy) ),
1141 max8U( sel8x8_3(xx), sel8x8_3(yy) ),
1142 max8U( sel8x8_2(xx), sel8x8_2(yy) ),
1143 max8U( sel8x8_1(xx), sel8x8_1(yy) ),
1144 max8U( sel8x8_0(xx), sel8x8_0(yy) )
1145 );
1146}
1147
1148ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
1149{
1150 return mk16x4(
1151 min16S( sel16x4_3(xx), sel16x4_3(yy) ),
1152 min16S( sel16x4_2(xx), sel16x4_2(yy) ),
1153 min16S( sel16x4_1(xx), sel16x4_1(yy) ),
1154 min16S( sel16x4_0(xx), sel16x4_0(yy) )
1155 );
1156}
1157
1158ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
1159{
1160 return mk8x8(
1161 min8U( sel8x8_7(xx), sel8x8_7(yy) ),
1162 min8U( sel8x8_6(xx), sel8x8_6(yy) ),
1163 min8U( sel8x8_5(xx), sel8x8_5(yy) ),
1164 min8U( sel8x8_4(xx), sel8x8_4(yy) ),
1165 min8U( sel8x8_3(xx), sel8x8_3(yy) ),
1166 min8U( sel8x8_2(xx), sel8x8_2(yy) ),
1167 min8U( sel8x8_1(xx), sel8x8_1(yy) ),
1168 min8U( sel8x8_0(xx), sel8x8_0(yy) )
1169 );
1170}
1171
sewardje13074c2012-11-08 10:57:08 +00001172UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
1173{
1174 UInt r = 0;
1175 if (xx & (1ULL << (64-1))) r |= (1<<7);
1176 if (xx & (1ULL << (56-1))) r |= (1<<6);
1177 if (xx & (1ULL << (48-1))) r |= (1<<5);
1178 if (xx & (1ULL << (40-1))) r |= (1<<4);
1179 if (xx & (1ULL << (32-1))) r |= (1<<3);
1180 if (xx & (1ULL << (24-1))) r |= (1<<2);
1181 if (xx & (1ULL << (16-1))) r |= (1<<1);
1182 if (xx & (1ULL << ( 8-1))) r |= (1<<0);
1183 return r;
1184}
1185
sewardje2ea1762010-09-22 00:56:37 +00001186/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1187
1188/* Tuple/select functions for 16x2 vectors. */
1189static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1190 return (((UInt)w1) << 16) | ((UInt)w2);
1191}
1192
1193static inline UShort sel16x2_1 ( UInt w32 ) {
1194 return 0xFFFF & (UShort)(w32 >> 16);
1195}
1196static inline UShort sel16x2_0 ( UInt w32 ) {
1197 return 0xFFFF & (UShort)(w32);
1198}
1199
1200static inline UInt mk8x4 ( UChar w3, UChar w2,
1201 UChar w1, UChar w0 ) {
1202 UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
1203 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
1204 return w32;
1205}
1206
1207static inline UChar sel8x4_3 ( UInt w32 ) {
1208 return toUChar(0xFF & (w32 >> 24));
1209}
1210static inline UChar sel8x4_2 ( UInt w32 ) {
1211 return toUChar(0xFF & (w32 >> 16));
1212}
1213static inline UChar sel8x4_1 ( UInt w32 ) {
1214 return toUChar(0xFF & (w32 >> 8));
1215}
1216static inline UChar sel8x4_0 ( UInt w32 ) {
1217 return toUChar(0xFF & (w32 >> 0));
1218}
1219
1220
1221/* ----------------------------------------------------- */
1222/* More externally visible functions. These simply
1223 implement the corresponding IR primops. */
1224/* ----------------------------------------------------- */
1225
1226/* ------ 16x2 ------ */
1227
1228UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
1229{
1230 return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
1231 sel16x2_0(xx) + sel16x2_0(yy) );
1232}
1233
1234UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
1235{
1236 return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
1237 sel16x2_0(xx) - sel16x2_0(yy) );
1238}
1239
1240UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
1241{
1242 return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
1243 hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1244}
1245
1246UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
1247{
1248 return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
1249 hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1250}
1251
1252UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
1253{
1254 return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
1255 hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1256}
1257
1258UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
1259{
1260 return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
1261 hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1262}
1263
1264UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
1265{
1266 return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
1267 qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1268}
1269
1270UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
1271{
1272 return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
1273 qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1274}
1275
1276UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
1277{
1278 return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
1279 qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1280}
1281
1282UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
1283{
1284 return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
1285 qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1286}
1287
1288/* ------ 8x4 ------ */
1289
1290UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
1291{
1292 return mk8x4(
1293 sel8x4_3(xx) + sel8x4_3(yy),
1294 sel8x4_2(xx) + sel8x4_2(yy),
1295 sel8x4_1(xx) + sel8x4_1(yy),
1296 sel8x4_0(xx) + sel8x4_0(yy)
1297 );
1298}
1299
1300UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
1301{
1302 return mk8x4(
1303 sel8x4_3(xx) - sel8x4_3(yy),
1304 sel8x4_2(xx) - sel8x4_2(yy),
1305 sel8x4_1(xx) - sel8x4_1(yy),
1306 sel8x4_0(xx) - sel8x4_0(yy)
1307 );
1308}
1309
1310UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
1311{
1312 return mk8x4(
1313 hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
1314 hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
1315 hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
1316 hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
1317 );
1318}
1319
1320UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
1321{
1322 return mk8x4(
1323 hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
1324 hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
1325 hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
1326 hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
1327 );
1328}
1329
1330UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
1331{
1332 return mk8x4(
1333 hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
1334 hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
1335 hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
1336 hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
1337 );
1338}
1339
1340UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
1341{
1342 return mk8x4(
1343 hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
1344 hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
1345 hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
1346 hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
1347 );
1348}
1349
1350UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
1351{
1352 return mk8x4(
1353 qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
1354 qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
1355 qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
1356 qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
1357 );
1358}
1359
1360UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
1361{
1362 return mk8x4(
1363 qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
1364 qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
1365 qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
1366 qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
1367 );
1368}
1369
1370UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
1371{
1372 return mk8x4(
1373 qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
1374 qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
1375 qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
1376 qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
1377 );
1378}
1379
1380UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
1381{
1382 return mk8x4(
1383 qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
1384 qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
1385 qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
1386 qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
1387 );
1388}
1389
1390UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
1391{
1392 return mk16x2(
1393 cmpnez16( sel16x2_1(xx) ),
1394 cmpnez16( sel16x2_0(xx) )
1395 );
1396}
1397
1398UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
1399{
1400 return mk8x4(
1401 cmpnez8( sel8x4_3(xx) ),
1402 cmpnez8( sel8x4_2(xx) ),
1403 cmpnez8( sel8x4_1(xx) ),
1404 cmpnez8( sel8x4_0(xx) )
1405 );
1406}
sewardj38a3f862005-01-13 15:06:51 +00001407
sewardj310d6b22010-10-18 16:29:40 +00001408UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
1409{
1410 return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
1411 + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
1412 + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
1413 + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
1414}
1415
sewardj44ce46d2012-07-11 13:19:10 +00001416UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
1417{
1418 return qadd32S( xx, yy );
1419}
1420
1421UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
1422{
1423 return qsub32S( xx, yy );
1424}
1425
1426
/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD */
/*------------------------------------------------------------------*/

/* Bit-twiddling helpers for the DFP converters below; #undef'd again
   at the end of this section.  Values handled are single 0/1 bits. */
#define NOT( x ) ( ( ( x ) == 0) ? 1 : 0)                 /* logical not of a 0/1 value  */
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )  /* extract bit y of x   */
#define PUT( x, y ) ( ( x )<< ( y ) )                     /* place 0/1 value x at bit y  */
1435
/* Convert one 10-bit densely packed decimal group (low 10 bits of
   'chunk') to its 12-bit, 3-digit BCD equivalent, returned in the low
   12 bits of the result.  The DPD bits are named [pqrstuvwxy] and the
   BCD bits [abcdefghijkm]; the boolean equations are transcribed from
   Appendix B of Book 1: Power ISA User Instruction Set.  (The name
   says "dpb"; the ISA documents call the encoding DPD.) */
ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;   /* output BCD bits */
   Short p, q, r, s, t, u, v, w, x, y;         /* input DPD bits  */
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations.*/
   /* NOTE(review): keep these exactly as in the ISA appendix — the
      terms are not simplifiable without re-deriving the encoding. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
      | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
      | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   /* Reassemble the twelve BCD bits, 'a' most significant. */
   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
              | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
              | PUT(k, 1) | PUT(m, 0);
   return value;
}
1475
/* Convert a 3-digit BCD value (low 12 bits of 'chunk') to a 10-bit
   densely packed decimal group, returned in the low 10 bits of the
   result.  Inverse of dpb_to_bcd above. */
ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;   /* input BCD bits  */
   Short p, q, r, s, t, u, v, w, x, y;         /* output DPD bits */
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Binary (DPD) value
      The boolean equations to calculate the value of each of the DPD bit
      is given in Appendix B of Book 1: Power ISA User Instruction set. The
      bits for the DPD number are [abcdefghijkm]. The bits for the BCD value
      are [pqrstuvwxy]. The boolean logic equations in pseudo C code are:
    */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   /* NOTE(review): keep these exactly as in the ISA appendix — the
      terms are not simplifiable without re-deriving the encoding. */
   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
      | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
      | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   /* Reassemble the ten DPD bits, 'p' most significant. */
   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
              | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
1518
1519ULong h_DPBtoBCD( ULong dpb )
1520{
1521 ULong result, chunk;
1522 Int i;
1523
1524 result = 0;
1525
1526 for (i = 0; i < 5; i++) {
1527 chunk = dpb >> ( 4 - i ) * 10;
1528 result = result << 12;
1529 result |= dpb_to_bcd( chunk & 0x3FF );
1530 }
1531 return result;
1532}
1533
1534ULong h_BCDtoDPB( ULong bcd )
1535{
1536 ULong result, chunk;
1537 Int i;
1538
1539 result = 0;
1540
1541 for (i = 0; i < 5; i++) {
1542 chunk = bcd >> ( 4 - i ) * 12;
1543 result = result << 10;
1544 result |= bcd_to_dpb( chunk & 0xFFF );
1545 }
1546 return result;
1547}
1548#undef NOT
1549#undef GET
1550#undef PUT
sewardj310d6b22010-10-18 16:29:40 +00001551
sewardj38a3f862005-01-13 15:06:51 +00001552/*---------------------------------------------------------------*/
sewardjcef7d3e2009-07-02 12:21:59 +00001553/*--- end host_generic_simd64.c ---*/
sewardj38a3f862005-01-13 15:06:51 +00001554/*---------------------------------------------------------------*/
sewardj4c96e612012-06-02 23:47:02 +00001555