
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
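
/* As a quick sanity check of the lane numbering (illustrative, worked
   by hand): lane 0 always occupies the least significant bits, so

      mk32x2(0xAABBCCDD, 0x11223344)   == 0xAABBCCDD11223344ULL
      sel32x2_1(0xAABBCCDD11223344ULL) == 0xAABBCCDD
      sel32x2_0(0xAABBCCDD11223344ULL) == 0x11223344

   The 16x4 and 8x8 selectors below follow the same convention. */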


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}
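
/* index8x8 treats w64 as a little-endian table of 8 bytes and fetches
   the byte selected by the low 3 bits of ix; h_generic_calc_Perm8x8
   below is built on it.  An illustrative pair of values: with
   w64 = 0x0706050403020100ULL (byte k holds the value k),
   index8x8(w64, 3) == 0x03, and index8x8(w64, 11) is also 0x03 since
   only ix & 7 matters. */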


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
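
/* The q-prefixed helpers clamp at the type's limits instead of
   wrapping.  A few worked values: qadd16S(30000, 30000) == 32767
   (not 60000), qadd8S(-100, -100) == -128, and qadd8U(200, 100)
   == 255.  The plain lane-wise adds further down simply wrap. */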

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
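
/* mulhi16S/mulhi16U return bits 31:16 of the full 32-bit product,
   the half that mul16 throws away.  For example
   mulhi16S(16384, 16384) == 4096: the product is 0x10000000 and
   0x10000000 >> 16 == 0x1000. */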

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}
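
/* Note the comparisons produce an all-ones or all-zeroes lane rather
   than a 0/1 boolean, so results can serve directly as select masks:
   cmpgt16S(5, 3) == 0xFFFF, cmpgt16S(3, 5) == 0, cmpnez8(7) == 0xFF. */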

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767) xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127) xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0) xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
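
/* Worked values for the saturating narrows: qnarrow32Sto16S(100000)
   == 32767, qnarrow16Sto8S(300) == 127, and qnarrow16Sto8U(0xFFFF)
   == 0, since 0xFFFF reinterpreted as a Short is -1, which clamps to
   the bottom of the unsigned range. */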

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
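
/* The sar helpers rely on '>>' of a negative signed value being an
   arithmetic shift, which the C standard leaves implementation-defined
   but gcc guarantees.  Hence sar8(0xF0, 4) == 0xFF: (Char)0xF0 is -16,
   and -16 >> 4 is -1. */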

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
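
/* The '+1' makes these round-to-nearest-up averages, the same
   rounding x86's PAVGB/PAVGW use: avg8U(1, 2) == 2 (the true mean
   1.5 rounds up) and avg8U(254, 255) == 255. */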

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Char)r;
}
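
/* The halving ops compute the full 9- or 17-bit intermediate before
   shifting, so they cannot wrap: hadd8U(255, 255) == 255.  For the
   signed variants the arithmetic shift rounds towards minus infinity,
   e.g. hadd8S(-1, -2) == -2 rather than -1. */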

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}
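
/* Lane arithmetic never carries across lane boundaries.  Worked call:
   h_generic_calc_Add8x8(0x00000000000000FFULL, 0x01ULL) == 0, since
   the bottom lane wraps from 0xFF to 0x00 and no carry propagates
   into lane 1. */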

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
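
/* Note the operand order for the binary narrows: the aa lanes land in
   the high half of the result and the bb lanes in the low half.  A
   worked call: h_generic_calc_QNarrowBin16Sto8Sx8(0x0004000300020001ULL,
   0x0008000700060005ULL) == 0x0403020108070605ULL. */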

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}
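
/* InterleaveLO8x8 zips the low halves of the two operands, with the
   aa byte of each pair above the bb byte.  Worked call:
   h_generic_calc_InterleaveLO8x8(0x1716151413121110ULL,
   0x2726252423222120ULL) == 0x1323122211211020ULL; InterleaveHI8x8
   does the same for lanes 7..4. */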

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
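
/* Perm8x8 is a byte-table permute: result byte i is the aa byte
   selected by the low 3 bits of bb byte i (higher selector bits are
   simply ignored, unlike e.g. PSHUFB's zeroing rule).  So with
   aa == 0x1122334455667788ULL and bb == 0x0001020304050607ULL the
   result is 0x8877665544332211ULL, i.e. aa byte-reversed. */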

/* ------------ Shifting ------------ */
/* These primops are undefined if the shift amount equals or exceeds
   the lane width, so the shift amount is masked to keep the scalar
   shifts in range.  Given the semantics of these primops (ShlN16x4,
   etc), it would in any case be an error ever to be handed an
   out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
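
/* Sad8Ux4 is a 4-lane sum of absolute differences, in the style of
   ARM's USAD8.  Worked call:
   h_generic_calc_Sad8Ux4(0x05010400, 0x01050001)
   == |5-1| + |1-5| + |4-0| + |0-1| == 13. */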

/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions  */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                      */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}

ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3-digit BCD value to a 10-bit Densely Packed Decimal
      (DPD) value.  The boolean equations for each of the DPD bits are
      given in Appendix B of Book 1: Power ISA User Instruction Set.
      The bits of the DPD number are [abcdefghijkm]; the bits of the
      BCD value are [pqrstuvwxy].  The boolean logic equations in
      pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
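
/* A round trip hand-worked from the equations above, as a sanity
   check: the three BCD digits 1,2,3 (0x123) densely pack to the
   10-bit value 0x0A3, and the two sets of equations invert each
   other:

      bcd_to_dpb(0x123) == 0x0A3   and   dpb_to_bcd(0x0A3) == 0x123

   For digits that are all in 0..7, DPD is simply the three digits'
   low 3 bits with a 0 flag bit before the last group: 001 010 0 011. */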

ULong h_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
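
/* Both walk the 64-bit operand in five chunks, so one call converts
   15 BCD digits (60 bits) to 50 bits of DPD, or back.  Continuing the
   example above, h_BCDtoDPB(0x123ULL) == 0xA3ULL and
   h_DPBtoBCD(0xA3ULL) == 0x123ULL, since the upper four chunks are
   zero and both conversions map a zero chunk to zero. */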
#undef NOT
#undef GET
#undef PUT

/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/