
/*---------------------------------------------------------------*/
/*--- begin                              host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2010 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */
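
/* Throughout this file a 64-bit value is viewed as a vector of lanes,
   with lane 0 in the least significant bits.  Worked example
   (illustrative only), viewing the operands as four 16-bit lanes:

      h_generic_calc_Add16x4(0x0001000200030004ULL, 0x0010002000300040ULL)
         == 0x0011002200330044ULL

   i.e. the four lanes are added independently, with no carry
   propagation between lanes. */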

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}
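
/* Worked examples (illustrative only): qadd16S(28672, 28672) would be
   57344 in plain arithmetic, which overflows a Short, so the result is
   clamped to 32767.  Likewise qadd8U(200, 100) clamps at 255 rather
   than wrapping. */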

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}
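
/* Note (illustrative only): the comparison helpers produce an all-ones
   or all-zeroes mask per lane, e.g. cmpgt16S(1, -1) == 0xFFFF and
   cmpgt16S(-1, 1) == 0. */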

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767) xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127) xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0) xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
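
/* Worked examples (illustrative only): qnarrow16Sto8S(0x0100) treats
   the input as the signed value 256 and clamps it to 127, while
   qnarrow16Sto8U(0xFF80) treats the input as -128 and clamps it to 0. */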

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}
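
/* Worked example (illustrative only): avg8U/avg16U round upwards, so
   avg8U(1, 2) == 2, whereas the halving adds further down (hadd8U etc)
   truncate, so hadd8U(1, 2) == 1. */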

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrow32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrow16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrow16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
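
/* Worked example (illustrative only): the narrowed lanes from aa land
   in the high half of the result and those from bb in the low half,
   e.g.

      h_generic_calc_QNarrow32Sto16Sx4(0x00010000FFFFFFFFULL,
                                       0x00007FFF80000000ULL)
         == 0x7FFFFFFF7FFF8000ULL

   since 0x00010000 (65536) saturates to 0x7FFF, 0xFFFFFFFF (-1) narrows
   to 0xFFFF, 0x00007FFF (32767) narrows to 0x7FFF, and 0x80000000
   saturates to 0x8000. */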

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

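/* Worked example (illustrative only): InterleaveLO8x8 alternates the low
   four bytes of the two operands, with aa's byte in the more significant
   position of each pair:

      h_generic_calc_InterleaveLO8x8(0x0706050403020100ULL,
                                     0x1716151413121110ULL)
         == 0x0313021201110010ULL
*/
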
/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
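
/* Worked example (illustrative only): each byte of bb selects, via its
   low three bits, a byte of aa for the corresponding result lane.  With
   selectors 0..7 in reverse order, the bytes of aa come back reversed:

      h_generic_calc_Perm8x8(0xA7A6A5A4A3A2A1A0ULL, 0x0001020304050607ULL)
         == 0xA0A1A2A3A4A5A6A7ULL
*/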

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Indeed, given the
   semantics of these primops (ShlN16x4, etc) it is an error if we
   are ever given an out-of-range shift amount.
*/
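
/* Worked example (illustrative only): the arithmetic right shifts keep
   the sign bit of each lane, so

      h_generic_calc_SarN16x4(0x80007FFFFFFF0002ULL, 1)
         == 0xC0003FFFFFFF0001ULL
*/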
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
1103
sewardje2ea1762010-09-22 00:56:37 +00001104/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1105
1106/* Tuple/select functions for 16x2 vectors. */
1107static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1108 return (((UInt)w1) << 16) | ((UInt)w2);
1109}
1110
1111static inline UShort sel16x2_1 ( UInt w32 ) {
1112 return 0xFFFF & (UShort)(w32 >> 16);
1113}
1114static inline UShort sel16x2_0 ( UInt w32 ) {
1115 return 0xFFFF & (UShort)(w32);
1116}
1117
1118static inline UInt mk8x4 ( UChar w3, UChar w2,
1119 UChar w1, UChar w0 ) {
1120 UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
1121 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
1122 return w32;
1123}
1124
1125static inline UChar sel8x4_3 ( UInt w32 ) {
1126 return toUChar(0xFF & (w32 >> 24));
1127}
1128static inline UChar sel8x4_2 ( UInt w32 ) {
1129 return toUChar(0xFF & (w32 >> 16));
1130}
1131static inline UChar sel8x4_1 ( UInt w32 ) {
1132 return toUChar(0xFF & (w32 >> 8));
1133}
1134static inline UChar sel8x4_0 ( UInt w32 ) {
1135 return toUChar(0xFF & (w32 >> 0));
1136}
1137
1138
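/* These mirror the 64-bit helpers above, but pack two 16-bit or four
   8-bit lanes into a UInt.  Worked example (illustrative only):

      h_generic_calc_Add8x4(0x01020304, 0x10203040) == 0x11223344
*/
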
/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
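
/* Worked example (illustrative only): the result is the sum of the
   absolute differences of the four byte lanes, e.g.

      h_generic_calc_Sad8Ux4(0x00010203, 0x03020100)
         == |0-3| + |1-2| + |2-1| + |3-0| == 8
*/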


/*---------------------------------------------------------------*/
/*--- end                                host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/