José Fonseca | 421507d | 2009-10-22 18:28:17 +0100 | [diff] [blame] | 1 | /************************************************************************** |
| 2 | * |
| 3 | * Copyright 2009 VMware, Inc. |
| 4 | * All Rights Reserved. |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 7 | * copy of this software and associated documentation files (the |
| 8 | * "Software"), to deal in the Software without restriction, including |
| 9 | * without limitation the rights to use, copy, modify, merge, publish, |
| 10 | * distribute, sub license, and/or sell copies of the Software, and to |
| 11 | * permit persons to whom the Software is furnished to do so, subject to |
| 12 | * the following conditions: |
| 13 | * |
| 14 | * The above copyright notice and this permission notice (including the |
| 15 | * next paragraph) shall be included in all copies or substantial portions |
| 16 | * of the Software. |
| 17 | * |
| 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| 19 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| 20 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
| 21 | * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
| 22 | * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| 23 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| 24 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 25 | * |
| 26 | **************************************************************************/ |
| 27 | |
| 28 | |
| 29 | /** |
| 30 | * @file |
| 31 | * Helper functions for packing/unpacking. |
| 32 | * |
| 33 | * Pack/unpacking is necessary for conversion between types of different |
| 34 | * bit width. |
| 35 | * |
| 36 | * They are also commonly used when a computation needs higher |
| 37 | * precision for the intermediate values. For example, if one needs the |
| 38 | * function: |
| 39 | * |
| 40 | * c = compute(a, b); |
| 41 | * |
| 42 | * to use more precision for intermediate results then one should implement it |
| 43 | * as: |
| 44 | * |
| 45 | * LLVMValueRef |
| 46 | * compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b) |
| 47 | * { |
| 48 | * struct lp_type wide_type = lp_wider_type(type); |
| 49 | * LLVMValueRef al, ah, bl, bh, cl, ch, c; |
| 50 | * |
| 51 | * lp_build_unpack2(builder, type, wide_type, a, &al, &ah); |
| 52 | * lp_build_unpack2(builder, type, wide_type, b, &bl, &bh); |
| 53 | * |
| 54 | * cl = compute_half(al, bl); |
| 55 | * ch = compute_half(ah, bh); |
| 56 | * |
| 57 | * c = lp_build_pack2(builder, wide_type, type, cl, ch); |
| 58 | * |
| 59 | * return c; |
| 60 | * } |
| 61 | * |
| 62 | * where compute_half() would do the computation for half the elements with |
| 63 | * twice the precision. |
| 64 | * |
| 65 | * @author Jose Fonseca <jfonseca@vmware.com> |
| 66 | */ |
| 67 | |
| 68 | |
| 69 | #include "util/u_debug.h" |
| 70 | #include "util/u_math.h" |
| 71 | #include "util/u_cpu_detect.h" |
| 72 | |
| 73 | #include "lp_bld_type.h" |
| 74 | #include "lp_bld_const.h" |
| 75 | #include "lp_bld_intr.h" |
| 76 | #include "lp_bld_arit.h" |
| 77 | #include "lp_bld_pack.h" |
| 78 | |
| 79 | |
| 80 | /** |
| 81 | * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions. |
| 82 | */ |
| 83 | static LLVMValueRef |
| 84 | lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi) |
| 85 | { |
| 86 | LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; |
| 87 | unsigned i, j; |
| 88 | |
| 89 | assert(n <= LP_MAX_VECTOR_LENGTH); |
| 90 | assert(lo_hi < 2); |
| 91 | |
| 92 | /* TODO: cache results in a static table */ |
| 93 | |
| 94 | for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) { |
| 95 | elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0); |
| 96 | elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0); |
| 97 | } |
| 98 | |
| 99 | return LLVMConstVector(elems, n); |
| 100 | } |
| 101 | |
| 102 | |
| 103 | /** |
| 104 | * Build shuffle vectors that match PACKxx instructions. |
| 105 | */ |
| 106 | static LLVMValueRef |
| 107 | lp_build_const_pack_shuffle(unsigned n) |
| 108 | { |
| 109 | LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; |
| 110 | unsigned i; |
| 111 | |
| 112 | assert(n <= LP_MAX_VECTOR_LENGTH); |
| 113 | |
| 114 | /* TODO: cache results in a static table */ |
| 115 | |
| 116 | for(i = 0; i < n; ++i) |
| 117 | elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0); |
| 118 | |
| 119 | return LLVMConstVector(elems, n); |
| 120 | } |
| 121 | |
| 122 | |
| 123 | /** |
| 124 | * Interleave vector elements. |
| 125 | * |
| 126 | * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions. |
| 127 | */ |
| 128 | LLVMValueRef |
| 129 | lp_build_interleave2(LLVMBuilderRef builder, |
| 130 | struct lp_type type, |
| 131 | LLVMValueRef a, |
| 132 | LLVMValueRef b, |
| 133 | unsigned lo_hi) |
| 134 | { |
| 135 | LLVMValueRef shuffle; |
| 136 | |
| 137 | shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi); |
| 138 | |
| 139 | return LLVMBuildShuffleVector(builder, a, b, shuffle, ""); |
| 140 | } |
| 141 | |
| 142 | |
| 143 | /** |
| 144 | * Double the bit width. |
| 145 | * |
| 146 | * This will only change the number of bits the values are represented, not the |
| 147 | * values themselves. |
| 148 | */ |
| 149 | void |
| 150 | lp_build_unpack2(LLVMBuilderRef builder, |
| 151 | struct lp_type src_type, |
| 152 | struct lp_type dst_type, |
| 153 | LLVMValueRef src, |
| 154 | LLVMValueRef *dst_lo, |
| 155 | LLVMValueRef *dst_hi) |
| 156 | { |
| 157 | LLVMValueRef msb; |
| 158 | LLVMTypeRef dst_vec_type; |
| 159 | |
| 160 | assert(!src_type.floating); |
| 161 | assert(!dst_type.floating); |
José Fonseca | 421507d | 2009-10-22 18:28:17 +0100 | [diff] [blame] | 162 | assert(dst_type.width == src_type.width * 2); |
| 163 | assert(dst_type.length * 2 == src_type.length); |
| 164 | |
José Fonseca | 8d80fd3 | 2009-10-25 09:03:50 +0000 | [diff] [blame] | 165 | if(dst_type.sign && src_type.sign) { |
José Fonseca | 421507d | 2009-10-22 18:28:17 +0100 | [diff] [blame] | 166 | /* Replicate the sign bit in the most significant bits */ |
| 167 | msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), ""); |
| 168 | } |
| 169 | else |
| 170 | /* Most significant bits always zero */ |
| 171 | msb = lp_build_zero(src_type); |
| 172 | |
| 173 | /* Interleave bits */ |
| 174 | if(util_cpu_caps.little_endian) { |
| 175 | *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0); |
| 176 | *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1); |
| 177 | } |
| 178 | else { |
| 179 | *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0); |
| 180 | *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1); |
| 181 | } |
| 182 | |
| 183 | /* Cast the result into the new type (twice as wide) */ |
| 184 | |
| 185 | dst_vec_type = lp_build_vec_type(dst_type); |
| 186 | |
| 187 | *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, ""); |
| 188 | *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, ""); |
| 189 | } |
| 190 | |
| 191 | |
| 192 | /** |
| 193 | * Expand the bit width. |
| 194 | * |
| 195 | * This will only change the number of bits the values are represented, not the |
| 196 | * values themselves. |
| 197 | */ |
| 198 | void |
| 199 | lp_build_unpack(LLVMBuilderRef builder, |
| 200 | struct lp_type src_type, |
| 201 | struct lp_type dst_type, |
| 202 | LLVMValueRef src, |
| 203 | LLVMValueRef *dst, unsigned num_dsts) |
| 204 | { |
| 205 | unsigned num_tmps; |
| 206 | unsigned i; |
| 207 | |
| 208 | /* Register width must remain constant */ |
| 209 | assert(src_type.width * src_type.length == dst_type.width * dst_type.length); |
| 210 | |
| 211 | /* We must not loose or gain channels. Only precision */ |
| 212 | assert(src_type.length == dst_type.length * num_dsts); |
| 213 | |
| 214 | num_tmps = 1; |
| 215 | dst[0] = src; |
| 216 | |
| 217 | while(src_type.width < dst_type.width) { |
| 218 | struct lp_type tmp_type = src_type; |
| 219 | |
| 220 | tmp_type.width *= 2; |
| 221 | tmp_type.length /= 2; |
| 222 | |
| 223 | for(i = num_tmps; i--; ) { |
| 224 | lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]); |
| 225 | } |
| 226 | |
| 227 | src_type = tmp_type; |
| 228 | |
| 229 | num_tmps *= 2; |
| 230 | } |
| 231 | |
| 232 | assert(num_tmps == num_dsts); |
| 233 | } |
| 234 | |
| 235 | |
| 236 | /** |
| 237 | * Non-interleaved pack. |
| 238 | * |
| 239 | * This will move values as |
| 240 | * |
| 241 | * lo = __ l0 __ l1 __ l2 __.. __ ln |
| 242 | * hi = __ h0 __ h1 __ h2 __.. __ hn |
| 243 | * res = l0 l1 l2 .. ln h0 h1 h2 .. hn |
| 244 | * |
| 245 | * This will only change the number of bits the values are represented, not the |
| 246 | * values themselves. |
| 247 | * |
| 248 | * It is assumed the values are already clamped into the destination type range. |
| 249 | * Values outside that range will produce undefined results. Use |
| 250 | * lp_build_packs2 instead. |
| 251 | */ |
| 252 | LLVMValueRef |
| 253 | lp_build_pack2(LLVMBuilderRef builder, |
| 254 | struct lp_type src_type, |
| 255 | struct lp_type dst_type, |
| 256 | LLVMValueRef lo, |
| 257 | LLVMValueRef hi) |
| 258 | { |
| 259 | LLVMTypeRef src_vec_type = lp_build_vec_type(src_type); |
| 260 | LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type); |
| 261 | LLVMValueRef shuffle; |
| 262 | LLVMValueRef res; |
| 263 | |
| 264 | dst_vec_type = lp_build_vec_type(dst_type); |
| 265 | |
| 266 | assert(!src_type.floating); |
| 267 | assert(!dst_type.floating); |
| 268 | assert(src_type.width == dst_type.width * 2); |
| 269 | assert(src_type.length * 2 == dst_type.length); |
| 270 | |
| 271 | if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { |
| 272 | switch(src_type.width) { |
| 273 | case 32: |
| 274 | if(dst_type.sign) { |
| 275 | res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi); |
| 276 | } |
| 277 | else { |
| 278 | if (util_cpu_caps.has_sse4_1) { |
| 279 | /* PACKUSDW is the only instrinsic with a consistent signature */ |
| 280 | return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); |
| 281 | } |
| 282 | else { |
| 283 | assert(0); |
| 284 | return LLVMGetUndef(dst_vec_type); |
| 285 | } |
| 286 | } |
| 287 | break; |
| 288 | |
| 289 | case 16: |
| 290 | if(dst_type.sign) |
| 291 | res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi); |
| 292 | else |
| 293 | res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi); |
| 294 | break; |
| 295 | |
| 296 | default: |
| 297 | assert(0); |
| 298 | return LLVMGetUndef(dst_vec_type); |
| 299 | break; |
| 300 | } |
| 301 | |
| 302 | res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); |
| 303 | return res; |
| 304 | } |
| 305 | |
| 306 | lo = LLVMBuildBitCast(builder, lo, dst_vec_type, ""); |
| 307 | hi = LLVMBuildBitCast(builder, hi, dst_vec_type, ""); |
| 308 | |
| 309 | shuffle = lp_build_const_pack_shuffle(dst_type.length); |
| 310 | |
| 311 | res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, ""); |
| 312 | |
| 313 | return res; |
| 314 | } |
| 315 | |
| 316 | |
| 317 | |
| 318 | /** |
| 319 | * Non-interleaved pack and saturate. |
| 320 | * |
| 321 | * Same as lp_build_pack2 but will saturate values so that they fit into the |
| 322 | * destination type. |
| 323 | */ |
| 324 | LLVMValueRef |
| 325 | lp_build_packs2(LLVMBuilderRef builder, |
| 326 | struct lp_type src_type, |
| 327 | struct lp_type dst_type, |
| 328 | LLVMValueRef lo, |
| 329 | LLVMValueRef hi) |
| 330 | { |
| 331 | boolean clamp; |
| 332 | |
| 333 | assert(!src_type.floating); |
| 334 | assert(!dst_type.floating); |
| 335 | assert(src_type.sign == dst_type.sign); |
| 336 | assert(src_type.width == dst_type.width * 2); |
| 337 | assert(src_type.length * 2 == dst_type.length); |
| 338 | |
| 339 | clamp = TRUE; |
| 340 | |
| 341 | /* All X86 SSE non-interleaved pack instructions take signed inputs and |
| 342 | * saturate them, so no need to clamp for those cases. */ |
| 343 | if(util_cpu_caps.has_sse2 && |
| 344 | src_type.width * src_type.length == 128 && |
| 345 | src_type.sign) |
| 346 | clamp = FALSE; |
| 347 | |
| 348 | if(clamp) { |
| 349 | struct lp_build_context bld; |
| 350 | unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width; |
| 351 | LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1); |
| 352 | lp_build_context_init(&bld, builder, src_type); |
| 353 | lo = lp_build_min(&bld, lo, dst_max); |
| 354 | hi = lp_build_min(&bld, hi, dst_max); |
| 355 | /* FIXME: What about lower bound? */ |
| 356 | } |
| 357 | |
| 358 | return lp_build_pack2(builder, src_type, dst_type, lo, hi); |
| 359 | } |
| 360 | |
| 361 | |
| 362 | /** |
| 363 | * Truncate the bit width. |
| 364 | * |
| 365 | * TODO: Handle saturation consistently. |
| 366 | */ |
| 367 | LLVMValueRef |
| 368 | lp_build_pack(LLVMBuilderRef builder, |
| 369 | struct lp_type src_type, |
| 370 | struct lp_type dst_type, |
| 371 | boolean clamped, |
| 372 | const LLVMValueRef *src, unsigned num_srcs) |
| 373 | { |
| 374 | LLVMValueRef (*pack2)(LLVMBuilderRef builder, |
| 375 | struct lp_type src_type, |
| 376 | struct lp_type dst_type, |
| 377 | LLVMValueRef lo, |
| 378 | LLVMValueRef hi); |
| 379 | LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; |
| 380 | unsigned i; |
| 381 | |
| 382 | |
| 383 | /* Register width must remain constant */ |
| 384 | assert(src_type.width * src_type.length == dst_type.width * dst_type.length); |
| 385 | |
| 386 | /* We must not loose or gain channels. Only precision */ |
| 387 | assert(src_type.length * num_srcs == dst_type.length); |
| 388 | |
| 389 | if(clamped) |
| 390 | pack2 = &lp_build_pack2; |
| 391 | else |
| 392 | pack2 = &lp_build_packs2; |
| 393 | |
| 394 | for(i = 0; i < num_srcs; ++i) |
| 395 | tmp[i] = src[i]; |
| 396 | |
| 397 | while(src_type.width > dst_type.width) { |
| 398 | struct lp_type tmp_type = src_type; |
| 399 | |
| 400 | tmp_type.width /= 2; |
| 401 | tmp_type.length *= 2; |
| 402 | |
| 403 | /* Take in consideration the sign changes only in the last step */ |
| 404 | if(tmp_type.width == dst_type.width) |
| 405 | tmp_type.sign = dst_type.sign; |
| 406 | |
| 407 | num_srcs /= 2; |
| 408 | |
| 409 | for(i = 0; i < num_srcs; ++i) |
| 410 | tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]); |
| 411 | |
| 412 | src_type = tmp_type; |
| 413 | } |
| 414 | |
| 415 | assert(num_srcs == 1); |
| 416 | |
| 417 | return tmp[0]; |
| 418 | } |