Blame - src/gallium/auxiliary/gallivm/lp_bld_pack.c - fp2-dev/platform/external/mesa3d

blob: bc360ad77add1f24c57148fe5edf6b85770a8a6f [file] [log] [blame]

José Fonseca	421507d	2009-10-22 18:28:17 +0100	[diff] [blame]	1	/**************************************************************************
				2	*
				3	* Copyright 2009 VMware, Inc.
				4	* All Rights Reserved.
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a
				7	* copy of this software and associated documentation files (the
				8	* "Software"), to deal in the Software without restriction, including
				9	* without limitation the rights to use, copy, modify, merge, publish,
				10	* distribute, sub license, and/or sell copies of the Software, and to
				11	* permit persons to whom the Software is furnished to do so, subject to
				12	* the following conditions:
				13	*
				14	* The above copyright notice and this permission notice (including the
				15	* next paragraph) shall be included in all copies or substantial portions
				16	* of the Software.
				17	*
				18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
				19	* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				20	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
				21	* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
				22	* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
				23	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
				24	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
				25	*
				26	**************************************************************************/
				27
				28
				29	/**
				30	* @file
				31	* Helper functions for packing/unpacking.
				32	*
				33	* Pack/unpacking is necessary for conversion between types of different
				34	* bit width.
				35	*
				36	* They are also commonly used when a computation needs higher
				37	* precision for the intermediate values. For example, if one needs the
				38	* function:
				39	*
				40	* c = compute(a, b);
				41	*
				42	* to use more precision for intermediate results then one should implement it
				43	* as:
				44	*
				45	* LLVMValueRef
				46	* compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
				47	* {
				48	* struct lp_type wide_type = lp_wider_type(type);
				49	* LLVMValueRef al, ah, bl, bh, cl, ch, c;
				50	*
				51	* lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
				52	* lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
				53	*
				54	* cl = compute_half(al, bl);
				55	* ch = compute_half(ah, bh);
				56	*
				57	* c = lp_build_pack2(builder, wide_type, type, cl, ch);
				58	*
				59	* return c;
				60	* }
				61	*
				62	* where compute_half() would do the computation for half the elements with
				63	* twice the precision.
				64	*
				65	* @author Jose Fonseca <jfonseca@vmware.com>
				66	*/
				67
				68
				69	#include "util/u_debug.h"
				70	#include "util/u_math.h"
				71	#include "util/u_cpu_detect.h"
				72
				73	#include "lp_bld_type.h"
				74	#include "lp_bld_const.h"
				75	#include "lp_bld_intr.h"
				76	#include "lp_bld_arit.h"
				77	#include "lp_bld_pack.h"
				78
				79
				80	/**
				81	* Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
				82	*/
				83	static LLVMValueRef
				84	lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
				85	{
				86	LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
				87	unsigned i, j;
				88
				89	assert(n <= LP_MAX_VECTOR_LENGTH);
				90	assert(lo_hi < 2);
				91
				92	/* TODO: cache results in a static table */
				93
				94	for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
				95	elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
				96	elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
				97	}
				98
				99	return LLVMConstVector(elems, n);
				100	}
				101
				102
				103	/**
				104	* Build shuffle vectors that match PACKxx instructions.
				105	*/
				106	static LLVMValueRef
				107	lp_build_const_pack_shuffle(unsigned n)
				108	{
				109	LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
				110	unsigned i;
				111
				112	assert(n <= LP_MAX_VECTOR_LENGTH);
				113
				114	/* TODO: cache results in a static table */
				115
				116	for(i = 0; i < n; ++i)
				117	elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
				118
				119	return LLVMConstVector(elems, n);
				120	}
				121
				122
				123	/**
				124	* Interleave vector elements.
				125	*
				126	* Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
				127	*/
				128	LLVMValueRef
				129	lp_build_interleave2(LLVMBuilderRef builder,
				130	struct lp_type type,
				131	LLVMValueRef a,
				132	LLVMValueRef b,
				133	unsigned lo_hi)
				134	{
				135	LLVMValueRef shuffle;
				136
				137	shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi);
				138
				139	return LLVMBuildShuffleVector(builder, a, b, shuffle, "");
				140	}
				141
				142
				143	/**
				144	* Double the bit width.
				145	*
				146	* This will only change the number of bits the values are represented, not the
				147	* values themselves.
				148	*/
				149	void
				150	lp_build_unpack2(LLVMBuilderRef builder,
				151	struct lp_type src_type,
				152	struct lp_type dst_type,
				153	LLVMValueRef src,
				154	LLVMValueRef *dst_lo,
				155	LLVMValueRef *dst_hi)
				156	{
				157	LLVMValueRef msb;
				158	LLVMTypeRef dst_vec_type;
				159
				160	assert(!src_type.floating);
				161	assert(!dst_type.floating);
José Fonseca	421507d	2009-10-22 18:28:17 +0100	[diff] [blame]	162	assert(dst_type.width == src_type.width * 2);
				163	assert(dst_type.length * 2 == src_type.length);
				164
José Fonseca	8d80fd3	2009-10-25 09:03:50 +0000	[diff] [blame]	165	if(dst_type.sign && src_type.sign) {
José Fonseca	421507d	2009-10-22 18:28:17 +0100	[diff] [blame]	166	/* Replicate the sign bit in the most significant bits */
				167	msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
				168	}
				169	else
				170	/* Most significant bits always zero */
				171	msb = lp_build_zero(src_type);
				172
				173	/* Interleave bits */
				174	if(util_cpu_caps.little_endian) {
				175	*dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
				176	*dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
				177	}
				178	else {
				179	*dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
				180	*dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
				181	}
				182
				183	/* Cast the result into the new type (twice as wide) */
				184
				185	dst_vec_type = lp_build_vec_type(dst_type);
				186
				187	dst_lo = LLVMBuildBitCast(builder, dst_lo, dst_vec_type, "");
				188	dst_hi = LLVMBuildBitCast(builder, dst_hi, dst_vec_type, "");
				189	}
				190
				191
				192	/**
				193	* Expand the bit width.
				194	*
				195	* This will only change the number of bits the values are represented, not the
				196	* values themselves.
				197	*/
				198	void
				199	lp_build_unpack(LLVMBuilderRef builder,
				200	struct lp_type src_type,
				201	struct lp_type dst_type,
				202	LLVMValueRef src,
				203	LLVMValueRef *dst, unsigned num_dsts)
				204	{
				205	unsigned num_tmps;
				206	unsigned i;
				207
				208	/* Register width must remain constant */
				209	assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
				210
				211	/* We must not loose or gain channels. Only precision */
				212	assert(src_type.length == dst_type.length * num_dsts);
				213
				214	num_tmps = 1;
				215	dst[0] = src;
				216
				217	while(src_type.width < dst_type.width) {
				218	struct lp_type tmp_type = src_type;
				219
				220	tmp_type.width *= 2;
				221	tmp_type.length /= 2;
				222
				223	for(i = num_tmps; i--; ) {
				224	lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2i + 0], &dst[2i + 1]);
				225	}
				226
				227	src_type = tmp_type;
				228
				229	num_tmps *= 2;
				230	}
				231
				232	assert(num_tmps == num_dsts);
				233	}
				234
				235
				236	/**
				237	* Non-interleaved pack.
				238	*
				239	* This will move values as
				240	*
				241	* lo = __ l0 __ l1 __ l2 __.. __ ln
				242	* hi = __ h0 __ h1 __ h2 __.. __ hn
				243	* res = l0 l1 l2 .. ln h0 h1 h2 .. hn
				244	*
				245	* This will only change the number of bits the values are represented, not the
				246	* values themselves.
				247	*
				248	* It is assumed the values are already clamped into the destination type range.
				249	* Values outside that range will produce undefined results. Use
				250	* lp_build_packs2 instead.
				251	*/
				252	LLVMValueRef
				253	lp_build_pack2(LLVMBuilderRef builder,
				254	struct lp_type src_type,
				255	struct lp_type dst_type,
				256	LLVMValueRef lo,
				257	LLVMValueRef hi)
				258	{
				259	LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
				260	LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
				261	LLVMValueRef shuffle;
				262	LLVMValueRef res;
				263
				264	dst_vec_type = lp_build_vec_type(dst_type);
				265
				266	assert(!src_type.floating);
				267	assert(!dst_type.floating);
				268	assert(src_type.width == dst_type.width * 2);
				269	assert(src_type.length * 2 == dst_type.length);
				270
				271	if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
				272	switch(src_type.width) {
				273	case 32:
				274	if(dst_type.sign) {
				275	res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
				276	}
				277	else {
				278	if (util_cpu_caps.has_sse4_1) {
				279	/* PACKUSDW is the only instrinsic with a consistent signature */
				280	return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
				281	}
				282	else {
				283	assert(0);
				284	return LLVMGetUndef(dst_vec_type);
				285	}
				286	}
				287	break;
				288
				289	case 16:
				290	if(dst_type.sign)
				291	res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
				292	else
				293	res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
				294	break;
				295
				296	default:
				297	assert(0);
				298	return LLVMGetUndef(dst_vec_type);
				299	break;
				300	}
				301
				302	res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
				303	return res;
				304	}
				305
				306	lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
				307	hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
				308
				309	shuffle = lp_build_const_pack_shuffle(dst_type.length);
				310
				311	res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
				312
				313	return res;
				314	}
				315
				316
				317
				318	/**
				319	* Non-interleaved pack and saturate.
				320	*
				321	* Same as lp_build_pack2 but will saturate values so that they fit into the
				322	* destination type.
				323	*/
				324	LLVMValueRef
				325	lp_build_packs2(LLVMBuilderRef builder,
				326	struct lp_type src_type,
				327	struct lp_type dst_type,
				328	LLVMValueRef lo,
				329	LLVMValueRef hi)
				330	{
				331	boolean clamp;
				332
				333	assert(!src_type.floating);
				334	assert(!dst_type.floating);
				335	assert(src_type.sign == dst_type.sign);
				336	assert(src_type.width == dst_type.width * 2);
				337	assert(src_type.length * 2 == dst_type.length);
				338
				339	clamp = TRUE;
				340
				341	/* All X86 SSE non-interleaved pack instructions take signed inputs and
				342	* saturate them, so no need to clamp for those cases. */
				343	if(util_cpu_caps.has_sse2 &&
				344	src_type.width * src_type.length == 128 &&
				345	src_type.sign)
				346	clamp = FALSE;
				347
				348	if(clamp) {
				349	struct lp_build_context bld;
				350	unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
				351	LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
				352	lp_build_context_init(&bld, builder, src_type);
				353	lo = lp_build_min(&bld, lo, dst_max);
				354	hi = lp_build_min(&bld, hi, dst_max);
				355	/* FIXME: What about lower bound? */
				356	}
				357
				358	return lp_build_pack2(builder, src_type, dst_type, lo, hi);
				359	}
				360
				361
				362	/**
				363	* Truncate the bit width.
				364	*
				365	* TODO: Handle saturation consistently.
				366	*/
				367	LLVMValueRef
				368	lp_build_pack(LLVMBuilderRef builder,
				369	struct lp_type src_type,
				370	struct lp_type dst_type,
				371	boolean clamped,
				372	const LLVMValueRef *src, unsigned num_srcs)
				373	{
				374	LLVMValueRef (*pack2)(LLVMBuilderRef builder,
				375	struct lp_type src_type,
				376	struct lp_type dst_type,
				377	LLVMValueRef lo,
				378	LLVMValueRef hi);
				379	LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
				380	unsigned i;
				381
				382
				383	/* Register width must remain constant */
				384	assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
				385
				386	/* We must not loose or gain channels. Only precision */
				387	assert(src_type.length * num_srcs == dst_type.length);
				388
				389	if(clamped)
				390	pack2 = &lp_build_pack2;
				391	else
				392	pack2 = &lp_build_packs2;
				393
				394	for(i = 0; i < num_srcs; ++i)
				395	tmp[i] = src[i];
				396
				397	while(src_type.width > dst_type.width) {
				398	struct lp_type tmp_type = src_type;
				399
				400	tmp_type.width /= 2;
				401	tmp_type.length *= 2;
				402
				403	/* Take in consideration the sign changes only in the last step */
				404	if(tmp_type.width == dst_type.width)
				405	tmp_type.sign = dst_type.sign;
				406
				407	num_srcs /= 2;
				408
				409	for(i = 0; i < num_srcs; ++i)
				410	tmp[i] = pack2(builder, src_type, tmp_type, tmp[2i + 0], tmp[2i + 1]);
				411
				412	src_type = tmp_type;
				413	}
				414
				415	assert(num_srcs == 1);
				416
				417	return tmp[0];
				418	}