/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() is provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)
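// Illustrative expansion (hypothetical caller-side names __ret/__obj/__x/__y;
// the real call sites live in CUDA's texture headers):
//   float4 __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// becomes (roughly)
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);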

#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into an anonymous namespace so they have internal
// linkage. The device-only functions here must be internal in order to avoid
// ODR violations in case they are used from files compiled with -fgpu-rdc.
// E.g. a library and an app using it may be built with different versions of
// this header file.
namespace {

// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// hash function has a collision for any of the texture operations, the
// compilation will fail due to an attempt to redefine a tag with the same
// value. If the header compiles, then the hash function is good enough for
// the job.

constexpr int __tex_len(const char *s) {
  return (s[0] == 0)    ? 0
         : (s[1] == 0)  ? 1
         : (s[2] == 0)  ? 2
         : (s[3] == 0)  ? 3
         : (s[4] == 0)  ? 4
         : (s[5] == 0)  ? 5
         : (s[6] == 0)  ? 6
         : (s[7] == 0)  ? 7
         : (s[8] == 0)  ? 8
         : (s[9] == 0)  ? 9
         : (s[10] == 0) ? 10
         : (s[11] == 0) ? 11
         : (s[12] == 0) ? 12
         : (s[13] == 0) ? 13
         : (s[14] == 0) ? 14
         : (s[15] == 0) ? 15
         : (s[16] == 0) ? 16
         : (s[17] == 0) ? 17
         : (s[18] == 0) ? 18
         : (s[19] == 0) ? 19
         : (s[20] == 0) ? 20
         : (s[21] == 0) ? 21
         : (s[22] == 0) ? 22
         : (s[23] == 0) ? 23
         : (s[24] == 0) ? 24
         : (s[25] == 0) ? 25
         : (s[26] == 0) ? 26
         : (s[27] == 0) ? 27
         : (s[28] == 0) ? 28
         : (s[29] == 0) ? 29
         : (s[30] == 0) ? 30
         : (s[31] == 0) ? 31
                        : 32;
}

constexpr int __tex_hash_map(int c) {
  return (c == 49)    ? 10
         : (c == 50)  ? 0
         : (c == 51)  ? 100
         : (c == 52)  ? 30
         : (c == 67)  ? 10
         : (c == 68)  ? 0
         : (c == 69)  ? 25
         : (c == 72)  ? 70
         : (c == 77)  ? 0
         : (c == 96)  ? 44
         : (c == 99)  ? 10
         : (c == 100) ? 5
         : (c == 101) ? 60
         : (c == 102) ? 40
         : (c == 103) ? 70
         : (c == 104) ? 25
         : (c == 112) ? 0
         : (c == 114) ? 45
         : (c == 117) ? 5
         : (c == 118) ? 85
         : (c == 120) ? 20
                      : 225;
}

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
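// Worked example (illustrative arithmetic): for the op string "__tex2D_v2",
// __tex_len() is 10; str[7] + 1 == '_' + 1 == 96 maps to 44; str[6] == 'D',
// str[5] == '2', and the last character '2' all map to 0. The hash is
// therefore 10 + 44 == 54, so the op is identified by the tag type __Tag<54>
// below.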

// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant)                                                 \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>

// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4.
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
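// E.g. (illustrative): for uchar4, __base_t is decltype(uchar4::x), i.e.
// unsigned char, and __fetch_t resolves to uint4 via the
// __TypeInfoT<unsigned char> specialization above.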

// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
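// E.g. __L((float __x, float __y)) expands to `float __x, float __y`: the
// extra parens protect the commas during argument collection, and __Args then
// strips them off.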

// Constructs inline assembly output args.
// Results are stored in the temp var __r; the sparse fetches' isResident bool
// is pointed to by __ir.
// Asm args for return values. It's a 4-element vector.
#define __ASM_OUT(__t)                                                         \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
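// E.g. __ASM_OUT("r") expands (after string concatenation) to:
//   ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w))
// and __ASM_OUTP("r") appends "=h"(*__ir) for the residency flag.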

// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
  template <>                                                                  \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
    __rt __r;                                                                  \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
    return __r;                                                                \
  }
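// Sketch of what one __IMPL_F1 instantiation produces (illustrative; this is
// the int4 variant of "__tex1D_v2" composed by the macros below):
//   template <>
//   __device__ int4 __run<int4>(cudaTextureObject_t __obj, float __x) {
//     int4 __r;
//     asm("tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
//         : "=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w)
//         : "l"(__obj), "f"(__x));
//     return __r;
//   }
// i.e. a member specialization of the __run template declared inside
// __tex_fetch_v4 by the __IMPL_S3I-family macros below.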

// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"),       \
            __asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to the above, but returns a boolean 'isPresent' value in
// addition to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"),      \
            __asm_args)

// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, uint4, __args,                                             \
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"),       \
            __asm_args)

// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
  }

// Same, but for sparse ops. Only available on sm_60+.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }

// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
                  __asm_args)                                                  \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros that convert the string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
                 __asm_args)                                                   \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture operations that duplicate
// functionality, so we just inherit the implementation instead of redefining
// a copy.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
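// Illustrative: for "__tex1Dfetch_v2" the int4 variant above assembles the
// opcode string "tex.1d.v4.s32.s32\t{%0, %1, %2, %3}, [%4, {%5}];" with the
// integer coordinate passed as "r"(__x).
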
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));
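// In the sparse variants the residency flag comes back through the extra "=h"
// output (%4 here): the tex instruction sets predicate %p0, and the trailing
// selp.u16 writes 1 or 0 into *__ir.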

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");

// Umbrella dispatcher -- calls into the specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif

// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

// Helper class for extracting a slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __v);
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
};
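// E.g. (illustrative): __convert<float2, float4>::__run(__v) selects the
// __NElements == 2 specialization (sizeof(float2) / sizeof(float)) and returns
// {__v.x, __v.y}.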

// These are the top-level function overloads the __nv_tex_surf_handler macro
// expands to. Each overload deals with one of the several ways
// __nv_tex_surf_handler is called by CUDA headers. In the end, each of the
// overloads does the same job -- it figures out which `__tex_fetch_v4::__run`
// variant should be used to fetch texture data and which `__convert::__run` is
// needed to convert it into the appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
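// Illustrative end-to-end flow (hypothetical __ret/__obj names): calling
// __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y) with a
// float4 __ret resolves to this overload; __FetchT is float4, so the v4 fetch
// result passes through __convert<float4, float4>::__run unchanged.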

// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t at the C++ level. So,
// we cheat a bit and use inline assembly to do it. It costs us an extra
// register and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType, the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
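// Illustrative (hypothetical names; real call sites live in CUDA's headers):
// a call like
//   __nv_tex_surf_handler("__tex2D_v2", &__type_dummy, &__ret, __tex, __x, __y);
// with texture<float4, 2, cudaReadModeElementType> __tex selects the overload
// above and fetches via a float4 __FetchT.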
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__