//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
#include <cmath>
#endif

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

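// Find the first SGPR that has not been allocated by the calling convention;
// used below to place the scratch wave offset for graphics shaders.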
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // We need to custom lower vector loads and stores from local memory.
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (getSubtarget()->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}

const SISubtarget *SITargetLowering::getSubtarget() const {
  return static_cast<const SISubtarget *>(Subtarget);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

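// Tell the SelectionDAG builder which target intrinsics touch memory so it can
// attach MachineMemOperands to them. amdgcn_atomic_inc and amdgcn_atomic_dec
// both read and write the value at their pointer operand.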
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.vol = false;
    Info.readMem = true;
    Info.writeMem = true;
    return true;
  default:
    return false;
  }
}

bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  // Flat instructions do not have offsets, and only have the register
  // address.
  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // Assume we will use FLAT for all global memory accesses on VI.
      // FIXME: This assumption is currently wrong. On VI we still use
      // MUBUF instructions for the r + i addressing mode. As currently
      // implemented, the MUBUF instructions only work on buffers < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions
      // by setting the stride value in the resource descriptor, which would
      // increase the size limit to (stride * 4GB). However, this is risky
      // because it has never been validated.
      return isLegalFlatAddressingMode(AM);
    }

    return isLegalMUBUFAddressingMode(AM);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalMUBUFAddressingMode(AM);

    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);

  default:
    llvm_unreachable("unhandled address space");
  }
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword value must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

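// Pick wide vector types for lowering memcpy / memset when the destination is
// sufficiently aligned, instead of the default pointer-sized type.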
EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}
static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

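// Build a pointer into the kernel argument segment: the kernarg segment
// pointer plus a byte offset.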
SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
                                            const SDLoc &SL, SDValue Chain,
                                            unsigned Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  unsigned InputPtrReg = TRI->getPreloadedValue(MF,
      SIRegisterInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                     DAG.getConstant(Offset, SL, PtrVT));
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout &DL = DAG.getDataLayout();
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  unsigned Align = DL.getABITypeAlignment(Ty);

  ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (MemVT.isFloatingPoint())
    ExtTy = ISD::EXTLOAD;

  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
  return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
                     PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal |
                     MachineMemOperand::MOInvariant);
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    const Function *Fn = MF.getFunction();
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal() && PSInputNum <= 15) {

      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
        // We can safely skip PS inputs
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg.Used)
        Info->PSInputEna |= 1 << PSInputNum;

      ++PSInputNum;
    }

    if (AMDGPU::isShader(CallConv)) {
      // Second, split vertices into their elements.
      if (Arg.VT.isVector()) {
        ISD::InputArg NewArg = Arg;
        NewArg.Flags.setSplit();
        NewArg.VT = Arg.VT.getVectorElementType();

        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
        // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();

        for (unsigned j = 0; j != NumElements; ++j) {
          Splits.push_back(NewArg);
          NewArg.PartOffset += NewArg.VT.getStoreSize();
        }
      } else {
        Splits.push_back(Arg);
      }
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  //
  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
  // PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CallConv == CallingConv::AMDGPU_PS &&
      ((Info->getPSInputAddr() & 0x7F) == 0 ||
       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
    Info->markPSInputAllocated(0);
    Info->PSInputEna |= 1;
  }

  if (!AMDGPU::isShader(CallConv)) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);

    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info->hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info->hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info->hasQueuePtr()) {
    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info->hasDispatchID()) {
    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info->hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  SmallVector<SDValue, 16> Chains;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                              VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                   Offset, Ins[i].Flags.isSExt());
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Copy);
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {
      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);

        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
        Regs.push_back(Copy);
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.

  // Start adding system SGPRs.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (AMDGPU::isShader(CallConv)) {
      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    } else
      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info->setHasNonSpillStackObjects(true);

  if (ST.isAmdHsaOS()) {
    // TODO: Assume we will spill without optimizations.
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the HSA ABI, this will be the first 4 user SGPR
      // inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}

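// Resolve the register names accepted by llvm.read_register /
// llvm.write_register, rejecting names and types that the current subtarget
// does not support.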
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
  return SplitBB;
}

Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001110// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
1111// wavefront. If the value is uniform and just happens to be in a VGPR, this
1112// will only do one iteration. In the worst case, this will loop 64 times.
1113//
1114// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
1115static void emitLoadM0FromVGPRLoop(const SIInstrInfo *TII,
1116 MachineRegisterInfo &MRI,
1117 MachineBasicBlock &OrigBB,
1118 MachineBasicBlock &LoopBB,
1119 const DebugLoc &DL,
1120 MachineInstr *MovRel,
1121 const MachineOperand &IdxReg,
1122 unsigned InitReg,
1123 unsigned ResultReg,
1124 unsigned PhiReg,
1125 unsigned InitSaveExecReg,
1126 int Offset) {
1127 MachineBasicBlock::iterator I = LoopBB.begin();
1128
1129 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1130 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1131 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1132 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1133
1134 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
1135 .addReg(InitReg)
1136 .addMBB(&OrigBB)
1137 .addReg(ResultReg)
1138 .addMBB(&LoopBB);
1139
1140 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
1141 .addReg(InitSaveExecReg)
1142 .addMBB(&OrigBB)
1143 .addReg(NewExec)
1144 .addMBB(&LoopBB);
1145
1146 // Read the next variant <- also loop target.
1147 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
1148 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
1149
1150 // Compare the just read M0 value to all possible Idx values.
1151 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
1152 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00001153 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001154
 1155  // Move the index into M0, adding the offset if there is one.
1156 if (Offset == 0) {
1157 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1158 .addReg(CurrentIdxReg, RegState::Kill);
1159 } else {
1160 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
1161 .addReg(CurrentIdxReg, RegState::Kill)
1162 .addImm(Offset);
1163 }
1164
 1165  // Update EXEC, saving the original EXEC value.
1166 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
1167 .addReg(CondReg, RegState::Kill);
1168
1169 MRI.setSimpleHint(NewExec, CondReg);
1170
1171 // Do the actual move.
1172 LoopBB.insert(I, MovRel);
1173
1174 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1175 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
1176 .addReg(AMDGPU::EXEC)
1177 .addReg(NewExec);
1178
1179 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1180 // s_cbranch_scc0?
1181
1182 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1183 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
1184 .addMBB(&LoopBB);
1185}
1186
 1187// This has slightly sub-optimal register allocation when the source vector is
 1188// killed by the read. The register allocator does not understand that the kill
 1189// is per-workitem, so the vector is kept alive for the whole loop and we end up
 1190// not re-using a subregister from it, using one more VGPR than necessary. That
 1191// extra VGPR was avoided when this expansion was done after register allocation.
1192static MachineBasicBlock *loadM0FromVGPR(const SIInstrInfo *TII,
1193 MachineBasicBlock &MBB,
1194 MachineInstr &MI,
1195 MachineInstr *MovRel,
1196 unsigned InitResultReg,
1197 unsigned PhiReg,
1198 int Offset) {
1199 MachineFunction *MF = MBB.getParent();
1200 MachineRegisterInfo &MRI = MF->getRegInfo();
1201 const DebugLoc &DL = MI.getDebugLoc();
1202 MachineBasicBlock::iterator I(&MI);
1203
1204 unsigned DstReg = MI.getOperand(0).getReg();
1205 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1206 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1207
1208 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
1209
1210 // Save the EXEC mask
1211 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
1212 .addReg(AMDGPU::EXEC);
1213
1214 // To insert the loop we need to split the block. Move everything after this
1215 // point to a new block, and insert a new empty block between the two.
1216 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
1217 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
1218 MachineFunction::iterator MBBI(MBB);
1219 ++MBBI;
1220
1221 MF->insert(MBBI, LoopBB);
1222 MF->insert(MBBI, RemainderBB);
1223
1224 LoopBB->addSuccessor(LoopBB);
1225 LoopBB->addSuccessor(RemainderBB);
1226
1227 // Move the rest of the block into a new block.
Matt Arsenaultd40ded62016-07-22 17:01:15 +00001228 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001229 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
1230
1231 MBB.addSuccessor(LoopBB);
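  // The control flow now looks like:
  //
  //   MBB -> LoopBB -> RemainderBB
  //            ^   |
  //            +---+    (LoopBB branches back to itself while lanes remain)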
1232
1233 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1234
1235 emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, MovRel, *Idx,
1236 InitResultReg, DstReg, PhiReg, TmpExec, Offset);
1237
1238 MachineBasicBlock::iterator First = RemainderBB->begin();
1239 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
1240 .addReg(SaveExec);
1241
1242 MI.eraseFromParent();
1243
1244 return RemainderBB;
1245}
1246
1247// Returns subreg index, offset
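//
// For example, with a 128-bit (4 x 32-bit) source register, Offset = 2 yields
// (sub2, 0), while an out-of-bounds Offset such as 7 is passed through as
// (sub0, 7) rather than being folded into a nonexistent subregister.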
1248static std::pair<unsigned, int>
1249computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
1250 const TargetRegisterClass *SuperRC,
1251 unsigned VecReg,
1252 int Offset) {
1253 int NumElts = SuperRC->getSize() / 4;
1254
1255 // Skip out of bounds offsets, or else we would end up using an undefined
1256 // register.
1257 if (Offset >= NumElts || Offset < 0)
1258 return std::make_pair(AMDGPU::sub0, Offset);
1259
1260 return std::make_pair(AMDGPU::sub0 + Offset, 0);
1261}
1262
1263// Return true if the index is an SGPR and was set.
1264static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
1265 MachineRegisterInfo &MRI,
1266 MachineInstr &MI,
1267 int Offset) {
1268 MachineBasicBlock *MBB = MI.getParent();
1269 const DebugLoc &DL = MI.getDebugLoc();
1270 MachineBasicBlock::iterator I(&MI);
1271
1272 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1273 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
1274
1275 assert(Idx->getReg() != AMDGPU::NoRegister);
1276
1277 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
1278 return false;
1279
1280 if (Offset == 0) {
1281 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1282 .addOperand(*Idx);
1283 } else {
1284 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
1285 .addOperand(*Idx)
1286 .addImm(Offset);
1287 }
1288
1289 return true;
1290}
1291
1292// Control flow needs to be inserted if indexing with a VGPR.
1293static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
1294 MachineBasicBlock &MBB,
1295 const SIInstrInfo *TII) {
1296 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1297 MachineFunction *MF = MBB.getParent();
1298 MachineRegisterInfo &MRI = MF->getRegInfo();
1299
1300 unsigned Dst = MI.getOperand(0).getReg();
1301 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
1302 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
1303
1304 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
1305
1306 unsigned SubReg;
1307 std::tie(SubReg, Offset)
1308 = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
1309
1310 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) {
1311 MachineBasicBlock::iterator I(&MI);
1312 const DebugLoc &DL = MI.getDebugLoc();
1313
1314 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
1315 .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
1316 .addReg(SrcVec->getReg(), RegState::Implicit);
1317 MI.eraseFromParent();
1318
1319 return &MBB;
1320 }
1321
1322 const DebugLoc &DL = MI.getDebugLoc();
1323 MachineBasicBlock::iterator I(&MI);
1324
1325 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1326 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1327
1328 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
1329
1330 MachineInstr *MovRel =
1331 BuildMI(*MF, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
1332 .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
1333 .addReg(SrcVec->getReg(), RegState::Implicit);
1334
1335 return loadM0FromVGPR(TII, MBB, MI, MovRel, InitReg, PhiReg, Offset);
1336}
1337
1338static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
1339 MachineBasicBlock &MBB,
1340 const SIInstrInfo *TII) {
1341 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1342 MachineFunction *MF = MBB.getParent();
1343 MachineRegisterInfo &MRI = MF->getRegInfo();
1344
1345 unsigned Dst = MI.getOperand(0).getReg();
1346 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
1347 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1348 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
1349 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
1350 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
1351
 1352  // Val may eventually be folded into an immediate, but here it must still be a register.
1353 assert(Val->getReg());
1354
1355 unsigned SubReg;
1356 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
1357 SrcVec->getReg(),
1358 Offset);
1359 if (Idx->getReg() == AMDGPU::NoRegister) {
1360 MachineBasicBlock::iterator I(&MI);
1361 const DebugLoc &DL = MI.getDebugLoc();
1362
1363 assert(Offset == 0);
1364
1365 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
1366 .addOperand(*SrcVec)
1367 .addOperand(*Val)
1368 .addImm(SubReg);
1369
1370 MI.eraseFromParent();
1371 return &MBB;
1372 }
1373
1374 const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
1375 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) {
1376 MachineBasicBlock::iterator I(&MI);
1377 const DebugLoc &DL = MI.getDebugLoc();
1378
1379 MachineInstr *MovRel =
1380 BuildMI(MBB, I, DL, MovRelDesc)
1381 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
1382 .addOperand(*Val)
1383 .addReg(Dst, RegState::ImplicitDefine)
1384 .addReg(SrcVec->getReg(), RegState::Implicit);
1385
1386 const int ImpDefIdx = MovRelDesc.getNumOperands() +
1387 MovRelDesc.getNumImplicitUses();
1388 const int ImpUseIdx = ImpDefIdx + 1;
1389
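    // Tie the implicit def of the full destination vector to its implicit use
    // (the source vector): v_movreld rewrites a single lane in place, so the
    // register allocator must assign both to the same register.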
1390 MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1391 MI.eraseFromParent();
1392 return &MBB;
1393 }
1394
1395 if (Val->isReg())
1396 MRI.clearKillFlags(Val->getReg());
1397
1398 const DebugLoc &DL = MI.getDebugLoc();
1399 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
1400
1401 // vdst is not actually read and just provides the base register index.
1402 MachineInstr *MovRel =
1403 BuildMI(*MF, DL, MovRelDesc)
1404 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
1405 .addOperand(*Val)
1406 .addReg(Dst, RegState::ImplicitDefine)
1407 .addReg(PhiReg, RegState::Implicit);
1408
1409 const int ImpDefIdx = MovRelDesc.getNumOperands() +
1410 MovRelDesc.getNumImplicitUses();
1411 const int ImpUseIdx = ImpDefIdx + 1;
1412
1413 MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1414
1415 return loadM0FromVGPR(TII, MBB, MI, MovRel,
1416 SrcVec->getReg(), PhiReg, Offset);
1417}
1418
Matt Arsenault786724a2016-07-12 21:41:32 +00001419MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
1420 MachineInstr &MI, MachineBasicBlock *BB) const {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00001421 switch (MI.getOpcode()) {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00001422 case AMDGPU::SI_INIT_M0: {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001423 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00001424 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00001425 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001426 .addOperand(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00001427 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00001428 return BB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001429 }
Changpeng Fang01f60622016-03-15 17:28:44 +00001430 case AMDGPU::GET_GROUPSTATICSIZE: {
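    // Materialize the statically allocated LDS size for this kernel as an
    // s_mov_b32 immediate.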
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001431 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1432
Changpeng Fang01f60622016-03-15 17:28:44 +00001433 MachineFunction *MF = BB->getParent();
1434 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00001435 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00001436 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
1437 .addOperand(MI.getOperand(0))
Matt Arsenault52ef4012016-07-26 16:45:58 +00001438 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00001439 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00001440 return BB;
1441 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00001442 case AMDGPU::SI_INDIRECT_SRC_V1:
1443 case AMDGPU::SI_INDIRECT_SRC_V2:
1444 case AMDGPU::SI_INDIRECT_SRC_V4:
1445 case AMDGPU::SI_INDIRECT_SRC_V8:
1446 case AMDGPU::SI_INDIRECT_SRC_V16:
1447 return emitIndirectSrc(MI, *BB, getSubtarget()->getInstrInfo());
1448 case AMDGPU::SI_INDIRECT_DST_V1:
1449 case AMDGPU::SI_INDIRECT_DST_V2:
1450 case AMDGPU::SI_INDIRECT_DST_V4:
1451 case AMDGPU::SI_INDIRECT_DST_V8:
1452 case AMDGPU::SI_INDIRECT_DST_V16:
1453 return emitIndirectDst(MI, *BB, getSubtarget()->getInstrInfo());
Matt Arsenault786724a2016-07-12 21:41:32 +00001454 case AMDGPU::SI_KILL:
1455 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00001456 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
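    // Expand the pseudo into two 32-bit V_CNDMASK_B32 selects on the sub0 and
    // sub1 halves and recombine the results with a REG_SEQUENCE.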
1457 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
1458 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1459
1460 unsigned Dst = MI.getOperand(0).getReg();
1461 unsigned Src0 = MI.getOperand(1).getReg();
1462 unsigned Src1 = MI.getOperand(2).getReg();
1463 const DebugLoc &DL = MI.getDebugLoc();
1464 unsigned SrcCond = MI.getOperand(3).getReg();
1465
1466 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1467 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1468
1469 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
1470 .addReg(Src0, 0, AMDGPU::sub0)
1471 .addReg(Src1, 0, AMDGPU::sub0)
1472 .addReg(SrcCond);
1473 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
1474 .addReg(Src0, 0, AMDGPU::sub1)
1475 .addReg(Src1, 0, AMDGPU::sub1)
1476 .addReg(SrcCond);
1477
1478 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
1479 .addReg(DstLo)
1480 .addImm(AMDGPU::sub0)
1481 .addReg(DstHi)
1482 .addImm(AMDGPU::sub1);
1483 MI.eraseFromParent();
1484 return BB;
1485 }
Changpeng Fang01f60622016-03-15 17:28:44 +00001486 default:
1487 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00001488 }
Tom Stellard75aadc22012-12-11 21:25:42 +00001489}
1490
Matt Arsenault423bf3f2015-01-29 19:34:32 +00001491bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1492 // This currently forces unfolding various combinations of fsub into fma with
1493 // free fneg'd operands. As long as we have fast FMA (controlled by
1494 // isFMAFasterThanFMulAndFAdd), we should perform these.
1495
1496 // When fma is quarter rate, for f64 where add / sub are at best half rate,
1497 // most of these combines appear to be cycle neutral but save on instruction
1498 // count / code size.
1499 return true;
1500}
1501
Mehdi Amini44ede332015-07-09 02:09:04 +00001502EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
1503 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00001504 if (!VT.isVector()) {
1505 return MVT::i1;
1506 }
Matt Arsenault8596f712014-11-28 22:51:38 +00001507 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00001508}
1509
Mehdi Aminieaabc512015-07-09 15:12:23 +00001510MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
Christian Konig082a14a2013-03-18 11:34:05 +00001511 return MVT::i32;
1512}
1513
Matt Arsenault423bf3f2015-01-29 19:34:32 +00001514// Answering this is somewhat tricky and depends on the specific device, since
 1515// different devices have different rates for fma and for f64 operations in general.
1516//
1517// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1518// regardless of which device (although the number of cycles differs between
1519// devices), so it is always profitable for f64.
1520//
1521// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1522// only on full rate devices. Normally, we should prefer selecting v_mad_f32
1523// which we can always do even without fused FP ops since it returns the same
1524// result as the separate operations and since it is always full
1525// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1526// however does not support denormals, so we do report fma as faster if we have
1527// a fast fma device and require denormals.
1528//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00001529bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
1530 VT = VT.getScalarType();
1531
1532 if (!VT.isSimple())
1533 return false;
1534
1535 switch (VT.getSimpleVT().SimpleTy) {
1536 case MVT::f32:
Matt Arsenault423bf3f2015-01-29 19:34:32 +00001537    // This is as fast on some subtargets. However, we always have full rate f32
 1538    // mad available, which returns the same result as the separate operations
Matt Arsenault8d630032015-02-20 22:10:41 +00001539    // and which we should prefer over fma. We can't use mad if we want to
 1540    // support denormals, so only report fma as faster in that case.
1541 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00001542 case MVT::f64:
1543 return true;
1544 default:
1545 break;
1546 }
1547
1548 return false;
1549}
1550
Tom Stellard75aadc22012-12-11 21:25:42 +00001551//===----------------------------------------------------------------------===//
1552// Custom DAG Lowering Operations
1553//===----------------------------------------------------------------------===//
1554
1555SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1556 switch (Op.getOpcode()) {
1557 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardb02094e2014-07-21 15:45:01 +00001558 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00001559 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00001560 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00001561 SDValue Result = LowerLOAD(Op, DAG);
1562 assert((!Result.getNode() ||
1563 Result.getNode()->getNumValues() == 2) &&
1564 "Load should return a value and a chain");
1565 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00001566 }
Tom Stellardaf775432013-10-23 00:44:32 +00001567
Matt Arsenaultad14ce82014-07-19 18:44:39 +00001568 case ISD::FSIN:
1569 case ISD::FCOS:
1570 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00001571 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00001572 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00001573 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00001574 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001575 case ISD::GlobalAddress: {
1576 MachineFunction &MF = DAG.getMachineFunction();
1577 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1578 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00001579 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001580 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00001581 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001582 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00001583 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault0bb294b2016-06-17 22:27:03 +00001584 case ISD::TRAP: return lowerTRAP(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00001585 }
1586 return SDValue();
1587}
1588
Tom Stellardf8794352012-12-19 22:10:31 +00001589/// \brief Helper function for LowerBRCOND
1590static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00001591
Tom Stellardf8794352012-12-19 22:10:31 +00001592 SDNode *Parent = Value.getNode();
1593 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
1594 I != E; ++I) {
1595
1596 if (I.getUse().get() != Value)
1597 continue;
1598
1599 if (I->getOpcode() == Opcode)
1600 return *I;
1601 }
Craig Topper062a2ba2014-04-25 05:30:21 +00001602 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00001603}
1604
Tom Stellardb02094e2014-07-21 15:45:01 +00001605SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
1606
Tom Stellardc98ee202015-07-16 19:40:07 +00001607 SDLoc SL(Op);
Tom Stellardb02094e2014-07-21 15:45:01 +00001608 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
1609 unsigned FrameIndex = FINode->getIndex();
1610
Matt Arsenault3a619852016-02-27 20:26:57 +00001611 // A FrameIndex node represents a 32-bit offset into scratch memory. If the
1612 // high bit of a frame index offset were to be set, this would mean that it
1613 // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
1614 // buffer, with 64 being the number of threads per wave.
Tom Stellardc98ee202015-07-16 19:40:07 +00001615 //
Matt Arsenault3a619852016-02-27 20:26:57 +00001616 // The maximum private allocation for the entire GPU is 4G, and we are
1617 // concerned with the largest the index could ever be for an individual
 1618  // workitem. This will occur with the minimum dispatch size. If a program
1619 // requires more, the dispatch size will be reduced.
1620 //
1621 // With this limit, we can mark the high bit of the FrameIndex node as known
1622 // zero, which is important, because it means in most situations we can prove
1623 // that values derived from FrameIndex nodes are non-negative. This enables us
1624 // to take advantage of more addressing modes when accessing scratch buffers,
1625 // since for scratch reads/writes, the register offset must always be
1626 // positive.
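  //
  // Concretely, with MaxGPUAlloc = 2^32 and MinGranularity = 32, KnownBits is
  // Log2_64(2^32 / 32) = 27, so the AssertZext below advertises that the top
  // five bits of the 32-bit frame index are zero.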
1627
1628 uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
1629
1630 // XXX - It is unclear if partial dispatch works. Assume it works at half wave
1631 // granularity. It is probably a full wave.
1632 uint64_t MinGranularity = 32;
1633
1634 unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
1635 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
Tom Stellardc98ee202015-07-16 19:40:07 +00001636
1637 SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
Tom Stellardc98ee202015-07-16 19:40:07 +00001638 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
Matt Arsenault3a619852016-02-27 20:26:57 +00001639 DAG.getValueType(ExtVT));
Tom Stellardb02094e2014-07-21 15:45:01 +00001640}
1641
Tom Stellardbc4497b2016-02-12 23:45:29 +00001642bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault16f48d72016-02-13 00:36:10 +00001643 if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
Tom Stellardbc4497b2016-02-12 23:45:29 +00001644 return false;
1645
1646 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
1647 default: return false;
1648 case AMDGPUIntrinsic::amdgcn_if:
1649 case AMDGPUIntrinsic::amdgcn_else:
Matt Arsenault48d70cb2016-07-09 17:18:39 +00001650 case AMDGPUIntrinsic::amdgcn_break:
Tom Stellardbc4497b2016-02-12 23:45:29 +00001651 case AMDGPUIntrinsic::amdgcn_if_break:
1652 case AMDGPUIntrinsic::amdgcn_else_break:
1653 case AMDGPUIntrinsic::amdgcn_loop:
1654 case AMDGPUIntrinsic::amdgcn_end_cf:
1655 return true;
1656 }
1657}
1658
Konstantin Zhuravlyovf2f3d142016-06-25 03:11:28 +00001659void SITargetLowering::createDebuggerPrologueStackObjects(
1660 MachineFunction &MF) const {
1661 // Create stack objects that are used for emitting debugger prologue.
1662 //
1663 // Debugger prologue writes work group IDs and work item IDs to scratch memory
1664 // at fixed location in the following format:
1665 // offset 0: work group ID x
1666 // offset 4: work group ID y
1667 // offset 8: work group ID z
1668 // offset 16: work item ID x
1669 // offset 20: work item ID y
1670 // offset 24: work item ID z
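  // Note that the work item IDs start at offset 16, so offset 12 is left
  // unused.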
1671 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1672 int ObjectIdx = 0;
1673
1674 // For each dimension:
1675 for (unsigned i = 0; i < 3; ++i) {
1676 // Create fixed stack object for work group ID.
Matthias Braun941a7052016-07-28 18:40:00 +00001677 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
Konstantin Zhuravlyovf2f3d142016-06-25 03:11:28 +00001678 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
1679 // Create fixed stack object for work item ID.
Matthias Braun941a7052016-07-28 18:40:00 +00001680 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
Konstantin Zhuravlyovf2f3d142016-06-25 03:11:28 +00001681 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
1682 }
1683}
1684
Tom Stellardf8794352012-12-19 22:10:31 +00001685/// This transforms the control flow intrinsics to get the branch destination as
 1686/// their last parameter, and also switches the branch target with BR if the need arises.
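///
/// Schematically (names are illustrative), a pattern like
///   %c = llvm.amdgcn.if(...) ; brcond %c, label %target
/// is rewritten so that the intrinsic node itself carries %target as its final
/// operand and the explicit conditional branch is folded away.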
1687SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
1688 SelectionDAG &DAG) const {
1689
Andrew Trickef9de2a2013-05-25 02:42:55 +00001690 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00001691
1692 SDNode *Intr = BRCOND.getOperand(1).getNode();
1693 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00001694 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00001695 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00001696
1697 if (Intr->getOpcode() == ISD::SETCC) {
1698 // As long as we negate the condition everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00001699 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00001700 Intr = SetCC->getOperand(0).getNode();
1701
1702 } else {
1703 // Get the target from BR if we don't negate the condition
1704 BR = findUser(BRCOND, ISD::BR);
1705 Target = BR->getOperand(1);
1706 }
1707
Nicolai Haehnleffbd56a2016-05-05 17:36:36 +00001708 if (!isCFIntrinsic(Intr)) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00001709 // This is a uniform branch so we don't need to legalize.
1710 return BRCOND;
1711 }
1712
1713 assert(!SetCC ||
1714 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00001715 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
1716 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00001717
 1718  // Build the result type list for the new intrinsic node, dropping the
  // first result (the branch condition).
Benjamin Kramer6cd780f2015-02-17 15:29:18 +00001719 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
Tom Stellardf8794352012-12-19 22:10:31 +00001720
1721 // operands of the new intrinsic call
1722 SmallVector<SDValue, 4> Ops;
1723 Ops.push_back(BRCOND.getOperand(0));
Benjamin Kramer6cd780f2015-02-17 15:29:18 +00001724 Ops.append(Intr->op_begin() + 1, Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00001725 Ops.push_back(Target);
1726
1727 // build the new intrinsic call
1728 SDNode *Result = DAG.getNode(
1729 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
Craig Topper48d114b2014-04-26 18:35:24 +00001730 DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00001731
1732 if (BR) {
1733 // Give the branch instruction our target
1734 SDValue Ops[] = {
1735 BR->getOperand(0),
1736 BRCOND.getOperand(2)
1737 };
Chandler Carruth356665a2014-08-01 22:09:43 +00001738 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
1739 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
1740 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00001741 }
1742
1743 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
1744
1745 // Copy the intrinsic results to registers
1746 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
1747 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
1748 if (!CopyToReg)
1749 continue;
1750
1751 Chain = DAG.getCopyToReg(
1752 Chain, DL,
1753 CopyToReg->getOperand(1),
1754 SDValue(Result, i - 1),
1755 SDValue());
1756
1757 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
1758 }
1759
1760 // Remove the old intrinsic from the chain
1761 DAG.ReplaceAllUsesOfValueWith(
1762 SDValue(Intr, Intr->getNumValues() - 1),
1763 Intr->getOperand(0));
1764
1765 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00001766}
1767
Matt Arsenault99c14522016-04-25 19:27:24 +00001768SDValue SITargetLowering::getSegmentAperture(unsigned AS,
1769 SelectionDAG &DAG) const {
1770 SDLoc SL;
1771 MachineFunction &MF = DAG.getMachineFunction();
1772 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00001773 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
1774 assert(UserSGPR != AMDGPU::NoRegister);
1775
Matt Arsenault99c14522016-04-25 19:27:24 +00001776 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00001777 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00001778
1779 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1780 // private_segment_aperture_base_hi.
1781 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1782
1783 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
1784 DAG.getConstant(StructOffset, SL, MVT::i64));
1785
1786 // TODO: Use custom target PseudoSourceValue.
 1787  // TODO: We should use the value from the IR intrinsic call, but it might not
 1788  // be available here, and it is not clear how to get it.
1789 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
1790 AMDGPUAS::CONSTANT_ADDRESS));
1791
1792 MachinePointerInfo PtrInfo(V, StructOffset);
Justin Lebar9c375812016-07-15 18:27:10 +00001793 return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
1794 MinAlign(64, StructOffset),
1795 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00001796}
1797
1798SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
1799 SelectionDAG &DAG) const {
1800 SDLoc SL(Op);
1801 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
1802
1803 SDValue Src = ASC->getOperand(0);
1804
1805 // FIXME: Really support non-0 null pointers.
1806 SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
1807 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
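  // Both directions lower to a compare against the null value plus a select.
  // For instance, local/private -> flat is roughly:
  //   flat = (src == -1) ? 0 : ((aperture_hi << 32) | zext(src))
  // where aperture_hi comes from getSegmentAperture() above.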
1808
1809 // flat -> local/private
1810 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1811 if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1812 ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1813 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
1814 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
1815
1816 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
1817 NonNull, Ptr, SegmentNullPtr);
1818 }
1819 }
1820
1821 // local/private -> flat
1822 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1823 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1824 ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1825 SDValue NonNull
1826 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
1827
1828 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
1829 SDValue CvtPtr
1830 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
1831
1832 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
1833 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
1834 FlatNullPtr);
1835 }
1836 }
1837
1838 // global <-> flat are no-ops and never emitted.
1839
1840 const MachineFunction &MF = DAG.getMachineFunction();
1841 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1842 *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
1843 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
1844
1845 return DAG.getUNDEF(ASC->getValueType(0));
1846}
1847
Tom Stellard418beb72016-07-13 14:23:33 +00001848static bool shouldEmitGOTReloc(const GlobalValue *GV,
1849 const TargetMachine &TM) {
1850 return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1851 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
Tom Stellardb164a982016-06-25 01:59:16 +00001852}
1853
Tom Stellard418beb72016-07-13 14:23:33 +00001854bool
1855SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
1856 // We can fold offsets for anything that doesn't require a GOT relocation.
1857 return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1858 !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
1859}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00001860
Tom Stellard418beb72016-07-13 14:23:33 +00001861static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
1862 SDLoc DL, unsigned Offset, EVT PtrVT,
1863 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00001864 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
1865 // lowered to the following code sequence:
1866 // s_getpc_b64 s[0:1]
1867 // s_add_u32 s0, s0, $symbol
1868 // s_addc_u32 s1, s1, 0
1869 //
1870 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1871 // a fixup or relocation is emitted to replace $symbol with a literal
1872 // constant, which is a pc-relative offset from the encoding of the $symbol
1873 // operand to the global variable.
1874 //
1875 // What we want here is an offset from the value returned by s_getpc
1876 // (which is the address of the s_add_u32 instruction) to the global
1877 // variable, but since the encoding of $symbol starts 4 bytes after the start
1878 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1879 // small. This requires us to add 4 to the global variable offset in order to
1880 // compute the correct address.
Tom Stellard418beb72016-07-13 14:23:33 +00001881 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
1882 GAFlags);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00001883 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
1884}
1885
Tom Stellard418beb72016-07-13 14:23:33 +00001886SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1887 SDValue Op,
1888 SelectionDAG &DAG) const {
1889 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
1890
1891 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
1892 GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
1893 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
1894
1895 SDLoc DL(GSD);
1896 const GlobalValue *GV = GSD->getGlobal();
1897 EVT PtrVT = Op.getValueType();
1898
1899 if (!shouldEmitGOTReloc(GV, getTargetMachine()))
1900 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
1901
1902 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
1903 SIInstrInfo::MO_GOTPCREL);
1904
1905 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
1906 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1907 const DataLayout &DataLayout = DAG.getDataLayout();
1908 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
1909 // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
1910 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1911
Justin Lebar9c375812016-07-15 18:27:10 +00001912 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
1913 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00001914}
1915
Matt Arsenault0bb294b2016-06-17 22:27:03 +00001916SDValue SITargetLowering::lowerTRAP(SDValue Op,
1917 SelectionDAG &DAG) const {
1918 const MachineFunction &MF = DAG.getMachineFunction();
1919 DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
1920 "trap handler not supported",
1921 Op.getDebugLoc(),
1922 DS_Warning);
1923 DAG.getContext()->diagnose(NoTrap);
1924
1925 // Emit s_endpgm.
1926
1927 // FIXME: This should really be selected to s_trap, but that requires
1928 // setting up the trap handler for it o do anything.
Matt Arsenault9babdf42016-06-22 20:15:28 +00001929  // setting up the trap handler for it to do anything.
1930 Op.getOperand(0));
Matt Arsenault0bb294b2016-06-17 22:27:03 +00001931}
1932
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001933SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
1934 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00001935 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
1936 // the destination register.
1937 //
Tom Stellardfc92e772015-05-12 14:18:14 +00001938 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
1939 // so we will end up with redundant moves to m0.
1940 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00001941 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
1942
1943 // A Null SDValue creates a glue result.
1944 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
1945 V, Chain);
1946 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00001947}
1948
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00001949SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
1950 SDValue Op,
1951 MVT VT,
1952 unsigned Offset) const {
1953 SDLoc SL(Op);
1954 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
1955 DAG.getEntryNode(), Offset, false);
1956 // The local size values will have the hi 16-bits as zero.
1957 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
1958 DAG.getValueType(VT));
1959}
1960
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00001961static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
Matt Arsenaulte0132462016-01-30 05:19:45 +00001962 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00001963 "non-hsa intrinsic with hsa target",
1964 DL.getDebugLoc());
1965 DAG.getContext()->diagnose(BadIntrin);
1966 return DAG.getUNDEF(VT);
1967}
1968
1969static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
1970 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
1971 "intrinsic not supported on subtarget",
1972 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00001973 DAG.getContext()->diagnose(BadIntrin);
1974 return DAG.getUNDEF(VT);
1975}
1976
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001977SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1978 SelectionDAG &DAG) const {
1979 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00001980 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001981 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001982
1983 EVT VT = Op.getValueType();
1984 SDLoc DL(Op);
1985 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1986
Sanjay Patela2607012015-09-16 16:31:21 +00001987 // TODO: Should this propagate fast-math-flags?
1988
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00001989 switch (IntrinsicID) {
Tom Stellard48f29f22015-11-26 00:43:29 +00001990 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00001991 case Intrinsic::amdgcn_queue_ptr: {
Matt Arsenault800fecf2016-01-11 21:18:33 +00001992 if (!Subtarget->isAmdHsaOS()) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001993 DiagnosticInfoUnsupported BadIntrin(
1994 *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
1995 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00001996 DAG.getContext()->diagnose(BadIntrin);
1997 return DAG.getUNDEF(VT);
1998 }
1999
Matt Arsenault48ab5262016-04-25 19:27:18 +00002000 auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
2001 SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
Tom Stellard48f29f22015-11-26 00:43:29 +00002002 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
Matt Arsenault48ab5262016-04-25 19:27:18 +00002003 TRI->getPreloadedValue(MF, Reg), VT);
2004 }
Jan Veselyfea814d2016-06-21 20:46:20 +00002005 case Intrinsic::amdgcn_implicitarg_ptr: {
2006 unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
2007 return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
2008 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00002009 case Intrinsic::amdgcn_kernarg_segment_ptr: {
2010 unsigned Reg
2011 = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
2012 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2013 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00002014 case Intrinsic::amdgcn_dispatch_id: {
2015 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
2016 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2017 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00002018 case Intrinsic::amdgcn_rcp:
2019 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
2020 case Intrinsic::amdgcn_rsq:
Matt Arsenault0c3e2332016-01-26 04:14:16 +00002021 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
Matt Arsenaultf75257a2016-01-23 05:32:20 +00002022 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002023 case Intrinsic::amdgcn_rsq_legacy: {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002024 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002025 return emitRemovedIntrinsicError(DAG, DL, VT);
2026
2027 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
2028 }
Matt Arsenault32fc5272016-07-26 16:45:45 +00002029 case Intrinsic::amdgcn_rcp_legacy: {
2030 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
2031 return emitRemovedIntrinsicError(DAG, DL, VT);
2032 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
2033 }
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00002034 case Intrinsic::amdgcn_rsq_clamp: {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002035 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00002036 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00002037
Matt Arsenaultf75257a2016-01-23 05:32:20 +00002038 Type *Type = VT.getTypeForEVT(*DAG.getContext());
2039 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
2040 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
2041
2042 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
2043 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
2044 DAG.getConstantFP(Max, DL, VT));
2045 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
2046 DAG.getConstantFP(Min, DL, VT));
2047 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002048 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002049 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002050 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002051
Tom Stellardec2e43c2014-09-22 15:35:29 +00002052 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2053 SI::KernelInputOffsets::NGROUPS_X, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002054 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002055 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002056 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002057
Tom Stellardec2e43c2014-09-22 15:35:29 +00002058 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2059 SI::KernelInputOffsets::NGROUPS_Y, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002060 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002061 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002062 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002063
Tom Stellardec2e43c2014-09-22 15:35:29 +00002064 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2065 SI::KernelInputOffsets::NGROUPS_Z, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002066 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002067 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002068 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002069
Tom Stellardec2e43c2014-09-22 15:35:29 +00002070 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2071 SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002072 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002073 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002074 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002075
Tom Stellardec2e43c2014-09-22 15:35:29 +00002076 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2077 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002078 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002079 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002080 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002081
Tom Stellardec2e43c2014-09-22 15:35:29 +00002082 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2083 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002084 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002085 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002086 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002087
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00002088 return lowerImplicitZextParam(DAG, Op, MVT::i16,
2089 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002090 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002091 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002092 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002093
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00002094 return lowerImplicitZextParam(DAG, Op, MVT::i16,
2095 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002096 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00002097 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00002098 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00002099
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00002100 return lowerImplicitZextParam(DAG, Op, MVT::i16,
2101 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00002102 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002103 case Intrinsic::r600_read_tgid_x:
2104 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002105 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
Matt Arsenault43976df2016-01-30 04:25:19 +00002106 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002107 case Intrinsic::r600_read_tgid_y:
2108 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002109 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
Matt Arsenault43976df2016-01-30 04:25:19 +00002110 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002111 case Intrinsic::r600_read_tgid_z:
2112 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002113 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
Matt Arsenault43976df2016-01-30 04:25:19 +00002114 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002115 case Intrinsic::r600_read_tidig_x:
Tom Stellard45c0b3a2015-01-07 20:59:25 +00002116 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002117 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
Matt Arsenault43976df2016-01-30 04:25:19 +00002118 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002119 case Intrinsic::r600_read_tidig_y:
Tom Stellard45c0b3a2015-01-07 20:59:25 +00002120 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002121 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
Matt Arsenault43976df2016-01-30 04:25:19 +00002122 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002123 case Intrinsic::r600_read_tidig_z:
Tom Stellard45c0b3a2015-01-07 20:59:25 +00002124 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
Matt Arsenaultac234b62015-11-30 21:15:57 +00002125 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002126 case AMDGPUIntrinsic::SI_load_const: {
2127 SDValue Ops[] = {
2128 Op.getOperand(1),
2129 Op.getOperand(2)
2130 };
2131
2132 MachineMemOperand *MMO = MF.getMachineMemOperand(
2133 MachinePointerInfo(),
2134 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
2135 VT.getStoreSize(), 4);
2136 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
2137 Op->getVTList(), Ops, VT, MMO);
2138 }
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00002139 case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
2140 return lowerFDIV_FAST(Op, DAG);
2141 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002142 case AMDGPUIntrinsic::SI_vs_load_input:
2143 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
2144 Op.getOperand(1),
2145 Op.getOperand(2),
2146 Op.getOperand(3));
Marek Olsak43650e42015-03-24 13:40:08 +00002147
Tom Stellard2a9d9472015-05-12 15:00:46 +00002148 case AMDGPUIntrinsic::SI_fs_constant: {
2149 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
2150 SDValue Glue = M0.getValue(1);
2151 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
2152 DAG.getConstant(2, DL, MVT::i32), // P0
2153 Op.getOperand(1), Op.getOperand(2), Glue);
2154 }
Marek Olsak6f6d3182015-10-29 15:29:09 +00002155 case AMDGPUIntrinsic::SI_packf16:
2156 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
2157 return DAG.getUNDEF(MVT::i32);
2158 return Op;
Tom Stellard2a9d9472015-05-12 15:00:46 +00002159 case AMDGPUIntrinsic::SI_fs_interp: {
2160 SDValue IJ = Op.getOperand(4);
2161 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
2162 DAG.getConstant(0, DL, MVT::i32));
2163 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
2164 DAG.getConstant(1, DL, MVT::i32));
2165 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
2166 SDValue Glue = M0.getValue(1);
2167 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
2168 DAG.getVTList(MVT::f32, MVT::Glue),
2169 I, Op.getOperand(1), Op.getOperand(2), Glue);
2170 Glue = SDValue(P1.getNode(), 1);
2171 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
2172 Op.getOperand(1), Op.getOperand(2), Glue);
2173 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00002174 case Intrinsic::amdgcn_interp_p1: {
2175 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
2176 SDValue Glue = M0.getValue(1);
2177 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
2178 Op.getOperand(2), Op.getOperand(3), Glue);
2179 }
2180 case Intrinsic::amdgcn_interp_p2: {
2181 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
2182 SDValue Glue = SDValue(M0.getNode(), 1);
2183 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
2184 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
2185 Glue);
2186 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00002187 case Intrinsic::amdgcn_sin:
2188 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
2189
2190 case Intrinsic::amdgcn_cos:
2191 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
2192
2193 case Intrinsic::amdgcn_log_clamp: {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002194 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00002195 return SDValue();
2196
    return emitRemovedIntrinsicError(DAG, DL, VT);
2202 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00002203 case Intrinsic::amdgcn_ldexp:
2204 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
2205 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00002206
2207 case Intrinsic::amdgcn_fract:
2208 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
2209
Matt Arsenaultf75257a2016-01-23 05:32:20 +00002210 case Intrinsic::amdgcn_class:
2211 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
2212 Op.getOperand(1), Op.getOperand(2));
2213 case Intrinsic::amdgcn_div_fmas:
2214 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
2215 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
2216 Op.getOperand(4));
2217
2218 case Intrinsic::amdgcn_div_fixup:
2219 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
2220 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
2221
2222 case Intrinsic::amdgcn_trig_preop:
2223 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
2224 Op.getOperand(1), Op.getOperand(2));
2225 case Intrinsic::amdgcn_div_scale: {
2226 // 3rd parameter required to be a constant.
2227 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
2228 if (!Param)
2229 return DAG.getUNDEF(VT);
2230
2231 // Translate to the operands expected by the machine instruction. The
2232 // first parameter must be the same as the first instruction.
2233 SDValue Numerator = Op.getOperand(1);
2234 SDValue Denominator = Op.getOperand(2);
2235
2236 // Note this order is opposite of the machine instruction's operations,
2237 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
2238 // intrinsic has the numerator as the first operand to match a normal
2239 // division operation.
2240
2241 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
2242
2243 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
2244 Denominator, Numerator);
2245 }
Wei Ding07e03712016-07-28 16:42:13 +00002246 case Intrinsic::amdgcn_icmp: {
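    // Map the immediate condition code onto the corresponding ISD condition
    // code and emit the target SETCC node; unrecognized values fold to undef.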
2247 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
2248 int CondCode = CD->getSExtValue();
2249
2250 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
NAKAMURA Takumi59a20642016-08-22 00:58:04 +00002251 CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
Wei Ding07e03712016-07-28 16:42:13 +00002252 return DAG.getUNDEF(VT);
2253
NAKAMURA Takumi59a20642016-08-22 00:58:04 +00002254 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
Wei Ding07e03712016-07-28 16:42:13 +00002255 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
2256 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
2257 Op.getOperand(2), DAG.getCondCode(CCOpcode));
2258 }
2259 case Intrinsic::amdgcn_fcmp: {
 2260    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    if (!CD) // Guard against a non-constant condition-code operand.
      return DAG.getUNDEF(VT);
 2261    int CondCode = CD->getSExtValue();
2262
2263 if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
NAKAMURA Takumi59a20642016-08-22 00:58:04 +00002264 CondCode >= FCmpInst::Predicate::FCMP_TRUE)
Wei Ding07e03712016-07-28 16:42:13 +00002265 return DAG.getUNDEF(VT);
2266
NAKAMURA Takumi59a20642016-08-22 00:58:04 +00002267 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
Wei Ding07e03712016-07-28 16:42:13 +00002268 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
2269 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
2270 Op.getOperand(2), DAG.getCondCode(CCOpcode));
2271 }
Matt Arsenault32fc5272016-07-26 16:45:45 +00002272 case Intrinsic::amdgcn_fmul_legacy:
2273 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
2274 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00002275 case Intrinsic::amdgcn_sffbh:
2276 case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
2277 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002278 default:
2279 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
2280 }
2281}
2282
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00002283SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
2284 SelectionDAG &DAG) const {
2285 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2286 switch (IntrID) {
2287 case Intrinsic::amdgcn_atomic_inc:
2288 case Intrinsic::amdgcn_atomic_dec: {
2289 MemSDNode *M = cast<MemSDNode>(Op);
2290 unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
2291 AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
2292 SDValue Ops[] = {
2293 M->getOperand(0), // Chain
2294 M->getOperand(2), // Ptr
2295 M->getOperand(3) // Value
2296 };
2297
2298 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
2299 M->getMemoryVT(), M->getMemOperand());
2300 }
2301 default:
2302 return SDValue();
2303 }
2304}
2305
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002306SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
2307 SelectionDAG &DAG) const {
2308 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellardfc92e772015-05-12 14:18:14 +00002309 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002310 SDValue Chain = Op.getOperand(0);
2311 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2312
2313 switch (IntrinsicID) {
Tom Stellardfc92e772015-05-12 14:18:14 +00002314 case AMDGPUIntrinsic::SI_sendmsg: {
2315 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
2316 SDValue Glue = Chain.getValue(1);
2317 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
2318 Op.getOperand(2), Glue);
2319 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002320 case AMDGPUIntrinsic::SI_tbuffer_store: {
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002321 SDValue Ops[] = {
2322 Chain,
2323 Op.getOperand(2),
2324 Op.getOperand(3),
2325 Op.getOperand(4),
2326 Op.getOperand(5),
2327 Op.getOperand(6),
2328 Op.getOperand(7),
2329 Op.getOperand(8),
2330 Op.getOperand(9),
2331 Op.getOperand(10),
2332 Op.getOperand(11),
2333 Op.getOperand(12),
2334 Op.getOperand(13),
2335 Op.getOperand(14)
2336 };
2337
2338 EVT VT = Op.getOperand(3).getValueType();
2339
2340 MachineMemOperand *MMO = MF.getMachineMemOperand(
2341 MachinePointerInfo(),
2342 MachineMemOperand::MOStore,
2343 VT.getStoreSize(), 4);
2344 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
2345 Op->getVTList(), Ops, VT, MMO);
2346 }
Matt Arsenault00568682016-07-13 06:04:22 +00002347 case AMDGPUIntrinsic::AMDGPU_kill: {
Matt Arsenault03006fd2016-07-19 16:27:56 +00002348 SDValue Src = Op.getOperand(2);
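    // The kill is taken when the operand is negative, so a known
    // non-negative constant makes this a no-op and a known negative one
    // becomes an unconditional kill.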
2349 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
Matt Arsenault00568682016-07-13 06:04:22 +00002350 if (!K->isNegative())
2351 return Chain;
Matt Arsenault03006fd2016-07-19 16:27:56 +00002352
2353 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
2354 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
Matt Arsenault00568682016-07-13 06:04:22 +00002355 }
2356
Matt Arsenault03006fd2016-07-19 16:27:56 +00002357 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
2358 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
Matt Arsenault00568682016-07-13 06:04:22 +00002359 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00002360 default:
2361 return SDValue();
2362 }
2363}
2364
Tom Stellard81d871d2013-11-13 23:36:50 +00002365SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2366 SDLoc DL(Op);
2367 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00002368 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00002369 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00002370
Matt Arsenaulta1436412016-02-10 18:21:45 +00002371 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
2372 assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
Matt Arsenault6dfda962016-02-10 18:21:39 +00002373 // FIXME: Copied from PPC
2374 // First, load into 32 bits, then truncate to 1 bit.
2375
2376 SDValue Chain = Load->getChain();
2377 SDValue BasePtr = Load->getBasePtr();
2378 MachineMemOperand *MMO = Load->getMemOperand();
2379
2380 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2381 BasePtr, MVT::i8, MMO);
2382
2383 SDValue Ops[] = {
Matt Arsenaulta1436412016-02-10 18:21:45 +00002384 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
Matt Arsenault6dfda962016-02-10 18:21:39 +00002385 NewLD.getValue(1)
2386 };
2387
2388 return DAG.getMergeValues(Ops, DL);
2389 }
Tom Stellard81d871d2013-11-13 23:36:50 +00002390
Matt Arsenaulta1436412016-02-10 18:21:45 +00002391 if (!MemVT.isVector())
2392 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00002393
Matt Arsenaulta1436412016-02-10 18:21:45 +00002394 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
2395 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00002396
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002397 unsigned AS = Load->getAddressSpace();
2398 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2399 AS, Load->getAlignment())) {
2400 SDValue Ops[2];
2401 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2402 return DAG.getMergeValues(Ops, DL);
2403 }
2404
2405 unsigned NumElements = MemVT.getVectorNumElements();
2406 switch (AS) {
Matt Arsenaulta1436412016-02-10 18:21:45 +00002407 case AMDGPUAS::CONSTANT_ADDRESS:
2408 if (isMemOpUniform(Load))
2409 return SDValue();
2410 // Non-uniform loads will be selected to MUBUF instructions, so they
2411    // have the same legalization requirements as global and private
2412 // loads.
2413 //
Justin Bognerb03fd122016-08-17 05:10:15 +00002414 LLVM_FALLTHROUGH;
Matt Arsenaulta1436412016-02-10 18:21:45 +00002415 case AMDGPUAS::GLOBAL_ADDRESS:
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002416 case AMDGPUAS::FLAT_ADDRESS:
2417 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00002418 return SplitVectorLoad(Op, DAG);
2419 // v4 loads are supported for private and global memory.
2420 return SDValue();
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002421 case AMDGPUAS::PRIVATE_ADDRESS: {
2422 // Depending on the setting of the private_element_size field in the
2423 // resource descriptor, we can only make private accesses up to a certain
2424 // size.
2425 switch (Subtarget->getMaxPrivateElementSize()) {
2426 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00002427 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002428 case 8:
2429 if (NumElements > 2)
2430 return SplitVectorLoad(Op, DAG);
2431 return SDValue();
2432 case 16:
2433 // Same as global/flat
2434 if (NumElements > 4)
2435 return SplitVectorLoad(Op, DAG);
2436 return SDValue();
2437 default:
2438 llvm_unreachable("unsupported private_element_size");
2439 }
2440 }
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002441 case AMDGPUAS::LOCAL_ADDRESS: {
2442 if (NumElements > 2)
2443 return SplitVectorLoad(Op, DAG);
2444
2445 if (NumElements == 2)
2446 return SDValue();
2447
Matt Arsenaulta1436412016-02-10 18:21:45 +00002448    // If properly aligned, we might be able to use ds_read_b64 if we split.
2449 return SplitVectorLoad(Op, DAG);
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002450 }
Matt Arsenaulta1436412016-02-10 18:21:45 +00002451 default:
2452 return SDValue();
Tom Stellarde9373602014-01-22 19:24:14 +00002453 }
Tom Stellard81d871d2013-11-13 23:36:50 +00002454}
2455
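// Split an i64 select into two i32 selects on the low and high halves and
// reassemble the result through a v2i32 bitcast, so each half can use a
// 32-bit conditional move.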
Tom Stellard0ec134f2014-02-04 17:18:40 +00002456SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2457 if (Op.getValueType() != MVT::i64)
2458 return SDValue();
2459
2460 SDLoc DL(Op);
2461 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00002462
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002463 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2464 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00002465
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00002466 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
2467 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
2468
2469 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
2470 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00002471
2472 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
2473
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00002474 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
2475 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00002476
2477 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
2478
Ahmed Bougacha128f8732016-04-26 21:15:30 +00002479 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00002480 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00002481}
2482
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002483// Catch division cases where we can use shortcuts with rcp and rsq
2484// instructions.
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00002485SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
2486 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002487 SDLoc SL(Op);
2488 SDValue LHS = Op.getOperand(0);
2489 SDValue RHS = Op.getOperand(1);
2490 EVT VT = Op.getValueType();
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002491 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002492
2493 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Matt Arsenault979902b2016-08-02 22:25:04 +00002494 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()))) {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002495
Matt Arsenault979902b2016-08-02 22:25:04 +00002496 if (CLHS->isExactlyValue(1.0)) {
2497        // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
2498        // the CI documentation, have a worst case error of 1 ulp.
2499 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
2500 // use it as long as we aren't trying to use denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002501
Matt Arsenault979902b2016-08-02 22:25:04 +00002502 // 1.0 / sqrt(x) -> rsq(x)
2503 //
2504 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
2505 // error seems really high at 2^29 ULP.
2506 if (RHS.getOpcode() == ISD::FSQRT)
2507 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
2508
2509 // 1.0 / x -> rcp(x)
2510 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
2511 }
2512
2513 // Same as for 1.0, but expand the sign out of the constant.
2514 if (CLHS->isExactlyValue(-1.0)) {
2515 // -1.0 / x -> rcp (fneg x)
2516 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2517 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
2518 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002519 }
2520 }
2521
Wei Dinged0f97f2016-06-09 19:17:15 +00002522 const SDNodeFlags *Flags = Op->getFlags();
2523
2524 if (Unsafe || Flags->hasAllowReciprocal()) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002525 // Turn into multiply by the reciprocal.
2526 // x / y -> x * (1.0 / y)
Sanjay Patela2607012015-09-16 16:31:21 +00002527 SDNodeFlags Flags;
2528 Flags.setUnsafeAlgebra(true);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002529 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Sanjay Patela2607012015-09-16 16:31:21 +00002530 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002531 }
2532
2533 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002534}
2535
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00002536// Faster 2.5 ULP division that does not support denormals.
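// A note on the magic numbers below: 0x6f800000 is 2^96 and 0x2f800000 is
// 2^-32. When |den| exceeds 2^96 the denominator is pre-scaled by 2^-32
// before the rcp (keeping the reciprocal out of the denormal range) and the
// same factor is reapplied to the final product, leaving num / den unchanged.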
2537SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
2538 SDLoc SL(Op);
2539 SDValue LHS = Op.getOperand(1);
2540 SDValue RHS = Op.getOperand(2);
2541
2542 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
2543
2544 const APFloat K0Val(BitsToFloat(0x6f800000));
2545 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
2546
2547 const APFloat K1Val(BitsToFloat(0x2f800000));
2548 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
2549
2550 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2551
2552 EVT SetCCVT =
2553 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
2554
2555 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
2556
2557 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
2558
2559 // TODO: Should this propagate fast-math-flags?
2560 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
2561
2562 // rcp does not support denormals.
2563 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
2564
2565 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
2566
2567 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
2568}
2569
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002570SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00002571 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00002572 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00002573
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002574 SDLoc SL(Op);
2575 SDValue LHS = Op.getOperand(0);
2576 SDValue RHS = Op.getOperand(1);
2577
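  // Roughly: div_scale pre-scales the numerator and denominator, rcp gives an
  // initial 1/den estimate, one FMA-based Newton-Raphson step refines it, the
  // quotient is formed and corrected with further FMAs, and div_fmas/div_fixup
  // undo the scaling and handle the special cases.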
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002578 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002579
Wei Dinged0f97f2016-06-09 19:17:15 +00002580 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002581
Wei Dinged0f97f2016-06-09 19:17:15 +00002582 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
2583 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002584
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00002585 // Denominator is scaled to not be denormal, so using rcp is ok.
Wei Dinged0f97f2016-06-09 19:17:15 +00002586 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002587
Wei Dinged0f97f2016-06-09 19:17:15 +00002588 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002589
Wei Dinged0f97f2016-06-09 19:17:15 +00002590 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
2591 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002592
Wei Dinged0f97f2016-06-09 19:17:15 +00002593 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002594
Wei Dinged0f97f2016-06-09 19:17:15 +00002595 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
2596 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
2597 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002598
Wei Dinged0f97f2016-06-09 19:17:15 +00002599 SDValue Scale = NumeratorScaled.getValue(1);
2600 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00002601
Wei Dinged0f97f2016-06-09 19:17:15 +00002602 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002603}
2604
2605SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00002606 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00002607 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00002608
2609 SDLoc SL(Op);
2610 SDValue X = Op.getOperand(0);
2611 SDValue Y = Op.getOperand(1);
2612
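  // Same overall shape as the f32 path, but with two reciprocal refinement
  // steps; the extra code below works around the SI div_scale condition-output
  // bug when picking the operand for div_fmas.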
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002613 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00002614
2615 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
2616
2617 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
2618
2619 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
2620
2621 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
2622
2623 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
2624
2625 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
2626
2627 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
2628
2629 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
2630
2631 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
2632 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
2633
2634 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
2635 NegDivScale0, Mul, DivScale1);
2636
2637 SDValue Scale;
2638
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002639 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00002640 // Workaround a hardware bug on SI where the condition output from div_scale
2641 // is not usable.
2642
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002643 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00002644
2645    // Figure out which scale to use for div_fmas.
2646 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2647 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
2648 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
2649 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
2650
2651 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
2652 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
2653
2654 SDValue Scale0Hi
2655 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
2656 SDValue Scale1Hi
2657 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
2658
2659 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
2660 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
2661 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
2662 } else {
2663 Scale = DivScale1.getValue(1);
2664 }
2665
2666 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
2667 Fma4, Fma3, Mul, Scale);
2668
2669 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00002670}
2671
2672SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
2673 EVT VT = Op.getValueType();
2674
2675 if (VT == MVT::f32)
2676 return LowerFDIV32(Op, DAG);
2677
2678 if (VT == MVT::f64)
2679 return LowerFDIV64(Op, DAG);
2680
2681 llvm_unreachable("Unexpected type for fdiv");
2682}
2683
Tom Stellard81d871d2013-11-13 23:36:50 +00002684SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2685 SDLoc DL(Op);
2686 StoreSDNode *Store = cast<StoreSDNode>(Op);
2687 EVT VT = Store->getMemoryVT();
2688
Matt Arsenault95245662016-02-11 05:32:46 +00002689 if (VT == MVT::i1) {
2690 return DAG.getTruncStore(Store->getChain(), DL,
2691 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
2692 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00002693 }
2694
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002695 assert(VT.isVector() &&
2696 Store->getValue().getValueType().getScalarType() == MVT::i32);
2697
2698 unsigned AS = Store->getAddressSpace();
2699 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2700 AS, Store->getAlignment())) {
2701 return expandUnalignedStore(Store, DAG);
2702 }
Tom Stellard81d871d2013-11-13 23:36:50 +00002703
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002704 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002705 switch (AS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002706 case AMDGPUAS::GLOBAL_ADDRESS:
2707 case AMDGPUAS::FLAT_ADDRESS:
2708 if (NumElements > 4)
2709 return SplitVectorStore(Op, DAG);
2710 return SDValue();
2711 case AMDGPUAS::PRIVATE_ADDRESS: {
2712 switch (Subtarget->getMaxPrivateElementSize()) {
2713 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00002714 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002715 case 8:
2716 if (NumElements > 2)
2717 return SplitVectorStore(Op, DAG);
2718 return SDValue();
2719 case 16:
2720 if (NumElements > 4)
2721 return SplitVectorStore(Op, DAG);
2722 return SDValue();
2723 default:
2724 llvm_unreachable("unsupported private_element_size");
2725 }
2726 }
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002727 case AMDGPUAS::LOCAL_ADDRESS: {
2728 if (NumElements > 2)
2729 return SplitVectorStore(Op, DAG);
2730
2731 if (NumElements == 2)
2732 return Op;
2733
Matt Arsenault95245662016-02-11 05:32:46 +00002734    // If properly aligned, we might be able to use ds_write_b64 if we split.
2735 return SplitVectorStore(Op, DAG);
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00002736 }
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00002737 default:
2738 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00002739 }
Tom Stellard81d871d2013-11-13 23:36:50 +00002740}
2741
Matt Arsenaultad14ce82014-07-19 18:44:39 +00002742SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002743 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00002744 EVT VT = Op.getValueType();
2745 SDValue Arg = Op.getOperand(0);
Sanjay Patela2607012015-09-16 16:31:21 +00002746 // TODO: Should this propagate fast-math-flags?
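  // The hardware sin/cos expect their input as a fraction of a full turn
  // (1.0 == 2*pi radians), so the argument is scaled by 0.5/PI and reduced
  // with FRACT first.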
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002747 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
2748 DAG.getNode(ISD::FMUL, DL, VT, Arg,
2749 DAG.getConstantFP(0.5/M_PI, DL,
2750 VT)));
Matt Arsenaultad14ce82014-07-19 18:44:39 +00002751
2752 switch (Op.getOpcode()) {
2753 case ISD::FCOS:
2754 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
2755 case ISD::FSIN:
2756 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
2757 default:
2758 llvm_unreachable("Wrong trig opcode");
2759 }
2760}
2761
Tom Stellard354a43c2016-04-01 18:27:37 +00002762SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2763 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2764 assert(AtomicNode->isCompareAndSwap());
2765 unsigned AS = AtomicNode->getAddressSpace();
2766
2767 // No custom lowering required for local address space
2768 if (!isFlatGlobalAddrSpace(AS))
2769 return Op;
2770
2771  // Non-local address spaces require custom lowering for atomic compare and
2772  // swap; the cmp and swap values are packed into a v2i32 (or a v2i64 for
      // the _X2 variants).
2773 SDLoc DL(Op);
2774 SDValue ChainIn = Op.getOperand(0);
2775 SDValue Addr = Op.getOperand(1);
2776 SDValue Old = Op.getOperand(2);
2777 SDValue New = Op.getOperand(3);
2778 EVT VT = Op.getValueType();
2779 MVT SimpleVT = VT.getSimpleVT();
2780 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2781
Ahmed Bougacha128f8732016-04-26 21:15:30 +00002782 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00002783 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00002784
2785 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
2786 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00002787}
2788
Tom Stellard75aadc22012-12-11 21:25:42 +00002789//===----------------------------------------------------------------------===//
2790// Custom DAG optimizations
2791//===----------------------------------------------------------------------===//
2792
Matt Arsenault364a6742014-06-11 17:50:44 +00002793SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00002794 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00002795 EVT VT = N->getValueType(0);
2796 EVT ScalarVT = VT.getScalarType();
2797 if (ScalarVT != MVT::f32)
2798 return SDValue();
2799
2800 SelectionDAG &DAG = DCI.DAG;
2801 SDLoc DL(N);
2802
2803 SDValue Src = N->getOperand(0);
2804 EVT SrcVT = Src.getValueType();
2805
2806 // TODO: We could try to match extracting the higher bytes, which would be
2807 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
2808 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
2809 // about in practice.
2810 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
2811 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
2812 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
2813 DCI.AddToWorklist(Cvt.getNode());
2814 return Cvt;
2815 }
2816 }
2817
Matt Arsenault364a6742014-06-11 17:50:44 +00002818 return SDValue();
2819}
2820
Eric Christopher6c5b5112015-03-11 18:43:21 +00002821/// \brief Return true if the given offset Size in bytes can be folded into
2822/// the immediate offsets of a memory instruction for the given address space.
2823static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002824 const SISubtarget &STI) {
Eric Christopher6c5b5112015-03-11 18:43:21 +00002825 switch (AS) {
2826 case AMDGPUAS::GLOBAL_ADDRESS: {
2827    // MUBUF instructions have a 12-bit offset in bytes.
2828 return isUInt<12>(OffsetSize);
2829 }
2830 case AMDGPUAS::CONSTANT_ADDRESS: {
2831 // SMRD instructions have an 8-bit offset in dwords on SI and
2832 // a 20-bit offset in bytes on VI.
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002833 if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
Eric Christopher6c5b5112015-03-11 18:43:21 +00002834 return isUInt<20>(OffsetSize);
2835 else
2836 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
2837 }
2838 case AMDGPUAS::LOCAL_ADDRESS:
2839 case AMDGPUAS::REGION_ADDRESS: {
2840 // The single offset versions have a 16-bit offset in bytes.
2841 return isUInt<16>(OffsetSize);
2842 }
2843 case AMDGPUAS::PRIVATE_ADDRESS:
2844 // Indirect register addressing does not use any offsets.
2845 default:
2846    return false;
2847 }
2848}
2849
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00002850// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
2851
2852// This is a variant of
2853// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
2854//
2855// The normal DAG combiner will do this, but only if the add has one use since
2856// that would increase the number of instructions.
2857//
2858// This prevents us from seeing a constant offset that can be folded into a
2859// memory instruction's addressing mode. If we know the resulting add offset of
2860// a pointer can be folded into an addressing offset, we can replace the pointer
2861// operand with the add of the new constant offset. This eliminates one of the uses,
2862// and may allow the remaining use to also be simplified.
2863//
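// For example, with a local pointer p and (p + 4) << 2, the combine produces
// (p << 2) + 16, and the +16 can usually be folded straight into the DS
// instruction's 16-bit offset field.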
2864SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
2865 unsigned AddrSpace,
2866 DAGCombinerInfo &DCI) const {
2867 SDValue N0 = N->getOperand(0);
2868 SDValue N1 = N->getOperand(1);
2869
2870 if (N0.getOpcode() != ISD::ADD)
2871 return SDValue();
2872
2873 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
2874 if (!CN1)
2875 return SDValue();
2876
2877 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
2878 if (!CAdd)
2879 return SDValue();
2880
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00002881 // If the resulting offset is too large, we can't fold it into the addressing
2882 // mode offset.
2883 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenault43e92fe2016-06-24 06:30:11 +00002884 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00002885 return SDValue();
2886
2887 SelectionDAG &DAG = DCI.DAG;
2888 SDLoc SL(N);
2889 EVT VT = N->getValueType(0);
2890
2891 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002892 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00002893
2894 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
2895}
2896
Matt Arsenaultd0101a22015-01-06 23:00:46 +00002897SDValue SITargetLowering::performAndCombine(SDNode *N,
2898 DAGCombinerInfo &DCI) const {
2899 if (DCI.isBeforeLegalize())
2900 return SDValue();
2901
Matt Arsenault6e3a4512016-01-18 22:01:13 +00002902 if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
2903 return Base;
2904
Matt Arsenaultd0101a22015-01-06 23:00:46 +00002905 SelectionDAG &DAG = DCI.DAG;
2906
2907 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
2908 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
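  // The combined condition "x is ordered with itself and |x| != +inf" is just
  // "x is finite", which the single fp_class node below tests with a mask of
  // all the finite classes.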
2909 SDValue LHS = N->getOperand(0);
2910 SDValue RHS = N->getOperand(1);
2911
2912 if (LHS.getOpcode() == ISD::SETCC &&
2913 RHS.getOpcode() == ISD::SETCC) {
2914 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
2915 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
2916
2917 SDValue X = LHS.getOperand(0);
2918 SDValue Y = RHS.getOperand(0);
2919 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
2920 return SDValue();
2921
2922 if (LCC == ISD::SETO) {
2923 if (X != LHS.getOperand(1))
2924 return SDValue();
2925
2926 if (RCC == ISD::SETUNE) {
2927 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
2928 if (!C1 || !C1->isInfinity() || C1->isNegative())
2929 return SDValue();
2930
2931 const uint32_t Mask = SIInstrFlags::N_NORMAL |
2932 SIInstrFlags::N_SUBNORMAL |
2933 SIInstrFlags::N_ZERO |
2934 SIInstrFlags::P_ZERO |
2935 SIInstrFlags::P_SUBNORMAL |
2936 SIInstrFlags::P_NORMAL;
2937
2938 static_assert(((~(SIInstrFlags::S_NAN |
2939 SIInstrFlags::Q_NAN |
2940 SIInstrFlags::N_INFINITY |
2941 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
2942 "mask not equal");
2943
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00002944 SDLoc DL(N);
2945 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
2946 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00002947 }
2948 }
2949 }
2950
2951 return SDValue();
2952}
2953
Matt Arsenaultf2290332015-01-06 23:00:39 +00002954SDValue SITargetLowering::performOrCombine(SDNode *N,
2955 DAGCombinerInfo &DCI) const {
2956 SelectionDAG &DAG = DCI.DAG;
2957 SDValue LHS = N->getOperand(0);
2958 SDValue RHS = N->getOperand(1);
2959
Matt Arsenault3b082382016-04-12 18:24:38 +00002960 EVT VT = N->getValueType(0);
2961 if (VT == MVT::i64) {
2962 // TODO: This could be a generic combine with a predicate for extracting the
2963 // high half of an integer being free.
2964
2965 // (or i64:x, (zero_extend i32:y)) ->
2966 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
2967 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
2968 RHS.getOpcode() != ISD::ZERO_EXTEND)
2969 std::swap(LHS, RHS);
2970
2971 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
2972 SDValue ExtSrc = RHS.getOperand(0);
2973 EVT SrcVT = ExtSrc.getValueType();
2974 if (SrcVT == MVT::i32) {
2975 SDLoc SL(N);
2976 SDValue LowLHS, HiBits;
2977 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
2978 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
2979
2980 DCI.AddToWorklist(LowOr.getNode());
2981 DCI.AddToWorklist(HiBits.getNode());
2982
2983 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
2984 LowOr, HiBits);
2985 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2986 }
2987 }
2988 }
2989
Matt Arsenaultf2290332015-01-06 23:00:39 +00002990 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
2991 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
2992 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
2993 SDValue Src = LHS.getOperand(0);
2994 if (Src != RHS.getOperand(0))
2995 return SDValue();
2996
2997 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
2998 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
2999 if (!CLHS || !CRHS)
3000 return SDValue();
3001
3002 // Only 10 bits are used.
3003 static const uint32_t MaxMask = 0x3ff;
3004
3005 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003006 SDLoc DL(N);
3007 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
3008 Src, DAG.getConstant(NewMask, DL, MVT::i32));
Matt Arsenaultf2290332015-01-06 23:00:39 +00003009 }
3010
3011 return SDValue();
3012}
3013
3014SDValue SITargetLowering::performClassCombine(SDNode *N,
3015 DAGCombinerInfo &DCI) const {
3016 SelectionDAG &DAG = DCI.DAG;
3017 SDValue Mask = N->getOperand(1);
3018
3019 // fp_class x, 0 -> false
3020 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
3021 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003022 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00003023 }
3024
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00003025 if (N->getOperand(0).isUndef())
3026 return DAG.getUNDEF(MVT::i1);
3027
Matt Arsenaultf2290332015-01-06 23:00:39 +00003028 return SDValue();
3029}
3030
Matt Arsenault9cd90712016-04-14 01:42:16 +00003031// Constant fold canonicalize.
3032SDValue SITargetLowering::performFCanonicalizeCombine(
3033 SDNode *N,
3034 DAGCombinerInfo &DCI) const {
3035 ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3036 if (!CFP)
3037 return SDValue();
3038
3039 SelectionDAG &DAG = DCI.DAG;
3040 const APFloat &C = CFP->getValueAPF();
3041
3042 // Flush denormals to 0 if not enabled.
3043 if (C.isDenormal()) {
3044 EVT VT = N->getValueType(0);
3045 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
3046 return DAG.getConstantFP(0.0, SDLoc(N), VT);
3047
3048 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
3049 return DAG.getConstantFP(0.0, SDLoc(N), VT);
3050 }
3051
3052 if (C.isNaN()) {
3053 EVT VT = N->getValueType(0);
3054 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
3055 if (C.isSignaling()) {
3056 // Quiet a signaling NaN.
3057 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
3058 }
3059
3060 // Make sure it is the canonical NaN bitpattern.
3061 //
3062 // TODO: Can we use -1 as the canonical NaN value since it's an inline
3063 // immediate?
3064 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
3065 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
3066 }
3067
3068 return SDValue(CFP, 0);
3069}
3070
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003071static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
3072 switch (Opc) {
3073 case ISD::FMAXNUM:
3074 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00003075 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003076 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00003077 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003078 return AMDGPUISD::UMAX3;
3079 case ISD::FMINNUM:
3080 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00003081 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003082 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00003083 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003084 return AMDGPUISD::UMIN3;
3085 default:
3086 llvm_unreachable("Not a min/max opcode");
3087 }
3088}
3089
Benjamin Kramerbdc49562016-06-12 15:39:02 +00003090static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
3091 SDValue Op0, SDValue Op1, bool Signed) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00003092 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
3093 if (!K1)
3094 return SDValue();
3095
3096 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
3097 if (!K0)
3098 return SDValue();
3099
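  // min(max(x, K0), K1) clamps x to [K0, K1]; that only coincides with
  // med3(x, K0, K1) when K0 does not exceed K1, so give up otherwise.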
Matt Arsenaultf639c322016-01-28 20:53:42 +00003100 if (Signed) {
3101 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
3102 return SDValue();
3103 } else {
3104 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
3105 return SDValue();
3106 }
3107
3108 EVT VT = K0->getValueType(0);
3109 return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
3110 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
3111}
3112
3113static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
3114 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
3115 return true;
3116
3117 return DAG.isKnownNeverNaN(Op);
3118}
3119
Benjamin Kramerbdc49562016-06-12 15:39:02 +00003120static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
3121 SDValue Op0, SDValue Op1) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00003122 ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
3123 if (!K1)
3124 return SDValue();
3125
3126 ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
3127 if (!K0)
3128 return SDValue();
3129
3130 // Ordered >= (although NaN inputs should have folded away by now).
3131 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
3132 if (Cmp == APFloat::cmpGreaterThan)
3133 return SDValue();
3134
3135 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
3136 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
3137 // give the other result, which is different from med3 with a NaN input.
3138 SDValue Var = Op0.getOperand(0);
3139 if (!isKnownNeverSNan(DAG, Var))
3140 return SDValue();
3141
3142 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
3143 Var, SDValue(K0, 0), SDValue(K1, 0));
3144}
3145
3146SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
3147 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003148 SelectionDAG &DAG = DCI.DAG;
3149
3150 unsigned Opc = N->getOpcode();
3151 SDValue Op0 = N->getOperand(0);
3152 SDValue Op1 = N->getOperand(1);
3153
3154  // Only do this if the inner op has one use since this will just increase
3155 // register pressure for no benefit.
3156
Matt Arsenault5b39b342016-01-28 20:53:48 +00003157 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
3158 // max(max(a, b), c) -> max3(a, b, c)
3159 // min(min(a, b), c) -> min3(a, b, c)
3160 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
3161 SDLoc DL(N);
3162 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
3163 DL,
3164 N->getValueType(0),
3165 Op0.getOperand(0),
3166 Op0.getOperand(1),
3167 Op1);
3168 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003169
Matt Arsenault5b39b342016-01-28 20:53:48 +00003170 // Try commuted.
3171 // max(a, max(b, c)) -> max3(a, b, c)
3172 // min(a, min(b, c)) -> min3(a, b, c)
3173 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
3174 SDLoc DL(N);
3175 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
3176 DL,
3177 N->getValueType(0),
3178 Op0,
3179 Op1.getOperand(0),
3180 Op1.getOperand(1));
3181 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003182 }
3183
Matt Arsenaultf639c322016-01-28 20:53:42 +00003184 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
3185 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
3186 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
3187 return Med3;
3188 }
3189
3190 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
3191 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
3192 return Med3;
3193 }
3194
3195 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
Matt Arsenault5b39b342016-01-28 20:53:48 +00003196 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
3197 (Opc == AMDGPUISD::FMIN_LEGACY &&
3198 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenaultf639c322016-01-28 20:53:42 +00003199 N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
3200 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
3201 return Res;
3202 }
3203
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003204 return SDValue();
3205}
3206
Matt Arsenault6f6233d2015-01-06 23:00:41 +00003207SDValue SITargetLowering::performSetCCCombine(SDNode *N,
3208 DAGCombinerInfo &DCI) const {
3209 SelectionDAG &DAG = DCI.DAG;
3210 SDLoc SL(N);
3211
3212 SDValue LHS = N->getOperand(0);
3213 SDValue RHS = N->getOperand(1);
3214 EVT VT = LHS.getValueType();
3215
3216 if (VT != MVT::f32 && VT != MVT::f64)
3217 return SDValue();
3218
3219 // Match isinf pattern
3220 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
3221 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
3222 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
3223 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3224 if (!CRHS)
3225 return SDValue();
3226
3227 const APFloat &APF = CRHS->getValueAPF();
3228 if (APF.isInfinity() && !APF.isNegative()) {
3229 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003230 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
3231 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00003232 }
3233 }
3234
3235 return SDValue();
3236}
3237
Tom Stellard75aadc22012-12-11 21:25:42 +00003238SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
3239 DAGCombinerInfo &DCI) const {
3240 SelectionDAG &DAG = DCI.DAG;
Andrew Trickef9de2a2013-05-25 02:42:55 +00003241 SDLoc DL(N);
Tom Stellard75aadc22012-12-11 21:25:42 +00003242
3243 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00003244 default:
3245 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00003246 case ISD::SETCC:
3247 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00003248 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003249 case ISD::FMINNUM:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00003250 case ISD::SMAX:
3251 case ISD::SMIN:
3252 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00003253 case ISD::UMIN:
3254 case AMDGPUISD::FMIN_LEGACY:
3255 case AMDGPUISD::FMAX_LEGACY: {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003256 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
Tom Stellard7c840bc2015-03-16 15:53:55 +00003257 N->getValueType(0) != MVT::f64 &&
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003258 getTargetMachine().getOptLevel() > CodeGenOpt::None)
Matt Arsenaultf639c322016-01-28 20:53:42 +00003259 return performMinMaxCombine(N, DCI);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00003260 break;
3261 }
Matt Arsenault364a6742014-06-11 17:50:44 +00003262
3263 case AMDGPUISD::CVT_F32_UBYTE0:
3264 case AMDGPUISD::CVT_F32_UBYTE1:
3265 case AMDGPUISD::CVT_F32_UBYTE2:
3266 case AMDGPUISD::CVT_F32_UBYTE3: {
3267 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
Matt Arsenault364a6742014-06-11 17:50:44 +00003268 SDValue Src = N->getOperand(0);
Matt Arsenaulta949dc62016-05-09 16:29:50 +00003269
Matt Arsenault327bb5a2016-07-01 22:47:50 +00003270 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
Matt Arsenaulta949dc62016-05-09 16:29:50 +00003271 if (Src.getOpcode() == ISD::SRL) {
3272 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
3273 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
3274 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
3275
3276 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
3277 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
3278 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
3279 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
3280 MVT::f32, Src.getOperand(0));
3281 }
3282 }
3283 }
3284
Matt Arsenault364a6742014-06-11 17:50:44 +00003285 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
3286
3287 APInt KnownZero, KnownOne;
3288 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3289 !DCI.isBeforeLegalizeOps());
3290 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3291 if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
3292 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
3293 DCI.CommitTargetLoweringOpt(TLO);
3294 }
3295
3296 break;
3297 }
3298
3299 case ISD::UINT_TO_FP: {
3300 return performUCharToFloatCombine(N, DCI);
Matt Arsenaultde5fbe92016-01-11 17:02:00 +00003301 }
Matt Arsenault02cb0ff2014-09-29 14:59:34 +00003302 case ISD::FADD: {
3303 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3304 break;
3305
3306 EVT VT = N->getValueType(0);
3307 if (VT != MVT::f32)
3308 break;
3309
Matt Arsenault8d630032015-02-20 22:10:41 +00003310 // Only do this if we are not trying to support denormals. v_mad_f32 does
3311 // not support denormals ever.
3312 if (Subtarget->hasFP32Denormals())
3313 break;
3314
Matt Arsenault02cb0ff2014-09-29 14:59:34 +00003315 SDValue LHS = N->getOperand(0);
3316 SDValue RHS = N->getOperand(1);
3317
3318 // These should really be instruction patterns, but writing patterns with
3319    // source modifiers is a pain.
3320
3321 // fadd (fadd (a, a), b) -> mad 2.0, a, b
3322 if (LHS.getOpcode() == ISD::FADD) {
3323 SDValue A = LHS.getOperand(0);
3324 if (A == LHS.getOperand(1)) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003325 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
Matt Arsenault8d630032015-02-20 22:10:41 +00003326 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
Matt Arsenault02cb0ff2014-09-29 14:59:34 +00003327 }
3328 }
3329
3330 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
3331 if (RHS.getOpcode() == ISD::FADD) {
3332 SDValue A = RHS.getOperand(0);
3333 if (A == RHS.getOperand(1)) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003334 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
Matt Arsenault8d630032015-02-20 22:10:41 +00003335 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
Matt Arsenault02cb0ff2014-09-29 14:59:34 +00003336 }
3337 }
3338
Matt Arsenault8d630032015-02-20 22:10:41 +00003339 return SDValue();
Matt Arsenault02cb0ff2014-09-29 14:59:34 +00003340 }
Matt Arsenault8675db12014-08-29 16:01:14 +00003341 case ISD::FSUB: {
3342 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3343 break;
3344
3345 EVT VT = N->getValueType(0);
3346
3347 // Try to get the fneg to fold into the source modifier. This undoes generic
3348 // DAG combines and folds them into the mad.
Matt Arsenault8d630032015-02-20 22:10:41 +00003349 //
3350 // Only do this if we are not trying to support denormals. v_mad_f32 does
3351 // not support denormals ever.
3352 if (VT == MVT::f32 &&
3353 !Subtarget->hasFP32Denormals()) {
Matt Arsenault8675db12014-08-29 16:01:14 +00003354 SDValue LHS = N->getOperand(0);
3355 SDValue RHS = N->getOperand(1);
Matt Arsenault3d4233f2014-09-29 14:59:38 +00003356 if (LHS.getOpcode() == ISD::FADD) {
3357 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
3358
3359 SDValue A = LHS.getOperand(0);
3360 if (A == LHS.getOperand(1)) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003361 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
Matt Arsenault3d4233f2014-09-29 14:59:38 +00003362 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
3363
Matt Arsenault8d630032015-02-20 22:10:41 +00003364 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
Matt Arsenault3d4233f2014-09-29 14:59:38 +00003365 }
3366 }
3367
3368 if (RHS.getOpcode() == ISD::FADD) {
3369 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
3370
3371 SDValue A = RHS.getOperand(0);
3372 if (A == RHS.getOperand(1)) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003373 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
Matt Arsenault8d630032015-02-20 22:10:41 +00003374 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
Matt Arsenault3d4233f2014-09-29 14:59:38 +00003375 }
3376 }
Matt Arsenault8d630032015-02-20 22:10:41 +00003377
3378 return SDValue();
Matt Arsenault8675db12014-08-29 16:01:14 +00003379 }
3380
3381 break;
3382 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00003383 case ISD::LOAD:
3384 case ISD::STORE:
3385 case ISD::ATOMIC_LOAD:
3386 case ISD::ATOMIC_STORE:
3387 case ISD::ATOMIC_CMP_SWAP:
3388 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
3389 case ISD::ATOMIC_SWAP:
3390 case ISD::ATOMIC_LOAD_ADD:
3391 case ISD::ATOMIC_LOAD_SUB:
3392 case ISD::ATOMIC_LOAD_AND:
3393 case ISD::ATOMIC_LOAD_OR:
3394 case ISD::ATOMIC_LOAD_XOR:
3395 case ISD::ATOMIC_LOAD_NAND:
3396 case ISD::ATOMIC_LOAD_MIN:
3397 case ISD::ATOMIC_LOAD_MAX:
3398 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003399 case ISD::ATOMIC_LOAD_UMAX:
3400 case AMDGPUISD::ATOMIC_INC:
3401 case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00003402 if (DCI.isBeforeLegalize())
3403 break;
Matt Arsenault5565f65e2014-05-22 18:09:07 +00003404
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00003405 MemSDNode *MemNode = cast<MemSDNode>(N);
3406 SDValue Ptr = MemNode->getBasePtr();
3407
3408 // TODO: We could also do this for multiplies.
3409 unsigned AS = MemNode->getAddressSpace();
3410 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
3411 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
3412 if (NewPtr) {
Benjamin Kramer6cd780f2015-02-17 15:29:18 +00003413 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00003414
3415 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
3416 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
3417 }
3418 }
3419 break;
3420 }
Matt Arsenaultd0101a22015-01-06 23:00:46 +00003421 case ISD::AND:
3422 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00003423 case ISD::OR:
3424 return performOrCombine(N, DCI);
3425 case AMDGPUISD::FP_CLASS:
3426 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00003427 case ISD::FCANONICALIZE:
3428 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00003429 case AMDGPUISD::FRACT:
3430 case AMDGPUISD::RCP:
3431 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00003432 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00003433 case AMDGPUISD::RSQ_LEGACY:
3434 case AMDGPUISD::RSQ_CLAMP:
3435 case AMDGPUISD::LDEXP: {
3436 SDValue Src = N->getOperand(0);
3437 if (Src.isUndef())
3438 return Src;
3439 break;
3440 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00003441 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00003442 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00003443}
Christian Konigd910b7d2013-02-26 17:52:16 +00003444
Christian Konig8e06e2a2013-04-10 08:39:08 +00003445/// \brief Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00003446static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00003447 switch (Idx) {
3448 default: return 0;
3449 case AMDGPU::sub0: return 0;
3450 case AMDGPU::sub1: return 1;
3451 case AMDGPU::sub2: return 2;
3452 case AMDGPU::sub3: return 3;
3453 }
3454}
3455
3456/// \brief Adjust the writemask of MIMG instructions
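/// For example, if only the extracted sub0 and sub2 lanes of a four-component
/// image load are used, the dmask shrinks from 0xf to 0x5 and the surviving
/// users are renumbered onto consecutive subregisters.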
3457void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
3458 SelectionDAG &DAG) const {
3459 SDNode *Users[4] = { };
Tom Stellard54774e52013-10-23 02:53:47 +00003460 unsigned Lane = 0;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00003461 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
3462 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00003463 unsigned NewDmask = 0;
Christian Konig8e06e2a2013-04-10 08:39:08 +00003464
3465 // Try to figure out the used register components
3466 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
3467 I != E; ++I) {
3468
3469 // Abort if we can't understand the usage
3470 if (!I->isMachineOpcode() ||
3471 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
3472 return;
3473
Tom Stellard54774e52013-10-23 02:53:47 +00003474 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
3475 // Note that subregs are packed, i.e. Lane==0 is the first bit set
3476 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
3477 // set, etc.
Christian Konig8b1ed282013-04-10 08:39:16 +00003478 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00003479
Tom Stellard54774e52013-10-23 02:53:47 +00003480 // Set which texture component corresponds to the lane.
3481 unsigned Comp;
3482 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
3483 assert(Dmask);
Tom Stellard03a5c082013-10-23 03:50:25 +00003484 Comp = countTrailingZeros(Dmask);
Tom Stellard54774e52013-10-23 02:53:47 +00003485 Dmask &= ~(1 << Comp);
3486 }
3487
Christian Konig8e06e2a2013-04-10 08:39:08 +00003488 // Abort if we have more than one user per component
3489 if (Users[Lane])
3490 return;
3491
3492 Users[Lane] = *I;
Tom Stellard54774e52013-10-23 02:53:47 +00003493 NewDmask |= 1 << Comp;
Christian Konig8e06e2a2013-04-10 08:39:08 +00003494 }
3495
Tom Stellard54774e52013-10-23 02:53:47 +00003496 // Abort if there's no change
3497 if (NewDmask == OldDmask)
Christian Konig8e06e2a2013-04-10 08:39:08 +00003498 return;
3499
3500 // Adjust the writemask in the node
3501 std::vector<SDValue> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00003502 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003503 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00003504 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Craig Topper8c0b4d02014-04-28 05:57:50 +00003505 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
Christian Konig8e06e2a2013-04-10 08:39:08 +00003506
Christian Konig8b1ed282013-04-10 08:39:16 +00003507 // If we only got one lane, replace it with a copy
Tom Stellard54774e52013-10-23 02:53:47 +00003508 // (if NewDmask has only one bit set...)
3509 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003510 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
3511 MVT::i32);
Christian Konig8b1ed282013-04-10 08:39:16 +00003512 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
Andrew Trickef9de2a2013-05-25 02:42:55 +00003513 SDLoc(), Users[Lane]->getValueType(0),
Christian Konig8b1ed282013-04-10 08:39:16 +00003514 SDValue(Node, 0), RC);
3515 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
3516 return;
3517 }
3518
Christian Konig8e06e2a2013-04-10 08:39:08 +00003519 // Update the users of the node with the new indices
3520 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
3521
3522 SDNode *User = Users[i];
3523 if (!User)
3524 continue;
3525
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003526 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
Christian Konig8e06e2a2013-04-10 08:39:08 +00003527 DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
3528
3529 switch (Idx) {
3530 default: break;
3531 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
3532 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
3533 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
3534 }
3535 }
3536}
3537
Tom Stellardc98ee202015-07-16 19:40:07 +00003538static bool isFrameIndexOp(SDValue Op) {
3539 if (Op.getOpcode() == ISD::AssertZext)
3540 Op = Op.getOperand(0);
3541
3542 return isa<FrameIndexSDNode>(Op);
3543}
3544
Tom Stellard3457a842014-10-09 19:06:00 +00003545/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
3546/// with frame index operands.
3547/// LLVM assumes that inputs to these instructions are registers.
3548void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
3549 SelectionDAG &DAG) const {
Tom Stellard8dd392e2014-10-09 18:09:15 +00003550
3551 SmallVector<SDValue, 8> Ops;
Tom Stellard3457a842014-10-09 19:06:00 +00003552 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00003553 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00003554 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00003555 continue;
3556 }
3557
Tom Stellard3457a842014-10-09 19:06:00 +00003558 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00003559 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00003560 Node->getOperand(i).getValueType(),
3561 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00003562 }
3563
Tom Stellard3457a842014-10-09 19:06:00 +00003564 DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00003565}
3566
Matt Arsenault08d84942014-06-03 23:06:13 +00003567/// \brief Fold the instructions after selecting them.
Christian Konig8e06e2a2013-04-10 08:39:08 +00003568SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
3569 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00003570 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00003571 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00003572
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00003573 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
3574 !TII->isGather4(Opcode))
Christian Konig8e06e2a2013-04-10 08:39:08 +00003575 adjustWritemask(Node, DAG);
3576
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00003577 if (Opcode == AMDGPU::INSERT_SUBREG ||
3578 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00003579 legalizeTargetIndependentNode(Node, DAG);
3580 return Node;
3581 }
Tom Stellard654d6692015-01-08 15:08:17 +00003582 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00003583}
Christian Konig8b1ed282013-04-10 08:39:16 +00003584
3585/// \brief Assign the register class depending on the number of bits set in
3586/// the writemask, legalize VOP3 operands, and mark unused atomics no-return.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003587void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00003588 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00003589 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003590
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003591 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00003592
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003593 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00003594 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003595 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00003596 return;
3597 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00003598
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003599 if (TII->isMIMG(MI)) {
3600 unsigned VReg = MI.getOperand(0).getReg();
3601 unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
3602 unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003603 unsigned BitsSet = 0;
3604 for (unsigned i = 0; i < 4; ++i)
3605 BitsSet += Writemask & (1 << i) ? 1 : 0;
3606
3607 const TargetRegisterClass *RC;
3608 switch (BitsSet) {
3609 default: return;
Tom Stellard45c0b3a2015-01-07 20:59:25 +00003610 case 1: RC = &AMDGPU::VGPR_32RegClass; break;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003611 case 2: RC = &AMDGPU::VReg_64RegClass; break;
3612 case 3: RC = &AMDGPU::VReg_96RegClass; break;
3613 }
3614
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003615 unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
3616 MI.setDesc(TII->get(NewOpcode));
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003617 MRI.setRegClass(VReg, RC);
Christian Konig8b1ed282013-04-10 08:39:16 +00003618 return;
Christian Konig8b1ed282013-04-10 08:39:16 +00003619 }
3620
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003621 // Replace unused atomics with the no return version.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003622 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003623 if (NoRetAtomicOp != -1) {
3624 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003625 MI.setDesc(TII->get(NoRetAtomicOp));
3626 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00003627 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003628 }
3629
Tom Stellard354a43c2016-04-01 18:27:37 +00003630 // For mubuf_atomic_cmpswap, tablegen selects an extra extract_subreg
3631 // instruction, because the return type of these instructions is a vec2 of
3632 // the memory type, which allows the result to be tied to the input operand.
3633 // As a result these instructions always have at least one use, so we need a
3634 // special case that checks whether the atomic's only use is an
3635 // extract_subreg that itself has no uses.
3636 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00003637 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00003638 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
3639 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003640 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00003641
3642 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003643 MI.setDesc(TII->get(NoRetAtomicOp));
3644 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00003645
3646 // If we only remove the def operand from the atomic instruction, the
3647 // extract_subreg will be left with a use of a vreg without a def.
3648 // So we need to insert an implicit_def to avoid machine verifier
3649 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003650 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00003651 TII->get(AMDGPU::IMPLICIT_DEF), Def);
3652 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00003653 return;
3654 }
Christian Konig8b1ed282013-04-10 08:39:16 +00003655}
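// A standalone sketch (plain C++, not an in-tree helper) of the writemask to
// result-width mapping used for MIMG above: each enabled dmask bit adds one
// 32-bit component, so 1-3 set bits select VGPR_32/VReg_64/VReg_96 and a full
// mask keeps the original 128-bit class.
static unsigned mimgResultWidthInBits(unsigned Writemask) {
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)   // count enabled channels in the dmask
    BitsSet += (Writemask >> i) & 1;
  return BitsSet * 32;               // one 32-bit register per channel
}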
Tom Stellard0518ff82013-06-03 17:39:58 +00003656
Benjamin Kramerbdc49562016-06-12 15:39:02 +00003657static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
3658 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003659 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00003660 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
3661}
3662
3663MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00003664 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00003665 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00003666 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00003667
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00003668 // Build the constant half of the descriptor as its own 2-component register
3669 // before building the full 128-bit register. If we are building multiple
3670 // resource descriptors, this allows the constant half to be CSE'd.
3671 const SDValue Ops0[] = {
3672 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
3673 buildSMovImm32(DAG, DL, 0),
3674 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
3675 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
3676 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
3677 };
Matt Arsenault485defe2014-11-05 19:01:17 +00003678
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00003679 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
3680 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +00003681
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00003682 // Combine the constants and the pointer.
3683 const SDValue Ops1[] = {
3684 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
3685 Ptr,
3686 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
3687 SubRegHi,
3688 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
3689 };
Matt Arsenault485defe2014-11-05 19:01:17 +00003690
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00003691 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +00003692}
3693
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003694/// \brief Return a resource descriptor with the 'Add TID' bit enabled.
Benjamin Kramerdf005cb2015-08-08 18:27:36 +00003695/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
3696/// of the resource descriptor) to create an offset, which is added to
3697/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +00003698MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
3699 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003700 uint64_t RsrcDword2And3) const {
3701 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
3702 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
3703 if (RsrcDword1) {
3704 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003705 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
3706 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003707 }
3708
3709 SDValue DataLo = buildSMovImm32(DAG, DL,
3710 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
3711 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
3712
3713 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003714 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003715 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003716 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003717 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003718 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003719 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003720 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003721 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00003722 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00003723 };
3724
3725 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
3726}
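// A plain-integer sketch of the descriptor layout assembled with machine
// nodes above; the helper below is illustrative and not code from this file,
// and the per-dword comments are assumptions about the buffer descriptor
// format. Dwords 0-1 hold the base pointer (with RsrcDword1, e.g. the stride
// field, OR'd into the high half) and RsrcDword2And3 supplies dwords 2-3.
static void buildRsrcWordsSketch(uint64_t Ptr, uint32_t RsrcDword1,
                                 uint64_t RsrcDword2And3, uint32_t Words[4]) {
  Words[0] = static_cast<uint32_t>(Ptr);                    // base pointer, low half
  Words[1] = static_cast<uint32_t>(Ptr >> 32) | RsrcDword1; // high half | stride bits
  Words[2] = static_cast<uint32_t>(RsrcDword2And3);         // low constant dword
  Words[3] = static_cast<uint32_t>(RsrcDword2And3 >> 32);   // high constant dword
}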
3727
Tom Stellard94593ee2013-06-03 17:40:18 +00003728SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3729 const TargetRegisterClass *RC,
3730 unsigned Reg, EVT VT) const {
3731 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
3732
3733 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
3734 cast<RegisterSDNode>(VReg)->getReg(), VT);
3735}
Tom Stellardd7e6f132015-04-08 01:09:26 +00003736
3737//===----------------------------------------------------------------------===//
3738// SI Inline Assembly Support
3739//===----------------------------------------------------------------------===//
3740
3741std::pair<unsigned, const TargetRegisterClass *>
3742SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Benjamin Kramer9bfb6272015-07-05 19:29:18 +00003743 StringRef Constraint,
Tom Stellardd7e6f132015-04-08 01:09:26 +00003744 MVT VT) const {
Tom Stellardb3c3bda2015-12-10 02:12:53 +00003745
3746 if (Constraint.size() == 1) {
3747 switch (Constraint[0]) {
3748 case 's':
3749 case 'r':
3750 switch (VT.getSizeInBits()) {
3751 default:
3752 return std::make_pair(0U, nullptr);
3753 case 32:
Matt Arsenaulta609e2d2016-08-30 20:50:08 +00003754 return std::make_pair(0U, &AMDGPU::SReg_32RegClass);
Tom Stellardb3c3bda2015-12-10 02:12:53 +00003755 case 64:
3756 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
3757 case 128:
3758 return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
3759 case 256:
3760 return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
3761 }
3762
3763 case 'v':
3764 switch (VT.getSizeInBits()) {
3765 default:
3766 return std::make_pair(0U, nullptr);
3767 case 32:
3768 return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
3769 case 64:
3770 return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
3771 case 96:
3772 return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
3773 case 128:
3774 return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
3775 case 256:
3776 return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
3777 case 512:
3778 return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
3779 }
Tom Stellardd7e6f132015-04-08 01:09:26 +00003780 }
3781 }
3782
3783 if (Constraint.size() > 1) {
3784 const TargetRegisterClass *RC = nullptr;
3785 if (Constraint[1] == 'v') {
3786 RC = &AMDGPU::VGPR_32RegClass;
3787 } else if (Constraint[1] == 's') {
3788 RC = &AMDGPU::SGPR_32RegClass;
3789 }
3790
3791 if (RC) {
Matt Arsenault0b554ed2015-06-23 02:05:55 +00003792 uint32_t Idx;
3793 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
3794 if (!Failed && Idx < RC->getNumRegs())
Tom Stellardd7e6f132015-04-08 01:09:26 +00003795 return std::make_pair(RC->getRegister(Idx), RC);
3796 }
3797 }
3798 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3799}
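// A usage sketch of the constraints handled above, shown as device-side
// inline assembly a frontend might emit (illustrative only, not a test from
// this repository): "v" requests a VGPR for the result and "s" an SGPR for
// the uniform input, which is the register-class lookup implemented above.
//
//   int Result;
//   __asm__ volatile("v_mov_b32 %0, %1" : "=v"(Result) : "s"(UniformInput));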
Tom Stellardb3c3bda2015-12-10 02:12:53 +00003800
3801SITargetLowering::ConstraintType
3802SITargetLowering::getConstraintType(StringRef Constraint) const {
3803 if (Constraint.size() == 1) {
3804 switch (Constraint[0]) {
3805 default: break;
3806 case 's':
3807 case 'v':
3808 return C_RegisterClass;
3809 }
3810 }
3811 return TargetLowering::getConstraintType(Constraint);
3812}