//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
  "amdgpu-frame-index-zero-bits",
  cl::desc("High bits of frame index assumed to be zero"),
  cl::init(5),
  cl::ReallyHidden);

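// Returns the first SGPR in SGPR0..SGPR(N-1) that the calling-convention state
// has not yet allocated, and fails with llvm_unreachable if every SGPR is
// already taken.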
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
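  // Associate each legal value type with the register class used to hold it.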
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, no operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

#if 0
  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime; on VI it is s_memrealtime.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

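  // Extra legalization rules for subtargets that have 16-bit instructions.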
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

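  // DAG node kinds this target wants a chance to combine.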
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3)
  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this is still OK to use when denormals
// are enabled, but we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  // TODO: Consider splitting all arguments into 32-bit pieces.
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size == 64)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

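// Vector arguments for non-kernel calling conventions are split into 32-bit
// pieces: e.g. a v3f32 argument uses 3 registers, a v2i64 argument uses 4, and
// a v4f16 argument uses 2 v2f16 registers when 16-bit instructions are
// available.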
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size == 64)
      return 2 * NumElts;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (VT.getVectorNumElements() + 1) / 2;
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size == 64) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = 2 * NumElts;
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
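  // For example, a {<4 x float>, i32} return type gives NumElts = 4,
  // AdditionalElts = 1 and Pow2Elts = 8, so the reported memVT is v8f32.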

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}

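// Describe the memory behavior of a target intrinsic so the DAG builder can
// attach the right MachineMemOperand to the resulting node.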
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
      Info.align = 0;
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

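// Without flat instruction offsets (pre-GFX9), only a plain register address
// is legal for FLAT; with them, a 12-bit unsigned immediate offset can be
// folded in.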
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.

  // Just r + i
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffers < 4GB.
993 // It may be possible to support > 4GB buffers with MUBUF instructions,
994 // by setting the stride value in the resource descriptor which would
995 // increase the size limit to (stride * 4GB). However, this is risky,
996 // because it has never been validated.
997 return isLegalFlatAddressingMode(AM);
998 }
999
1000 return isLegalMUBUFAddressingMode(AM);
1001}
1002
Matt Arsenault711b3902015-08-07 20:18:34 +00001003bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1004 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1005 // additionally can do r + r + i with addr64. 32-bit has more addressing
1006 // mode options. Depending on the resource constant, it can also do
1007 // (i64 r0) + (i32 r1) * (i14 i).
1008 //
1009 // Private arrays end up using a scratch buffer most of the time, so also
1010 // assume those use MUBUF instructions. Scratch loads / stores are currently
1011 // implemented as mubuf instructions with offen bit set, so slightly
1012 // different than the normal addr64.
1013 if (!isUInt<12>(AM.BaseOffs))
1014 return false;
1015
1016 // FIXME: Since we can split immediate into soffset and immediate offset,
1017 // would it make sense to allow any immediate?
1018
1019 switch (AM.Scale) {
1020 case 0: // r + i or just i, depending on HasBaseReg.
1021 return true;
1022 case 1:
1023 return true; // We have r + r or r + i.
1024 case 2:
1025 if (AM.HasBaseReg) {
1026 // Reject 2 * r + r.
1027 return false;
1028 }
1029
1030 // Allow 2 * r as r + r
1031 // Or 2 * r + i is allowed as r + r + i.
1032 return true;
1033 default: // Don't allow n * r
1034 return false;
1035 }
1036}
1037
Mehdi Amini0cdec1e2015-07-09 02:09:40 +00001038bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1039 const AddrMode &AM, Type *Ty,
Jonas Paulsson024e3192017-07-21 11:59:37 +00001040 unsigned AS, Instruction *I) const {
Matt Arsenault5015a892014-08-15 17:17:07 +00001041 // No global is ever allowed as a base.
1042 if (AM.BaseGV)
1043 return false;
1044
Matt Arsenault0da63502018-08-31 05:49:54 +00001045 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001046 return isLegalGlobalAddressingMode(AM);
Matt Arsenault5015a892014-08-15 17:17:07 +00001047
Matt Arsenault0da63502018-08-31 05:49:54 +00001048 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1049 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001050 // If the offset isn't a multiple of 4, it probably isn't going to be
1051 // correctly aligned.
Matt Arsenault3cc1e002016-08-13 01:43:51 +00001052 // FIXME: Can we get the real alignment here?
Matt Arsenault711b3902015-08-07 20:18:34 +00001053 if (AM.BaseOffs % 4 != 0)
1054 return isLegalMUBUFAddressingMode(AM);
1055
1056 // There are no SMRD extloads, so if we have to do a small type access we
1057 // will use a MUBUF load.
1058 // FIXME?: We also need to do this if unaligned, but we don't know the
1059 // alignment here.
Stanislav Mekhanoshin57d341c2018-05-15 22:07:51 +00001060 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001061 return isLegalGlobalAddressingMode(AM);
Matt Arsenault711b3902015-08-07 20:18:34 +00001062
Tom Stellard5bfbae52018-07-11 20:59:01 +00001063 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001064 // SMRD instructions have an 8-bit, dword offset on SI.
1065 if (!isUInt<8>(AM.BaseOffs / 4))
1066 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001067 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001068 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1069 // in 8-bits, it can use a smaller encoding.
1070 if (!isUInt<32>(AM.BaseOffs / 4))
1071 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001072 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001073 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1074 if (!isUInt<20>(AM.BaseOffs))
1075 return false;
1076 } else
1077 llvm_unreachable("unhandled generation");
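    // Illustrative note (added commentary): for example, a uniform load at
    // constant byte offset 2048 is rejected on SI (2048/4 = 512 does not fit
    // the 8-bit dword field) but accepted on CI and VI, which allow 32-bit
    // dword and 20-bit byte offsets respectively.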
1078
1079 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1080 return true;
1081
1082 if (AM.Scale == 1 && AM.HasBaseReg)
1083 return true;
1084
1085 return false;
Matt Arsenault711b3902015-08-07 20:18:34 +00001086
Matt Arsenault0da63502018-08-31 05:49:54 +00001087 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001088 return isLegalMUBUFAddressingMode(AM);
Matt Arsenault0da63502018-08-31 05:49:54 +00001089 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1090 AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001091 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1092 // field.
1093 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1094 // an 8-bit dword offset but we don't know the alignment here.
1095 if (!isUInt<16>(AM.BaseOffs))
Matt Arsenault5015a892014-08-15 17:17:07 +00001096 return false;
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001097
1098 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1099 return true;
1100
1101 if (AM.Scale == 1 && AM.HasBaseReg)
1102 return true;
1103
Matt Arsenault5015a892014-08-15 17:17:07 +00001104 return false;
Matt Arsenault0da63502018-08-31 05:49:54 +00001105 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1106 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
Matt Arsenault7d1b6c82016-04-29 06:25:10 +00001107 // For an unknown address space, this usually means that this is for some
1108 // reason being used for pure arithmetic, and not based on some addressing
1109 // computation. We don't have instructions that compute pointers with any
1110 // addressing modes, so treat them as having no offset like flat
1111 // instructions.
Tom Stellard70580f82015-07-20 14:28:41 +00001112 return isLegalFlatAddressingMode(AM);
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00001113 } else {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001114 llvm_unreachable("unhandled address space");
1115 }
Matt Arsenault5015a892014-08-15 17:17:07 +00001116}
1117
Nirav Dave4dcad5d2017-07-10 20:25:54 +00001118bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1119 const SelectionDAG &DAG) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001120 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001121 return (MemVT.getSizeInBits() <= 4 * 32);
Matt Arsenault0da63502018-08-31 05:49:54 +00001122 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001123 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1124 return (MemVT.getSizeInBits() <= MaxPrivateBits);
Matt Arsenault0da63502018-08-31 05:49:54 +00001125 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001126 return (MemVT.getSizeInBits() <= 2 * 32);
1127 }
1128 return true;
1129}
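// Illustrative note (added commentary): the limits above mean, for example,
// that four adjacent i32 stores to global memory may still be merged into a
// single 128-bit (dwordx4) store, while LDS stores stop merging at 64 bits.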
1130
Matt Arsenaulte6986632015-01-14 01:35:22 +00001131bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001132 unsigned AddrSpace,
1133 unsigned Align,
1134 bool *IsFast) const {
Matt Arsenault1018c892014-04-24 17:08:26 +00001135 if (IsFast)
1136 *IsFast = false;
1137
Matt Arsenault1018c892014-04-24 17:08:26 +00001138 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1139 // which isn't a simple VT.
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001140 // Until MVT is extended to handle this, simply check for the size and
1141 // rely on the condition below: allow accesses if the size is a multiple of 4.
1142 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1143 VT.getStoreSize() > 16)) {
Tom Stellard81d871d2013-11-13 23:36:50 +00001144 return false;
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001145 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001146
Matt Arsenault0da63502018-08-31 05:49:54 +00001147 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1148 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001149 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1150 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1151 // with adjacent offsets.
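    // Illustrative example (added commentary): a 4-byte-aligned i64 LDS load
    // can be selected as ds_read2_b32 with adjacent offset0/offset1 values,
    // which is why 4-byte alignment is reported as fast here even though
    // ds_read_b64 itself would need 8-byte alignment.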
Sanjay Patelce74db92015-09-03 15:03:19 +00001152 bool AlignedBy4 = (Align % 4 == 0);
1153 if (IsFast)
1154 *IsFast = AlignedBy4;
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001155
Sanjay Patelce74db92015-09-03 15:03:19 +00001156 return AlignedBy4;
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001157 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001158
Tom Stellard64a9d082016-10-14 18:10:39 +00001159 // FIXME: We have to be conservative here and assume that flat operations
1160 // will access scratch. If we had access to the IR function, then we
1161 // could determine if any private memory was used in the function.
1162 if (!Subtarget->hasUnalignedScratchAccess() &&
Matt Arsenault0da63502018-08-31 05:49:54 +00001163 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1164 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
Matt Arsenaultf4320112018-09-24 13:18:15 +00001165 bool AlignedBy4 = Align >= 4;
1166 if (IsFast)
1167 *IsFast = AlignedBy4;
1168
1169 return AlignedBy4;
Tom Stellard64a9d082016-10-14 18:10:39 +00001170 }
1171
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001172 if (Subtarget->hasUnalignedBufferAccess()) {
1173 // If we have a uniform constant load, it still requires using a slow
1174 // buffer instruction if unaligned.
1175 if (IsFast) {
Matt Arsenault0da63502018-08-31 05:49:54 +00001176 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1177 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001178 (Align % 4 == 0) : true;
1179 }
1180
1181 return true;
1182 }
1183
Tom Stellard33e64c62015-02-04 20:49:52 +00001184 // Values smaller than a dword must be aligned.
Tom Stellard33e64c62015-02-04 20:49:52 +00001185 if (VT.bitsLT(MVT::i32))
1186 return false;
1187
Matt Arsenault1018c892014-04-24 17:08:26 +00001188 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1189 // byte-address are ignored, thus forcing Dword alignment.
Tom Stellarde812f2f2014-07-21 15:45:06 +00001190 // This applies to private, global, and constant memory.
Matt Arsenault1018c892014-04-24 17:08:26 +00001191 if (IsFast)
1192 *IsFast = true;
Tom Stellardc6b299c2015-02-02 18:02:28 +00001193
1194 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
Tom Stellard0125f2a2013-06-25 02:39:35 +00001195}
1196
Matt Arsenault46645fa2014-07-28 17:49:26 +00001197EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1198 unsigned SrcAlign, bool IsMemset,
1199 bool ZeroMemset,
1200 bool MemcpyStrSrc,
1201 MachineFunction &MF) const {
1202 // FIXME: Should account for address space here.
1203
1204 // The default fallback uses the private pointer size as a guess for a type to
1205 // use. Make sure we switch these to 64-bit accesses.
1206
1207 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1208 return MVT::v4i32;
1209
1210 if (Size >= 8 && DstAlign >= 4)
1211 return MVT::v2i32;
1212
1213 // Use the default.
1214 return MVT::Other;
1215}
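// Illustrative example (added commentary): a 64-byte memcpy with a
// 4-byte-aligned destination is expanded with v4i32 (dwordx4) accesses by
// the hook above instead of the smaller pointer-sized default guess,
// reducing the number of memory operations.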
1216
Matt Arsenault0da63502018-08-31 05:49:54 +00001217static bool isFlatGlobalAddrSpace(unsigned AS) {
1218 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1219 AS == AMDGPUAS::FLAT_ADDRESS ||
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001220 AS == AMDGPUAS::CONSTANT_ADDRESS ||
1221 AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001222}
1223
1224bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1225 unsigned DestAS) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001226 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001227}
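// Illustrative note (added commentary): under this definition a cast such as
// addrspacecast i8 addrspace(1)* %p to i8* (global -> flat) is a no-op,
// since both address spaces share the same 64-bit representation, whereas
// casts to or from LDS and scratch pointers are not treated as no-ops.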
1228
Alexander Timofeev18009562016-12-08 17:28:47 +00001229bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1230 const MemSDNode *MemNode = cast<MemSDNode>(N);
1231 const Value *Ptr = MemNode->getMemOperand()->getValue();
Matt Arsenault0a0c8712018-03-27 18:39:45 +00001232 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
Alexander Timofeev18009562016-12-08 17:28:47 +00001233 return I && I->getMetadata("amdgpu.noclobber");
1234}
1235
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001236bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1237 unsigned DestAS) const {
1238 // Flat -> private/local is a simple truncate.
1239 // Flat -> global is no-op
Matt Arsenault0da63502018-08-31 05:49:54 +00001240 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001241 return true;
1242
1243 return isNoopAddrSpaceCast(SrcAS, DestAS);
1244}
1245
Tom Stellarda6f24c62015-12-15 20:55:55 +00001246bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1247 const MemSDNode *MemNode = cast<MemSDNode>(N);
Tom Stellarda6f24c62015-12-15 20:55:55 +00001248
Matt Arsenaultbcf7bec2018-02-09 16:57:48 +00001249 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
Tom Stellarda6f24c62015-12-15 20:55:55 +00001250}
1251
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001252TargetLoweringBase::LegalizeTypeAction
Craig Topper0b5f8162018-11-05 23:26:13 +00001253SITargetLowering::getPreferredVectorAction(MVT VT) const {
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001254 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1255 return TypeSplitVector;
1256
1257 return TargetLoweringBase::getPreferredVectorAction(VT);
Tom Stellardd86003e2013-08-14 23:25:00 +00001258}
Tom Stellard0125f2a2013-06-25 02:39:35 +00001259
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001260bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1261 Type *Ty) const {
Matt Arsenault749035b2016-07-30 01:40:36 +00001262 // FIXME: Could be smarter if called for vector constants.
1263 return true;
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001264}
1265
Tom Stellard2e045bb2016-01-20 00:13:22 +00001266bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001267 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1268 switch (Op) {
1269 case ISD::LOAD:
1270 case ISD::STORE:
Tom Stellard2e045bb2016-01-20 00:13:22 +00001271
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001272 // These operations are done with 32-bit instructions anyway.
1273 case ISD::AND:
1274 case ISD::OR:
1275 case ISD::XOR:
1276 case ISD::SELECT:
1277 // TODO: Extensions?
1278 return true;
1279 default:
1280 return false;
1281 }
1282 }
Konstantin Zhuravlyove14df4b2016-09-28 20:05:39 +00001283
Tom Stellard2e045bb2016-01-20 00:13:22 +00001284 // SimplifySetCC uses this function to determine whether or not it should
1285 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1286 if (VT == MVT::i1 && Op == ISD::SETCC)
1287 return false;
1288
1289 return TargetLowering::isTypeDesirableForOp(Op, VT);
1290}
1291
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001292SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1293 const SDLoc &SL,
1294 SDValue Chain,
1295 uint64_t Offset) const {
Mehdi Aminia749f2a2015-07-09 02:09:52 +00001296 const DataLayout &DL = DAG.getDataLayout();
Tom Stellardec2e43c2014-09-22 15:35:29 +00001297 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001298 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1299
1300 const ArgDescriptor *InputPtrReg;
1301 const TargetRegisterClass *RC;
1302
1303 std::tie(InputPtrReg, RC)
1304 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Tom Stellard94593ee2013-06-03 17:40:18 +00001305
Matt Arsenault86033ca2014-07-28 17:31:39 +00001306 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
Matt Arsenault0da63502018-08-31 05:49:54 +00001307 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulta0269b62015-06-01 21:58:24 +00001308 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001309 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1310
Matt Arsenault2fb9ccf2018-05-29 17:42:38 +00001311 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
Jan Veselyfea814d2016-06-21 20:46:20 +00001312}
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00001313
Matt Arsenault9166ce82017-07-28 15:52:08 +00001314SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1315 const SDLoc &SL) const {
Matt Arsenault75e71922018-06-28 10:18:55 +00001316 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1317 FIRST_IMPLICIT);
Matt Arsenault9166ce82017-07-28 15:52:08 +00001318 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1319}
1320
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001321SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1322 const SDLoc &SL, SDValue Val,
1323 bool Signed,
Matt Arsenault6dca5422017-01-09 18:52:39 +00001324 const ISD::InputArg *Arg) const {
Matt Arsenault6dca5422017-01-09 18:52:39 +00001325 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1326 VT.bitsLT(MemVT)) {
1327 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1328 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1329 }
1330
Tom Stellardbc6c5232016-10-17 16:21:45 +00001331 if (MemVT.isFloatingPoint())
Matt Arsenault6dca5422017-01-09 18:52:39 +00001332 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001333 else if (Signed)
Matt Arsenault6dca5422017-01-09 18:52:39 +00001334 Val = DAG.getSExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001335 else
Matt Arsenault6dca5422017-01-09 18:52:39 +00001336 Val = DAG.getZExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001337
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001338 return Val;
1339}
1340
1341SDValue SITargetLowering::lowerKernargMemParameter(
1342 SelectionDAG &DAG, EVT VT, EVT MemVT,
1343 const SDLoc &SL, SDValue Chain,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001344 uint64_t Offset, unsigned Align, bool Signed,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001345 const ISD::InputArg *Arg) const {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001346 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00001347 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001348 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1349
Matt Arsenault90083d32018-06-07 09:54:49 +00001350 // Try to avoid using an extload by loading earlier than the argument address,
1351 // and extracting the relevant bits. The load should hopefully be merged with
1352 // the previous argument.
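  // Illustrative example (added commentary): an i16 kernarg at byte offset 2
  // is lowered below roughly as
  //   Load = load i32 at alignDown(2, 4) == 0
  //   Val  = trunc(Load >> (2 * 8))
  // so the dword load can hopefully be merged with the load of the
  // neighboring argument.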
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001353 if (MemVT.getStoreSize() < 4 && Align < 4) {
1354 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
Matt Arsenault90083d32018-06-07 09:54:49 +00001355 int64_t AlignDownOffset = alignDown(Offset, 4);
1356 int64_t OffsetDiff = Offset - AlignDownOffset;
1357
1358 EVT IntVT = MemVT.changeTypeToInteger();
1359
1360 // TODO: If we passed in the base kernel offset we could have a better
1361 // alignment than 4, but we don't really need it.
1362 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1363 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1364 MachineMemOperand::MODereferenceable |
1365 MachineMemOperand::MOInvariant);
1366
1367 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1368 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1369
1370 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1371 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1372 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1373
1374
1375 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1376 }
1377
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001378 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1379 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001380 MachineMemOperand::MODereferenceable |
1381 MachineMemOperand::MOInvariant);
1382
1383 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
Matt Arsenault6dca5422017-01-09 18:52:39 +00001384 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
Tom Stellard94593ee2013-06-03 17:40:18 +00001385}
1386
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001387SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1388 const SDLoc &SL, SDValue Chain,
1389 const ISD::InputArg &Arg) const {
1390 MachineFunction &MF = DAG.getMachineFunction();
1391 MachineFrameInfo &MFI = MF.getFrameInfo();
1392
1393 if (Arg.Flags.isByVal()) {
1394 unsigned Size = Arg.Flags.getByValSize();
1395 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1396 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1397 }
1398
1399 unsigned ArgOffset = VA.getLocMemOffset();
1400 unsigned ArgSize = VA.getValVT().getStoreSize();
1401
1402 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1403
1404 // Create load nodes to retrieve arguments from the stack.
1405 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1406 SDValue ArgValue;
1407
1408 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1409 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1410 MVT MemVT = VA.getValVT();
1411
1412 switch (VA.getLocInfo()) {
1413 default:
1414 break;
1415 case CCValAssign::BCvt:
1416 MemVT = VA.getLocVT();
1417 break;
1418 case CCValAssign::SExt:
1419 ExtType = ISD::SEXTLOAD;
1420 break;
1421 case CCValAssign::ZExt:
1422 ExtType = ISD::ZEXTLOAD;
1423 break;
1424 case CCValAssign::AExt:
1425 ExtType = ISD::EXTLOAD;
1426 break;
1427 }
1428
1429 ArgValue = DAG.getExtLoad(
1430 ExtType, SL, VA.getLocVT(), Chain, FIN,
1431 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1432 MemVT);
1433 return ArgValue;
1434}
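// Illustrative note (added commentary): stack-passed arguments to callable
// functions are rematerialized here as (possibly extending) loads from fixed
// frame objects, while a byval argument is returned as a bare frame index so
// the caller's copy is accessed in place.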
1435
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001436SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1437 const SIMachineFunctionInfo &MFI,
1438 EVT VT,
1439 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1440 const ArgDescriptor *Reg;
1441 const TargetRegisterClass *RC;
1442
1443 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1444 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1445}
1446
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001447static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1448 CallingConv::ID CallConv,
1449 ArrayRef<ISD::InputArg> Ins,
1450 BitVector &Skipped,
1451 FunctionType *FType,
1452 SIMachineFunctionInfo *Info) {
1453 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001454 const ISD::InputArg *Arg = &Ins[I];
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001455
Matt Arsenault55ab9212018-08-01 19:57:34 +00001456 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1457 "vector type argument should have been split");
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001458
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001459 // First check if it's a PS input addr.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001460 if (CallConv == CallingConv::AMDGPU_PS &&
1461 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001462
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001463 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1464
1465 // Inconveniently only the first part of the split is marked as isSplit,
1466 // so skip to the end. We only want to increment PSInputNum once for the
1467 // entire split argument.
1468 if (Arg->Flags.isSplit()) {
1469 while (!Arg->Flags.isSplitEnd()) {
1470 assert(!Arg->VT.isVector() &&
1471 "unexpected vector split in ps argument type");
1472 if (!SkipArg)
1473 Splits.push_back(*Arg);
1474 Arg = &Ins[++I];
1475 }
1476 }
1477
1478 if (SkipArg) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001479 // We can safely skip PS inputs.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001480 Skipped.set(Arg->getOrigArgIndex());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001481 ++PSInputNum;
1482 continue;
1483 }
1484
1485 Info->markPSInputAllocated(PSInputNum);
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001486 if (Arg->Used)
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001487 Info->markPSInputEnabled(PSInputNum);
1488
1489 ++PSInputNum;
1490 }
1491
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001492 Splits.push_back(*Arg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001493 }
1494}
1495
1496// Allocate special inputs passed in VGPRs.
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001497static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1498 MachineFunction &MF,
1499 const SIRegisterInfo &TRI,
1500 SIMachineFunctionInfo &Info) {
1501 if (Info.hasWorkItemIDX()) {
1502 unsigned Reg = AMDGPU::VGPR0;
1503 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001504
1505 CCInfo.AllocateReg(Reg);
1506 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1507 }
1508
1509 if (Info.hasWorkItemIDY()) {
1510 unsigned Reg = AMDGPU::VGPR1;
1511 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1512
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001513 CCInfo.AllocateReg(Reg);
1514 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1515 }
1516
1517 if (Info.hasWorkItemIDZ()) {
1518 unsigned Reg = AMDGPU::VGPR2;
1519 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1520
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001521 CCInfo.AllocateReg(Reg);
1522 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1523 }
1524}
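// Illustrative note (added commentary): for entry functions this pins the
// workitem ID convention to v0 = id.x, v1 = id.y, v2 = id.z, matching the
// VGPRs the hardware preloads at wave launch.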
1525
1526 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1527 // VGPRs are left, allocate a stack slot.
1528static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1529 ArrayRef<MCPhysReg> ArgVGPRs
1530 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1531 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1532 if (RegIdx == ArgVGPRs.size()) {
1533 // Spill to stack required.
1534 int64_t Offset = CCInfo.AllocateStack(4, 4);
1535
1536 return ArgDescriptor::createStack(Offset);
1537 }
1538
1539 unsigned Reg = ArgVGPRs[RegIdx];
1540 Reg = CCInfo.AllocateReg(Reg);
1541 assert(Reg != AMDGPU::NoRegister);
1542
1543 MachineFunction &MF = CCInfo.getMachineFunction();
1544 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1545 return ArgDescriptor::createRegister(Reg);
1546}
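// Illustrative note (added commentary): with the 32-entry pool above, the
// first 32 such inputs land in v0..v31; a further request falls back to a
// 4-byte stack slot allocated via CCInfo.AllocateStack(4, 4).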
1547
1548static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1549 const TargetRegisterClass *RC,
1550 unsigned NumArgRegs) {
1551 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1552 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1553 if (RegIdx == ArgSGPRs.size())
1554 report_fatal_error("ran out of SGPRs for arguments");
1555
1556 unsigned Reg = ArgSGPRs[RegIdx];
1557 Reg = CCInfo.AllocateReg(Reg);
1558 assert(Reg != AMDGPU::NoRegister);
1559
1560 MachineFunction &MF = CCInfo.getMachineFunction();
1561 MF.addLiveIn(Reg, RC);
1562 return ArgDescriptor::createRegister(Reg);
1563}
1564
1565static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1566 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1567}
1568
1569static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1570 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1571}
1572
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001573static void allocateSpecialInputVGPRs(CCState &CCInfo,
1574 MachineFunction &MF,
1575 const SIRegisterInfo &TRI,
1576 SIMachineFunctionInfo &Info) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001577 if (Info.hasWorkItemIDX())
1578 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001579
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001580 if (Info.hasWorkItemIDY())
1581 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001582
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001583 if (Info.hasWorkItemIDZ())
1584 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1585}
1586
1587static void allocateSpecialInputSGPRs(CCState &CCInfo,
1588 MachineFunction &MF,
1589 const SIRegisterInfo &TRI,
1590 SIMachineFunctionInfo &Info) {
1591 auto &ArgInfo = Info.getArgInfo();
1592
1593 // TODO: Unify handling with private memory pointers.
1594
1595 if (Info.hasDispatchPtr())
1596 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1597
1598 if (Info.hasQueuePtr())
1599 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1600
1601 if (Info.hasKernargSegmentPtr())
1602 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1603
1604 if (Info.hasDispatchID())
1605 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1606
1607 // flat_scratch_init is not applicable for non-kernel functions.
1608
1609 if (Info.hasWorkGroupIDX())
1610 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1611
1612 if (Info.hasWorkGroupIDY())
1613 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1614
1615 if (Info.hasWorkGroupIDZ())
1616 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
Matt Arsenault817c2532017-08-03 23:12:44 +00001617
1618 if (Info.hasImplicitArgPtr())
1619 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001620}
1621
1622// Allocate special inputs passed in user SGPRs.
1623static void allocateHSAUserSGPRs(CCState &CCInfo,
1624 MachineFunction &MF,
1625 const SIRegisterInfo &TRI,
1626 SIMachineFunctionInfo &Info) {
Matt Arsenault10fc0622017-06-26 03:01:31 +00001627 if (Info.hasImplicitBufferPtr()) {
1628 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1629 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1630 CCInfo.AllocateReg(ImplicitBufferPtrReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001631 }
1632
1633 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1634 if (Info.hasPrivateSegmentBuffer()) {
1635 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1636 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1637 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1638 }
1639
1640 if (Info.hasDispatchPtr()) {
1641 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1642 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1643 CCInfo.AllocateReg(DispatchPtrReg);
1644 }
1645
1646 if (Info.hasQueuePtr()) {
1647 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1648 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1649 CCInfo.AllocateReg(QueuePtrReg);
1650 }
1651
1652 if (Info.hasKernargSegmentPtr()) {
1653 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1654 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1655 CCInfo.AllocateReg(InputPtrReg);
1656 }
1657
1658 if (Info.hasDispatchID()) {
1659 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1660 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1661 CCInfo.AllocateReg(DispatchIDReg);
1662 }
1663
1664 if (Info.hasFlatScratchInit()) {
1665 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1666 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1667 CCInfo.AllocateReg(FlatScratchInitReg);
1668 }
1669
1670 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1671 // these from the dispatch pointer.
1672}
1673
1674// Allocate special input registers that are initialized per-wave.
1675static void allocateSystemSGPRs(CCState &CCInfo,
1676 MachineFunction &MF,
1677 SIMachineFunctionInfo &Info,
Marek Olsak584d2c02017-05-04 22:25:20 +00001678 CallingConv::ID CallConv,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001679 bool IsShader) {
1680 if (Info.hasWorkGroupIDX()) {
1681 unsigned Reg = Info.addWorkGroupIDX();
1682 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1683 CCInfo.AllocateReg(Reg);
1684 }
1685
1686 if (Info.hasWorkGroupIDY()) {
1687 unsigned Reg = Info.addWorkGroupIDY();
1688 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1689 CCInfo.AllocateReg(Reg);
1690 }
1691
1692 if (Info.hasWorkGroupIDZ()) {
1693 unsigned Reg = Info.addWorkGroupIDZ();
1694 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1695 CCInfo.AllocateReg(Reg);
1696 }
1697
1698 if (Info.hasWorkGroupInfo()) {
1699 unsigned Reg = Info.addWorkGroupInfo();
1700 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1701 CCInfo.AllocateReg(Reg);
1702 }
1703
1704 if (Info.hasPrivateSegmentWaveByteOffset()) {
1705 // Scratch wave offset passed in system SGPR.
1706 unsigned PrivateSegmentWaveByteOffsetReg;
1707
1708 if (IsShader) {
Marek Olsak584d2c02017-05-04 22:25:20 +00001709 PrivateSegmentWaveByteOffsetReg =
1710 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1711
1712 // This is true if the scratch wave byte offset doesn't have a fixed
1713 // location.
1714 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1715 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1716 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1717 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001718 } else
1719 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1720
1721 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1722 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1723 }
1724}
1725
1726static void reservePrivateMemoryRegs(const TargetMachine &TM,
1727 MachineFunction &MF,
1728 const SIRegisterInfo &TRI,
Matt Arsenault1cc47f82017-07-18 16:44:56 +00001729 SIMachineFunctionInfo &Info) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001730 // Now that we've figured out where the scratch register inputs are, see if
1731 // we should reserve the arguments and use them directly.
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001732 MachineFrameInfo &MFI = MF.getFrameInfo();
1733 bool HasStackObjects = MFI.hasStackObjects();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001734
1735 // Record that we know we have non-spill stack objects so we don't need to
1736 // check all stack objects later.
1737 if (HasStackObjects)
1738 Info.setHasNonSpillStackObjects(true);
1739
1740 // Everything live out of a block is spilled with fast regalloc, so it's
1741 // almost certain that spilling will be required.
1742 if (TM.getOptLevel() == CodeGenOpt::None)
1743 HasStackObjects = true;
1744
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001745 // For now assume stack access is needed in any callee functions, so we need
1746 // the scratch registers to pass in.
1747 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1748
Tom Stellard5bfbae52018-07-11 20:59:01 +00001749 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00001750 if (ST.isAmdHsaOrMesa(MF.getFunction())) {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001751 if (RequiresStackAccess) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001752 // If we have stack objects, we unquestionably need the private buffer
1753 // resource. For the Code Object V2 ABI, this will be the first 4 user
1754 // SGPR inputs. We can reserve those and use them directly.
1755
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001756 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1757 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001758 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1759
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001760 if (MFI.hasCalls()) {
1761 // If we have calls, we need to keep the frame register in a register
1762 // that won't be clobbered by a call, so ensure it is copied somewhere.
1763
1764 // This is not a problem for the scratch wave offset, because the same
1765 // registers are reserved in all functions.
1766
1767 // FIXME: Nothing is really ensuring this is a call preserved register,
1768 // it's just selected from the end so it happens to be.
1769 unsigned ReservedOffsetReg
1770 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1771 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1772 } else {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001773 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1774 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001775 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1776 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001777 } else {
1778 unsigned ReservedBufferReg
1779 = TRI.reservedPrivateSegmentBufferReg(MF);
1780 unsigned ReservedOffsetReg
1781 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1782
1783 // We tentatively reserve the last registers (skipping the last two
1784 // which may contain VCC). After register allocation, we'll replace
1785 // these with the ones immediately after those which were really
1786 // allocated. In the prologue, copies will be inserted from the argument
1787 // to these reserved registers.
1788 Info.setScratchRSrcReg(ReservedBufferReg);
1789 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1790 }
1791 } else {
1792 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1793
1794 // Without HSA, relocations are used for the scratch pointer and the
1795 // buffer resource setup is always inserted in the prologue. Scratch wave
1796 // offset is still in an input SGPR.
1797 Info.setScratchRSrcReg(ReservedBufferReg);
1798
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001799 if (HasStackObjects && !MFI.hasCalls()) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001800 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1801 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001802 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1803 } else {
1804 unsigned ReservedOffsetReg
1805 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1806 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1807 }
1808 }
1809}
1810
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001811bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1812 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1813 return !Info->isEntryFunction();
1814}
1815
1816void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1817
1818}
1819
1820void SITargetLowering::insertCopiesSplitCSR(
1821 MachineBasicBlock *Entry,
1822 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1823 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1824
1825 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1826 if (!IStart)
1827 return;
1828
1829 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1830 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1831 MachineBasicBlock::iterator MBBI = Entry->begin();
1832 for (const MCPhysReg *I = IStart; *I; ++I) {
1833 const TargetRegisterClass *RC = nullptr;
1834 if (AMDGPU::SReg_64RegClass.contains(*I))
1835 RC = &AMDGPU::SGPR_64RegClass;
1836 else if (AMDGPU::SReg_32RegClass.contains(*I))
1837 RC = &AMDGPU::SGPR_32RegClass;
1838 else
1839 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1840
1841 unsigned NewVR = MRI->createVirtualRegister(RC);
1842 // Create copy from CSR to a virtual register.
1843 Entry->addLiveIn(*I);
1844 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1845 .addReg(*I);
1846
1847 // Insert the copy-back instructions right before the terminator.
1848 for (auto *Exit : Exits)
1849 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1850 TII->get(TargetOpcode::COPY), *I)
1851 .addReg(NewVR);
1852 }
1853}
1854
Christian Konig2c8f6d52013-03-07 09:03:52 +00001855SDValue SITargetLowering::LowerFormalArguments(
Eric Christopher7792e322015-01-30 23:24:40 +00001856 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001857 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1858 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001859 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001860
1861 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaultceafc552018-05-29 17:42:50 +00001862 const Function &Fn = MF.getFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00001863 FunctionType *FType = MF.getFunction().getFunctionType();
Christian Konig99ee0f42013-03-07 09:04:14 +00001864 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001865
Nicolai Haehnledf3a20c2016-04-06 19:40:20 +00001866 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001867 DiagnosticInfoUnsupported NoGraphicsHSA(
Matthias Braunf1caa282017-12-15 22:22:58 +00001868 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
Matt Arsenaultd48da142015-11-02 23:23:02 +00001869 DAG.getContext()->diagnose(NoGraphicsHSA);
Diana Picus81bc3172016-05-26 15:24:55 +00001870 return DAG.getEntryNode();
Matt Arsenaultd48da142015-11-02 23:23:02 +00001871 }
1872
Christian Konig2c8f6d52013-03-07 09:03:52 +00001873 SmallVector<ISD::InputArg, 16> Splits;
Christian Konig2c8f6d52013-03-07 09:03:52 +00001874 SmallVector<CCValAssign, 16> ArgLocs;
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001875 BitVector Skipped(Ins.size());
Eric Christopherb5217502014-08-06 18:45:26 +00001876 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1877 *DAG.getContext());
Christian Konig2c8f6d52013-03-07 09:03:52 +00001878
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001879 bool IsShader = AMDGPU::isShader(CallConv);
Matt Arsenaultefa9f4b2017-04-11 22:29:28 +00001880 bool IsKernel = AMDGPU::isKernel(CallConv);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001881 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
Christian Konig99ee0f42013-03-07 09:04:14 +00001882
Matt Arsenaultd1867c02017-08-02 00:59:51 +00001883 if (!IsEntryFunc) {
1884 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1885 // this when allocating argument fixed offsets.
1886 CCInfo.AllocateStack(4, 4);
1887 }
1888
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001889 if (IsShader) {
1890 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1891
1892 // At least one interpolation mode must be enabled or else the GPU will
1893 // hang.
1894 //
1895 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1896 // set PSInputAddr, the user wants to enable some bits after the compilation
1897 // based on run-time states. Since we can't know what the final PSInputEna
1898 // will look like, we shouldn't do anything here and the user should take
1899 // responsibility for the correct programming.
1900 //
1901 // Otherwise, the following restrictions apply:
1902 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1903 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1904 // enabled too.
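    // Illustrative example (added commentary, not part of the original
    // comment): a pixel shader that reads no interpolants would otherwise
    // reach here with PSInputAddr == 0, so the code below force-enables
    // input 0 (PERSP_SAMPLE) and allocates VGPR0/VGPR1 to satisfy the
    // hardware requirement.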
Tim Renoufc8ffffe2017-10-12 16:16:41 +00001905 if (CallConv == CallingConv::AMDGPU_PS) {
1906 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1907 ((Info->getPSInputAddr() & 0xF) == 0 &&
1908 Info->isPSInputAllocated(11))) {
1909 CCInfo.AllocateReg(AMDGPU::VGPR0);
1910 CCInfo.AllocateReg(AMDGPU::VGPR1);
1911 Info->markPSInputAllocated(0);
1912 Info->markPSInputEnabled(0);
1913 }
1914 if (Subtarget->isAmdPalOS()) {
1915 // For isAmdPalOS, the user does not enable some bits after compilation
1916 // based on run-time states; the register values being generated here are
1917 // the final ones set in hardware. Therefore we need to apply the
1918 // workaround to PSInputAddr and PSInputEnable together. (The case where
1919 // a bit is set in PSInputAddr but not PSInputEnable is where the
1920 // frontend set up an input arg for a particular interpolation mode, but
1921 // nothing uses that input arg. Really we should have an earlier pass
1922 // that removes such an arg.)
1923 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1924 if ((PsInputBits & 0x7F) == 0 ||
1925 ((PsInputBits & 0xF) == 0 &&
1926 (PsInputBits >> 11 & 1)))
1927 Info->markPSInputEnabled(
1928 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1929 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001930 }
1931
Tom Stellard2f3f9852017-01-25 01:25:13 +00001932 assert(!Info->hasDispatchPtr() &&
Tom Stellardf110f8f2016-04-14 16:27:03 +00001933 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1934 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1935 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1936 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1937 !Info->hasWorkItemIDZ());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001938 } else if (IsKernel) {
1939 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001940 } else {
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001941 Splits.append(Ins.begin(), Ins.end());
Tom Stellardaf775432013-10-23 00:44:32 +00001942 }
1943
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001944 if (IsEntryFunc) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001945 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001946 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
Tom Stellard2f3f9852017-01-25 01:25:13 +00001947 }
1948
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001949 if (IsKernel) {
Tom Stellardbbeb45a2016-09-16 21:53:00 +00001950 analyzeFormalArgumentsCompute(CCInfo, Ins);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001951 } else {
1952 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1953 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1954 }
Christian Konig2c8f6d52013-03-07 09:03:52 +00001955
Matt Arsenaultcf13d182015-07-10 22:51:36 +00001956 SmallVector<SDValue, 16> Chains;
1957
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001958 // FIXME: This is the minimum kernel argument alignment. We should improve
1959 // this to the maximum alignment of the arguments.
1960 //
1961 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1962 // kern arg offset.
1963 const unsigned KernelArgBaseAlign = 16;
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001964
1965 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
Christian Konigb7be72d2013-05-17 09:46:48 +00001966 const ISD::InputArg &Arg = Ins[i];
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001967 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
Christian Konigb7be72d2013-05-17 09:46:48 +00001968 InVals.push_back(DAG.getUNDEF(Arg.VT));
Christian Konig99ee0f42013-03-07 09:04:14 +00001969 continue;
1970 }
1971
Christian Konig2c8f6d52013-03-07 09:03:52 +00001972 CCValAssign &VA = ArgLocs[ArgIdx++];
Craig Topper7f416c82014-11-16 21:17:18 +00001973 MVT VT = VA.getLocVT();
Tom Stellarded882c22013-06-03 17:40:11 +00001974
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001975 if (IsEntryFunc && VA.isMemLoc()) {
Tom Stellardaf775432013-10-23 00:44:32 +00001976 VT = Ins[i].VT;
Tom Stellardbbeb45a2016-09-16 21:53:00 +00001977 EVT MemVT = VA.getLocVT();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001978
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001979 const uint64_t Offset = VA.getLocMemOffset();
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001980 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001981
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001982 SDValue Arg = lowerKernargMemParameter(
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001983 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Matt Arsenaultcf13d182015-07-10 22:51:36 +00001984 Chains.push_back(Arg.getValue(1));
Tom Stellardca7ecf32014-08-22 18:49:31 +00001985
Craig Toppere3dcce92015-08-01 22:20:21 +00001986 auto *ParamTy =
Andrew Trick05938a52015-02-16 18:10:47 +00001987 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
Tom Stellard5bfbae52018-07-11 20:59:01 +00001988 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenaultcdd191d2019-01-28 20:14:49 +00001989 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1990 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
Tom Stellardca7ecf32014-08-22 18:49:31 +00001991 // On SI local pointers are just offsets into LDS, so they are always
1992 // less than 16-bits. On CI and newer they could potentially be
1993 // real pointers, so we can't guarantee their size.
1994 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1995 DAG.getValueType(MVT::i16));
1996 }
1997
Tom Stellarded882c22013-06-03 17:40:11 +00001998 InVals.push_back(Arg);
1999 continue;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002000 } else if (!IsEntryFunc && VA.isMemLoc()) {
2001 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2002 InVals.push_back(Val);
2003 if (!Arg.Flags.isByVal())
2004 Chains.push_back(Val.getValue(1));
2005 continue;
Tom Stellarded882c22013-06-03 17:40:11 +00002006 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002007
Christian Konig2c8f6d52013-03-07 09:03:52 +00002008 assert(VA.isRegLoc() && "Parameter must be in a register!");
2009
2010 unsigned Reg = VA.getLocReg();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002011 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
Matt Arsenaultb3463552017-07-15 05:52:59 +00002012 EVT ValVT = VA.getValVT();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002013
2014 Reg = MF.addLiveIn(Reg, RC);
2015 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2016
Matt Arsenault45b98182017-11-15 00:45:43 +00002017 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2018 // The return object should be reasonably addressable.
2019
2020 // FIXME: This helps when the return is a real sret. If it is an
2021 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2022 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2023 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2024 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2025 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2026 }
2027
Matt Arsenaultb3463552017-07-15 05:52:59 +00002028 // If this is an 8 or 16-bit value, it is really passed promoted
2029 // to 32 bits. Insert an assert[sz]ext to capture this, then
2030 // truncate to the right size.
2031 switch (VA.getLocInfo()) {
2032 case CCValAssign::Full:
2033 break;
2034 case CCValAssign::BCvt:
2035 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2036 break;
2037 case CCValAssign::SExt:
2038 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2039 DAG.getValueType(ValVT));
2040 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2041 break;
2042 case CCValAssign::ZExt:
2043 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2044 DAG.getValueType(ValVT));
2045 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2046 break;
2047 case CCValAssign::AExt:
2048 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2049 break;
2050 default:
2051 llvm_unreachable("Unknown loc info!");
2052 }
2053
Christian Konig2c8f6d52013-03-07 09:03:52 +00002054 InVals.push_back(Val);
2055 }
Tom Stellarde99fb652015-01-20 19:33:04 +00002056
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002057 if (!IsEntryFunc) {
2058 // Special inputs come after user arguments.
2059 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2060 }
2061
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002062 // Start adding system SGPRs.
2063 if (IsEntryFunc) {
2064 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002065 } else {
2066 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2067 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2068 CCInfo.AllocateReg(Info->getFrameOffsetReg());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002069 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002070 }
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002071
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002072 auto &ArgUsageInfo =
2073 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
Matt Arsenaultceafc552018-05-29 17:42:50 +00002074 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002075
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002076 unsigned StackArgSize = CCInfo.getNextStackOffset();
2077 Info->setBytesInStackArgArea(StackArgSize);
2078
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002079 return Chains.empty() ? Chain :
2080 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Christian Konig2c8f6d52013-03-07 09:03:52 +00002081}
2082
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002083// TODO: If return values can't fit in registers, we should return as many as
2084// possible in registers before passing on stack.
2085bool SITargetLowering::CanLowerReturn(
2086 CallingConv::ID CallConv,
2087 MachineFunction &MF, bool IsVarArg,
2088 const SmallVectorImpl<ISD::OutputArg> &Outs,
2089 LLVMContext &Context) const {
2090 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2091 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2092 // for shaders. Vector types should be explicitly handled by CC.
2093 if (AMDGPU::isEntryFunctionCC(CallConv))
2094 return true;
2095
2096 SmallVector<CCValAssign, 16> RVLocs;
2097 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2098 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2099}
2100
Benjamin Kramerbdc49562016-06-12 15:39:02 +00002101SDValue
2102SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2103 bool isVarArg,
2104 const SmallVectorImpl<ISD::OutputArg> &Outs,
2105 const SmallVectorImpl<SDValue> &OutVals,
2106 const SDLoc &DL, SelectionDAG &DAG) const {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002107 MachineFunction &MF = DAG.getMachineFunction();
2108 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2109
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002110 if (AMDGPU::isKernel(CallConv)) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002111 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2112 OutVals, DL, DAG);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002113 }
2114
2115 bool IsShader = AMDGPU::isShader(CallConv);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002116
Matt Arsenault55ab9212018-08-01 19:57:34 +00002117 Info->setIfReturnsVoid(Outs.empty());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002118 bool IsWaveEnd = Info->returnsVoid() && IsShader;
Marek Olsak8e9cc632016-01-13 17:23:09 +00002119
Marek Olsak8a0f3352016-01-13 17:23:04 +00002120 // CCValAssign - represent the assignment of the return value to a location.
2121 SmallVector<CCValAssign, 48> RVLocs;
Matt Arsenault55ab9212018-08-01 19:57:34 +00002122 SmallVector<ISD::OutputArg, 48> Splits;
Marek Olsak8a0f3352016-01-13 17:23:04 +00002123
2124 // CCState - Info about the registers and stack slots.
2125 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2126 *DAG.getContext());
2127
2128 // Analyze outgoing return values.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002129 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
Marek Olsak8a0f3352016-01-13 17:23:04 +00002130
2131 SDValue Flag;
2132 SmallVector<SDValue, 48> RetOps;
2133 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2134
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002135 // Add return address for callable functions.
2136 if (!Info->isEntryFunction()) {
2137 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2138 SDValue ReturnAddrReg = CreateLiveInRegister(
2139 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2140
2141 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2142 // from being allocated to a CSR.
2143
2144 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2145 MVT::i64);
2146
2147 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2148 Flag = Chain.getValue(1);
2149
2150 RetOps.push_back(PhysReturnAddrReg);
2151 }
2152
Marek Olsak8a0f3352016-01-13 17:23:04 +00002153 // Copy the result values into the output registers.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002154 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2155 ++I, ++RealRVLocIdx) {
2156 CCValAssign &VA = RVLocs[I];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002157 assert(VA.isRegLoc() && "Can only return in registers!");
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002158 // TODO: Partially return in registers if return values don't fit.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002159 SDValue Arg = OutVals[RealRVLocIdx];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002160
2161 // Copied from other backends.
2162 switch (VA.getLocInfo()) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002163 case CCValAssign::Full:
2164 break;
2165 case CCValAssign::BCvt:
2166 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2167 break;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002168 case CCValAssign::SExt:
2169 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2170 break;
2171 case CCValAssign::ZExt:
2172 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2173 break;
2174 case CCValAssign::AExt:
2175 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2176 break;
2177 default:
2178 llvm_unreachable("Unknown loc info!");
Marek Olsak8a0f3352016-01-13 17:23:04 +00002179 }
2180
2181 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2182 Flag = Chain.getValue(1);
2183 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2184 }
2185
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002186 // FIXME: Does sret work properly?
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002187 if (!Info->isEntryFunction()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00002188 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002189 const MCPhysReg *I =
2190 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2191 if (I) {
2192 for (; *I; ++I) {
2193 if (AMDGPU::SReg_64RegClass.contains(*I))
2194 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2195 else if (AMDGPU::SReg_32RegClass.contains(*I))
2196 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2197 else
2198 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2199 }
2200 }
2201 }
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002202
Marek Olsak8a0f3352016-01-13 17:23:04 +00002203 // Update chain and glue.
2204 RetOps[0] = Chain;
2205 if (Flag.getNode())
2206 RetOps.push_back(Flag);
2207
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002208 unsigned Opc = AMDGPUISD::ENDPGM;
2209 if (!IsWaveEnd)
2210 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
Matt Arsenault9babdf42016-06-22 20:15:28 +00002211 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002212}
2213
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002214SDValue SITargetLowering::LowerCallResult(
2215 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2216 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2217 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2218 SDValue ThisVal) const {
2219 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2220
2221 // Assign locations to each value returned by this call.
2222 SmallVector<CCValAssign, 16> RVLocs;
2223 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2224 *DAG.getContext());
2225 CCInfo.AnalyzeCallResult(Ins, RetCC);
2226
2227 // Copy all of the result registers out of their specified physreg.
2228 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2229 CCValAssign VA = RVLocs[i];
2230 SDValue Val;
2231
2232 if (VA.isRegLoc()) {
2233 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2234 Chain = Val.getValue(1);
2235 InFlag = Val.getValue(2);
2236 } else if (VA.isMemLoc()) {
2237 report_fatal_error("TODO: return values in memory");
2238 } else
2239 llvm_unreachable("unknown argument location type");
2240
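    // Convert from the location type used for the physreg back to the IR
    // value type, re-truncating values that were extended for the call.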
2241 switch (VA.getLocInfo()) {
2242 case CCValAssign::Full:
2243 break;
2244 case CCValAssign::BCvt:
2245 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2246 break;
2247 case CCValAssign::ZExt:
2248 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2249 DAG.getValueType(VA.getValVT()));
2250 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2251 break;
2252 case CCValAssign::SExt:
2253 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2254 DAG.getValueType(VA.getValVT()));
2255 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2256 break;
2257 case CCValAssign::AExt:
2258 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2259 break;
2260 default:
2261 llvm_unreachable("Unknown loc info!");
2262 }
2263
2264 InVals.push_back(Val);
2265 }
2266
2267 return Chain;
2268}
2269
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002270// Add code to pass special inputs required depending on used features separate
2271// from the explicit user arguments present in the IR.
2272void SITargetLowering::passSpecialInputs(
2273 CallLoweringInfo &CLI,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002274 CCState &CCInfo,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002275 const SIMachineFunctionInfo &Info,
2276 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2277 SmallVectorImpl<SDValue> &MemOpChains,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002278 SDValue Chain) const {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002279 // If we don't have a call site, this was a call inserted by
2280 // legalization. These can never use special inputs.
2281 if (!CLI.CS)
2282 return;
2283
2284 const Function *CalleeFunc = CLI.CS.getCalledFunction();
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002285 assert(CalleeFunc);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002286
2287 SelectionDAG &DAG = CLI.DAG;
2288 const SDLoc &DL = CLI.DL;
2289
Tom Stellardc5a154d2018-06-28 23:47:12 +00002290 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002291
2292 auto &ArgUsageInfo =
2293 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2294 const AMDGPUFunctionArgInfo &CalleeArgInfo
2295 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2296
2297 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2298
2299 // TODO: Unify with private memory register handling. This is complicated by
2300 // the fact that at least in kernels, the input argument is not necessarily
2301 // in the same location as the input.
2302 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2303 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2304 AMDGPUFunctionArgInfo::QUEUE_PTR,
2305 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2306 AMDGPUFunctionArgInfo::DISPATCH_ID,
2307 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2308 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2309 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2310 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2311 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
Matt Arsenault817c2532017-08-03 23:12:44 +00002312 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2313 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002314 };
2315
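  // For each special input the callee expects, forward the caller's copy (or
  // rematerialize it) into the register or stack slot the callee expects.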
2316 for (auto InputID : InputRegs) {
2317 const ArgDescriptor *OutgoingArg;
2318 const TargetRegisterClass *ArgRC;
2319
2320 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2321 if (!OutgoingArg)
2322 continue;
2323
2324 const ArgDescriptor *IncomingArg;
2325 const TargetRegisterClass *IncomingArgRC;
2326 std::tie(IncomingArg, IncomingArgRC)
2327 = CallerArgInfo.getPreloadedValue(InputID);
2328 assert(IncomingArgRC == ArgRC);
2329
2330 // All special arguments are ints for now.
2331 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
Matt Arsenault817c2532017-08-03 23:12:44 +00002332 SDValue InputReg;
2333
2334 if (IncomingArg) {
2335 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2336 } else {
2337 // The implicit arg ptr is special because it doesn't have a corresponding
2338 // input for kernels, and is computed from the kernarg segment pointer.
2339 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2340 InputReg = getImplicitArgPtr(DAG, DL);
2341 }
2342
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002343 if (OutgoingArg->isRegister()) {
2344 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2345 } else {
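      // No register was assigned for this input, so pass it on the stack.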
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002346 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2347 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2348 SpecialArgOffset);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002349 MemOpChains.push_back(ArgStore);
2350 }
2351 }
2352}
2353
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002354static bool canGuaranteeTCO(CallingConv::ID CC) {
2355 return CC == CallingConv::Fast;
2356}
2357
2358/// Return true if we might ever do TCO for calls with this calling convention.
2359static bool mayTailCallThisCC(CallingConv::ID CC) {
2360 switch (CC) {
2361 case CallingConv::C:
2362 return true;
2363 default:
2364 return canGuaranteeTCO(CC);
2365 }
2366}
2367
2368bool SITargetLowering::isEligibleForTailCallOptimization(
2369 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2370 const SmallVectorImpl<ISD::OutputArg> &Outs,
2371 const SmallVectorImpl<SDValue> &OutVals,
2372 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2373 if (!mayTailCallThisCC(CalleeCC))
2374 return false;
2375
2376 MachineFunction &MF = DAG.getMachineFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00002377 const Function &CallerF = MF.getFunction();
2378 CallingConv::ID CallerCC = CallerF.getCallingConv();
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002379 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2380 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2381
2382  // Kernels aren't callable, and don't have a live-in return address, so it
2383 // doesn't make sense to do a tail call with entry functions.
2384 if (!CallerPreserved)
2385 return false;
2386
2387 bool CCMatch = CallerCC == CalleeCC;
2388
2389 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2390 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2391 return true;
2392 return false;
2393 }
2394
2395 // TODO: Can we handle var args?
2396 if (IsVarArg)
2397 return false;
2398
Matthias Braunf1caa282017-12-15 22:22:58 +00002399 for (const Argument &Arg : CallerF.args()) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002400 if (Arg.hasByValAttr())
2401 return false;
2402 }
2403
2404 LLVMContext &Ctx = *DAG.getContext();
2405
2406 // Check that the call results are passed in the same way.
2407 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2408 CCAssignFnForCall(CalleeCC, IsVarArg),
2409 CCAssignFnForCall(CallerCC, IsVarArg)))
2410 return false;
2411
2412 // The callee has to preserve all registers the caller needs to preserve.
2413 if (!CCMatch) {
2414 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2415 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2416 return false;
2417 }
2418
2419 // Nothing more to check if the callee is taking no arguments.
2420 if (Outs.empty())
2421 return true;
2422
2423 SmallVector<CCValAssign, 16> ArgLocs;
2424 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2425
2426 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2427
2428 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2429 // If the stack arguments for this call do not fit into our own save area then
2430 // the call cannot be made tail.
2431 // TODO: Is this really necessary?
2432 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2433 return false;
2434
2435 const MachineRegisterInfo &MRI = MF.getRegInfo();
2436 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2437}
2438
2439bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2440 if (!CI->isTailCall())
2441 return false;
2442
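  // Calls inside entry functions can never be emitted as tail calls; kernels
  // have no return address to pass on.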
2443 const Function *ParentFn = CI->getParent()->getParent();
2444 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2445 return false;
2446
2447 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2448 return (Attr.getValueAsString() != "true");
2449}
2450
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002451// The wave scratch offset register is used as the global base pointer.
2452SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2453 SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002454 SelectionDAG &DAG = CLI.DAG;
2455 const SDLoc &DL = CLI.DL;
2456 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2457 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2458 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2459 SDValue Chain = CLI.Chain;
2460 SDValue Callee = CLI.Callee;
2461 bool &IsTailCall = CLI.IsTailCall;
2462 CallingConv::ID CallConv = CLI.CallConv;
2463 bool IsVarArg = CLI.IsVarArg;
2464 bool IsSibCall = false;
2465 bool IsThisReturn = false;
2466 MachineFunction &MF = DAG.getMachineFunction();
2467
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002468 if (IsVarArg) {
2469 return lowerUnhandledCall(CLI, InVals,
2470 "unsupported call to variadic function ");
2471 }
2472
Matt Arsenault935f3b72018-08-08 16:58:39 +00002473 if (!CLI.CS.getInstruction())
2474 report_fatal_error("unsupported libcall legalization");
2475
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002476 if (!CLI.CS.getCalledFunction()) {
2477 return lowerUnhandledCall(CLI, InVals,
2478 "unsupported indirect call to function ");
2479 }
2480
2481 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2482 return lowerUnhandledCall(CLI, InVals,
2483 "unsupported required tail call to function ");
2484 }
2485
Matt Arsenault1fb90132018-06-28 10:18:36 +00002486 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2487 // Note the issue is with the CC of the calling function, not of the call
2488 // itself.
2489 return lowerUnhandledCall(CLI, InVals,
2490 "unsupported call from graphics shader of function ");
2491 }
2492
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002494 if (IsTailCall) {
2495 IsTailCall = isEligibleForTailCallOptimization(
2496 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2497 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2498 report_fatal_error("failed to perform tail call elimination on a call "
2499 "site marked musttail");
2500 }
2501
2502 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2503
2504 // A sibling call is one where we're under the usual C ABI and not planning
2505 // to change that but can still do a tail call:
2506 if (!TailCallOpt && IsTailCall)
2507 IsSibCall = true;
2508
2509 if (IsTailCall)
2510 ++NumTailCalls;
2511 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002512
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002513 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2514
2515 // Analyze operands of the call, assigning locations to each operand.
2516 SmallVector<CCValAssign, 16> ArgLocs;
2517 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2518 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002519
2520 // The first 4 bytes are reserved for the callee's emergency stack slot.
2521 CCInfo.AllocateStack(4, 4);
2522
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002523 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2524
2525 // Get a count of how many bytes are to be pushed on the stack.
2526 unsigned NumBytes = CCInfo.getNextStackOffset();
2527
2528 if (IsSibCall) {
2529 // Since we're not changing the ABI to make this a tail call, the memory
2530 // operands are already available in the caller's incoming argument space.
2531 NumBytes = 0;
2532 }
2533
2534 // FPDiff is the byte offset of the call's argument area from the callee's.
2535 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2536 // by this amount for a tail call. In a sibling call it must be 0 because the
2537 // caller will deallocate the entire stack and the callee still expects its
2538 // arguments to begin at SP+0. Completely unused for non-tail calls.
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002539 int32_t FPDiff = 0;
2540 MachineFrameInfo &MFI = MF.getFrameInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002541 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2542
Matt Arsenault6efd0822017-09-14 17:14:57 +00002543 SDValue CallerSavedFP;
2544
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002545 // Adjust the stack pointer for the new arguments...
2546 // These operations are automatically eliminated by the prolog/epilog pass
2547 if (!IsSibCall) {
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002548 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002549
2550 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2551
2552 // In the HSA case, this should be an identity copy.
2553 SDValue ScratchRSrcReg
2554 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2555 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2556
2557 // TODO: Don't hardcode these registers and get from the callee function.
2558 SDValue ScratchWaveOffsetReg
2559 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2560 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
Matt Arsenault6efd0822017-09-14 17:14:57 +00002561
2562 if (!Info->isEntryFunction()) {
2563 // Avoid clobbering this function's FP value. In the current convention
2564 // callee will overwrite this, so do save/restore around the call site.
2565 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2566 Info->getFrameOffsetReg(), MVT::i32);
2567 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002568 }
2569
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002570 SmallVector<SDValue, 8> MemOpChains;
2571 MVT PtrVT = MVT::i32;
2572
2573 // Walk the register/memloc assignments, inserting copies/loads.
2574 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2575 ++i, ++realArgIdx) {
2576 CCValAssign &VA = ArgLocs[i];
2577 SDValue Arg = OutVals[realArgIdx];
2578
2579 // Promote the value if needed.
2580 switch (VA.getLocInfo()) {
2581 case CCValAssign::Full:
2582 break;
2583 case CCValAssign::BCvt:
2584 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2585 break;
2586 case CCValAssign::ZExt:
2587 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2588 break;
2589 case CCValAssign::SExt:
2590 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2591 break;
2592 case CCValAssign::AExt:
2593 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2594 break;
2595 case CCValAssign::FPExt:
2596 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2597 break;
2598 default:
2599 llvm_unreachable("Unknown loc info!");
2600 }
2601
2602 if (VA.isRegLoc()) {
2603 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2604 } else {
2605 assert(VA.isMemLoc());
2606
2607 SDValue DstAddr;
2608 MachinePointerInfo DstInfo;
2609
2610 unsigned LocMemOffset = VA.getLocMemOffset();
2611 int32_t Offset = LocMemOffset;
Matt Arsenaultb655fa92017-11-29 01:25:12 +00002612
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002613 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002614 unsigned Align = 0;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002615
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002616 if (IsTailCall) {
2617 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2618 unsigned OpSize = Flags.isByVal() ?
2619 Flags.getByValSize() : VA.getValVT().getStoreSize();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002620
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002621        // FIXME: We can have better than the minimum required byval alignment.
2622 Align = Flags.isByVal() ? Flags.getByValAlign() :
2623 MinAlign(Subtarget->getStackAlignment(), Offset);
2624
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002625 Offset = Offset + FPDiff;
2626 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2627
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002628 DstAddr = DAG.getFrameIndex(FI, PtrVT);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002629 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2630
2631 // Make sure any stack arguments overlapping with where we're storing
2632 // are loaded before this eventual operation. Otherwise they'll be
2633 // clobbered.
2634
2635 // FIXME: Why is this really necessary? This seems to just result in a
2636 // lot of code to copy the stack and write them back to the same
2637 // locations, which are supposed to be immutable?
2638 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2639 } else {
2640 DstAddr = PtrOff;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002641 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002642 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002643 }
2644
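      // By-value arguments are copied to their stack location with a memcpy;
      // everything else is written with a plain store.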
2645 if (Outs[i].Flags.isByVal()) {
2646 SDValue SizeNode =
2647 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2648 SDValue Cpy = DAG.getMemcpy(
2649 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2650 /*isVol = */ false, /*AlwaysInline = */ true,
Yaxun Liuc5962262017-11-22 16:13:35 +00002651 /*isTailCall = */ false, DstInfo,
2652 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
Matt Arsenault0da63502018-08-31 05:49:54 +00002653 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002654
2655 MemOpChains.push_back(Cpy);
2656 } else {
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002657 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002658 MemOpChains.push_back(Store);
2659 }
2660 }
2661 }
2662
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002663 // Copy special input registers after user input arguments.
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002664 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002665
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002666 if (!MemOpChains.empty())
2667 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2668
2669 // Build a sequence of copy-to-reg nodes chained together with token chain
2670 // and flag operands which copy the outgoing args into the appropriate regs.
2671 SDValue InFlag;
2672 for (auto &RegToPass : RegsToPass) {
2673 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2674 RegToPass.second, InFlag);
2675 InFlag = Chain.getValue(1);
2676 }
2677
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002678
2679 SDValue PhysReturnAddrReg;
2680 if (IsTailCall) {
2681 // Since the return is being combined with the call, we need to pass on the
2682 // return address.
2683
2684 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2685 SDValue ReturnAddrReg = CreateLiveInRegister(
2686 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2687
2688 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2689 MVT::i64);
2690 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2691 InFlag = Chain.getValue(1);
2692 }
2693
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002694 // We don't usually want to end the call-sequence here because we would tidy
2695  // the frame up *after* the call. However, in the ABI-changing tail-call case
2696 // we've carefully laid out the parameters so that when sp is reset they'll be
2697 // in the correct location.
2698 if (IsTailCall && !IsSibCall) {
2699 Chain = DAG.getCALLSEQ_END(Chain,
2700 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2701 DAG.getTargetConstant(0, DL, MVT::i32),
2702 InFlag, DL);
2703 InFlag = Chain.getValue(1);
2704 }
2705
2706 std::vector<SDValue> Ops;
2707 Ops.push_back(Chain);
2708 Ops.push_back(Callee);
Scott Linderd19d1972019-02-04 20:00:07 +00002709 // Add a redundant copy of the callee global which will not be legalized, as
2710 // we need direct access to the callee later.
2711 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2712 const GlobalValue *GV = GSD->getGlobal();
2713 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002714
2715 if (IsTailCall) {
2716 // Each tail call may have to adjust the stack by a different amount, so
2717 // this information must travel along with the operation for eventual
2718 // consumption by emitEpilogue.
2719 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002720
2721 Ops.push_back(PhysReturnAddrReg);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002722 }
2723
2724 // Add argument registers to the end of the list so that they are known live
2725 // into the call.
2726 for (auto &RegToPass : RegsToPass) {
2727 Ops.push_back(DAG.getRegister(RegToPass.first,
2728 RegToPass.second.getValueType()));
2729 }
2730
2731 // Add a register mask operand representing the call-preserved registers.
2732
Tom Stellardc5a154d2018-06-28 23:47:12 +00002733 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002734 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2735 assert(Mask && "Missing call preserved mask for calling convention");
2736 Ops.push_back(DAG.getRegisterMask(Mask));
2737
2738 if (InFlag.getNode())
2739 Ops.push_back(InFlag);
2740
2741 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2742
2743  // If we're doing a tail call, use a TC_RETURN here rather than an
2744 // actual call instruction.
2745 if (IsTailCall) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002746 MFI.setHasTailCall();
2747 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002748 }
2749
2750 // Returns a chain and a flag for retval copy to use.
2751 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2752 Chain = Call.getValue(0);
2753 InFlag = Call.getValue(1);
2754
Matt Arsenault6efd0822017-09-14 17:14:57 +00002755 if (CallerSavedFP) {
2756 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2757 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2758 InFlag = Chain.getValue(1);
2759 }
2760
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002761 uint64_t CalleePopBytes = NumBytes;
2762 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002763 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2764 InFlag, DL);
2765 if (!Ins.empty())
2766 InFlag = Chain.getValue(1);
2767
2768 // Handle result values, copying them out of physregs into vregs that we
2769 // return.
2770 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2771 InVals, IsThisReturn,
2772 IsThisReturn ? OutVals[0] : SDValue());
2773}
2774
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002775unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2776 SelectionDAG &DAG) const {
2777 unsigned Reg = StringSwitch<unsigned>(RegName)
2778 .Case("m0", AMDGPU::M0)
2779 .Case("exec", AMDGPU::EXEC)
2780 .Case("exec_lo", AMDGPU::EXEC_LO)
2781 .Case("exec_hi", AMDGPU::EXEC_HI)
2782 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2783 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2784 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2785 .Default(AMDGPU::NoRegister);
2786
2787 if (Reg == AMDGPU::NoRegister) {
2788 report_fatal_error(Twine("invalid register name \""
2789 + StringRef(RegName) + "\"."));
2790
2791 }
2792
Tom Stellard5bfbae52018-07-11 20:59:01 +00002793 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002794 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2795 report_fatal_error(Twine("invalid register \""
2796 + StringRef(RegName) + "\" for subtarget."));
2797 }
2798
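  // Make sure the requested type width matches the width of the register.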
2799 switch (Reg) {
2800 case AMDGPU::M0:
2801 case AMDGPU::EXEC_LO:
2802 case AMDGPU::EXEC_HI:
2803 case AMDGPU::FLAT_SCR_LO:
2804 case AMDGPU::FLAT_SCR_HI:
2805 if (VT.getSizeInBits() == 32)
2806 return Reg;
2807 break;
2808 case AMDGPU::EXEC:
2809 case AMDGPU::FLAT_SCR:
2810 if (VT.getSizeInBits() == 64)
2811 return Reg;
2812 break;
2813 default:
2814 llvm_unreachable("missing register type checking");
2815 }
2816
2817 report_fatal_error(Twine("invalid type for register \""
2818 + StringRef(RegName) + "\"."));
2819}
2820
Matt Arsenault786724a2016-07-12 21:41:32 +00002821// If kill is not the last instruction, split the block so kill is always a
2822// proper terminator.
2823MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2824 MachineBasicBlock *BB) const {
2825 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2826
2827 MachineBasicBlock::iterator SplitPoint(&MI);
2828 ++SplitPoint;
2829
2830 if (SplitPoint == BB->end()) {
2831 // Don't bother with a new block.
Marek Olsakce76ea02017-10-24 10:27:13 +00002832 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002833 return BB;
2834 }
2835
2836 MachineFunction *MF = BB->getParent();
2837 MachineBasicBlock *SplitBB
2838 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2839
Matt Arsenault786724a2016-07-12 21:41:32 +00002840 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2841 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2842
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002843 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
Matt Arsenault786724a2016-07-12 21:41:32 +00002844 BB->addSuccessor(SplitBB);
2845
Marek Olsakce76ea02017-10-24 10:27:13 +00002846 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002847 return SplitBB;
2848}
2849
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002850// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2851// wavefront. If the value is uniform and just happens to be in a VGPR, this
2852// will only do one iteration. In the worst case, this will loop 64 times.
2853//
2854// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002855static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2856 const SIInstrInfo *TII,
2857 MachineRegisterInfo &MRI,
2858 MachineBasicBlock &OrigBB,
2859 MachineBasicBlock &LoopBB,
2860 const DebugLoc &DL,
2861 const MachineOperand &IdxReg,
2862 unsigned InitReg,
2863 unsigned ResultReg,
2864 unsigned PhiReg,
2865 unsigned InitSaveExecReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002866 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002867 bool UseGPRIdxMode,
2868 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002869 MachineBasicBlock::iterator I = LoopBB.begin();
2870
2871 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2872 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2873 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2874 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2875
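  // PHIs tying the accumulated result and the exec-mask value back through the
  // loop.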
2876 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2877 .addReg(InitReg)
2878 .addMBB(&OrigBB)
2879 .addReg(ResultReg)
2880 .addMBB(&LoopBB);
2881
2882 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2883 .addReg(InitSaveExecReg)
2884 .addMBB(&OrigBB)
2885 .addReg(NewExec)
2886 .addMBB(&LoopBB);
2887
2888 // Read the next variant <- also loop target.
2889 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2890 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2891
2892 // Compare the just read M0 value to all possible Idx values.
2893 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2894 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00002895 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002896
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002897  // Update EXEC, saving the original EXEC value to NewExec.
2898 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2899 .addReg(CondReg, RegState::Kill);
2900
2901 MRI.setSimpleHint(NewExec, CondReg);
2902
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002903 if (UseGPRIdxMode) {
2904 unsigned IdxReg;
2905 if (Offset == 0) {
2906 IdxReg = CurrentIdxReg;
2907 } else {
2908 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2909 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2910 .addReg(CurrentIdxReg, RegState::Kill)
2911 .addImm(Offset);
2912 }
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002913 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00002914 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002915 MachineInstr *SetOn =
2916 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2917 .addReg(IdxReg, RegState::Kill)
2918 .addImm(IdxMode);
2919 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002920 } else {
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002921 // Move index from VCC into M0
2922 if (Offset == 0) {
2923 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2924 .addReg(CurrentIdxReg, RegState::Kill);
2925 } else {
2926 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2927 .addReg(CurrentIdxReg, RegState::Kill)
2928 .addImm(Offset);
2929 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002930 }
2931
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002932 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002933 MachineInstr *InsertPt =
Scott Lindere2c58472019-02-05 19:50:32 +00002934 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002935 .addReg(AMDGPU::EXEC)
2936 .addReg(NewExec);
2937
2938 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2939 // s_cbranch_scc0?
2940
2941 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2942 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2943 .addMBB(&LoopBB);
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002944
2945 return InsertPt->getIterator();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002946}
2947
2948 // This has slightly sub-optimal regalloc when the source vector is killed by
2949 // the read. The register allocator does not understand that the kill is
2950 // per-workitem, so the source is kept alive for the whole loop and we end up
2951 // not reusing a subregister from it, using one more VGPR than necessary. This
2952 // was avoided when this was expanded after register allocation.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002953static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2954 MachineBasicBlock &MBB,
2955 MachineInstr &MI,
2956 unsigned InitResultReg,
2957 unsigned PhiReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002958 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002959 bool UseGPRIdxMode,
2960 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002961 MachineFunction *MF = MBB.getParent();
2962 MachineRegisterInfo &MRI = MF->getRegInfo();
2963 const DebugLoc &DL = MI.getDebugLoc();
2964 MachineBasicBlock::iterator I(&MI);
2965
2966 unsigned DstReg = MI.getOperand(0).getReg();
Matt Arsenault301162c2017-11-15 21:51:43 +00002967 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2968 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002969
2970 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2971
2972 // Save the EXEC mask
2973 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2974 .addReg(AMDGPU::EXEC);
2975
2976 // To insert the loop we need to split the block. Move everything after this
2977 // point to a new block, and insert a new empty block between the two.
2978 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2979 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2980 MachineFunction::iterator MBBI(MBB);
2981 ++MBBI;
2982
2983 MF->insert(MBBI, LoopBB);
2984 MF->insert(MBBI, RemainderBB);
2985
2986 LoopBB->addSuccessor(LoopBB);
2987 LoopBB->addSuccessor(RemainderBB);
2988
2989 // Move the rest of the block into a new block.
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002990 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002991 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2992
2993 MBB.addSuccessor(LoopBB);
2994
2995 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2996
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002997 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2998 InitResultReg, DstReg, PhiReg, TmpExec,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002999 Offset, UseGPRIdxMode, IsIndirectSrc);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003000
3001 MachineBasicBlock::iterator First = RemainderBB->begin();
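  // Restore the original EXEC mask once the waterfall loop has finished.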
3002 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3003 .addReg(SaveExec);
3004
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003005 return InsPt;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003006}
3007
3008// Returns subreg index, offset
3009static std::pair<unsigned, int>
3010computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3011 const TargetRegisterClass *SuperRC,
3012 unsigned VecReg,
3013 int Offset) {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003014 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003015
3016 // Skip out of bounds offsets, or else we would end up using an undefined
3017 // register.
3018 if (Offset >= NumElts || Offset < 0)
3019 return std::make_pair(AMDGPU::sub0, Offset);
3020
3021 return std::make_pair(AMDGPU::sub0 + Offset, 0);
3022}
3023
3024// Return true if the index is an SGPR and was set.
3025static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3026 MachineRegisterInfo &MRI,
3027 MachineInstr &MI,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003028 int Offset,
3029 bool UseGPRIdxMode,
3030 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003031 MachineBasicBlock *MBB = MI.getParent();
3032 const DebugLoc &DL = MI.getDebugLoc();
3033 MachineBasicBlock::iterator I(&MI);
3034
3035 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3036 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3037
3038 assert(Idx->getReg() != AMDGPU::NoRegister);
3039
3040 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3041 return false;
3042
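  // In GPR index mode the index is programmed with S_SET_GPR_IDX_ON rather
  // than by writing M0 directly.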
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003043 if (UseGPRIdxMode) {
3044 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003045 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003046 if (Offset == 0) {
3047 MachineInstr *SetOn =
Diana Picus116bbab2017-01-13 09:58:52 +00003048 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3049 .add(*Idx)
3050 .addImm(IdxMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003051
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003052 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003053 } else {
3054 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3055 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
Diana Picus116bbab2017-01-13 09:58:52 +00003056 .add(*Idx)
3057 .addImm(Offset);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003058 MachineInstr *SetOn =
3059 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3060 .addReg(Tmp, RegState::Kill)
3061 .addImm(IdxMode);
3062
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003063 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003064 }
3065
3066 return true;
3067 }
3068
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003069 if (Offset == 0) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003070 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3071 .add(*Idx);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003072 } else {
3073 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003074 .add(*Idx)
3075 .addImm(Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003076 }
3077
3078 return true;
3079}
3080
3081// Control flow needs to be inserted if indexing with a VGPR.
3082static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3083 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003084 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003085 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003086 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3087 MachineFunction *MF = MBB.getParent();
3088 MachineRegisterInfo &MRI = MF->getRegInfo();
3089
3090 unsigned Dst = MI.getOperand(0).getReg();
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003091 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003092 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3093
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003094 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003095
3096 unsigned SubReg;
3097 std::tie(SubReg, Offset)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003098 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003099
Marek Olsake22fdb92017-03-21 17:00:32 +00003100 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003101
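  // If the index is uniform (held in an SGPR), a single indexed move suffices;
  // otherwise fall through and build a waterfall loop below.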
3102 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003103 MachineBasicBlock::iterator I(&MI);
3104 const DebugLoc &DL = MI.getDebugLoc();
3105
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003106 if (UseGPRIdxMode) {
3107 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3108 // to avoid interfering with other uses, so probably requires a new
3109 // optimization pass.
3110 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003111 .addReg(SrcReg, RegState::Undef, SubReg)
3112 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003113 .addReg(AMDGPU::M0, RegState::Implicit);
3114 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3115 } else {
3116 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003117 .addReg(SrcReg, RegState::Undef, SubReg)
3118 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003119 }
3120
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003121 MI.eraseFromParent();
3122
3123 return &MBB;
3124 }
3125
3126 const DebugLoc &DL = MI.getDebugLoc();
3127 MachineBasicBlock::iterator I(&MI);
3128
3129 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3130 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3131
3132 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3133
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003134 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3135 Offset, UseGPRIdxMode, true);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003136 MachineBasicBlock *LoopBB = InsPt->getParent();
3137
3138 if (UseGPRIdxMode) {
3139 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003140 .addReg(SrcReg, RegState::Undef, SubReg)
3141 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003142 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003143 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003144 } else {
3145 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003146 .addReg(SrcReg, RegState::Undef, SubReg)
3147 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003148 }
3149
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003150 MI.eraseFromParent();
3151
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003152 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003153}
3154
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003155static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3156 const TargetRegisterClass *VecRC) {
3157 switch (TRI.getRegSizeInBits(*VecRC)) {
3158 case 32: // 4 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003159 return AMDGPU::V_MOVRELD_B32_V1;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003160 case 64: // 8 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003161 return AMDGPU::V_MOVRELD_B32_V2;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003162 case 128: // 16 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003163 return AMDGPU::V_MOVRELD_B32_V4;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003164 case 256: // 32 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003165 return AMDGPU::V_MOVRELD_B32_V8;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003166 case 512: // 64 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003167 return AMDGPU::V_MOVRELD_B32_V16;
3168 default:
3169 llvm_unreachable("unsupported size for MOVRELD pseudos");
3170 }
3171}
3172
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003173static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3174 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003175 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003176 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003177 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3178 MachineFunction *MF = MBB.getParent();
3179 MachineRegisterInfo &MRI = MF->getRegInfo();
3180
3181 unsigned Dst = MI.getOperand(0).getReg();
3182 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3183 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3184 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3185 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3186 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3187
3188 // This can be an immediate, but will be folded later.
3189 assert(Val->getReg());
3190
3191 unsigned SubReg;
3192 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3193 SrcVec->getReg(),
3194 Offset);
Marek Olsake22fdb92017-03-21 17:00:32 +00003195 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003196
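  // With no index register the subregister was selected statically, so a
  // simple INSERT_SUBREG is enough.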
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003197 if (Idx->getReg() == AMDGPU::NoRegister) {
3198 MachineBasicBlock::iterator I(&MI);
3199 const DebugLoc &DL = MI.getDebugLoc();
3200
3201 assert(Offset == 0);
3202
3203 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
Diana Picus116bbab2017-01-13 09:58:52 +00003204 .add(*SrcVec)
3205 .add(*Val)
3206 .addImm(SubReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003207
3208 MI.eraseFromParent();
3209 return &MBB;
3210 }
3211
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003212 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003213 MachineBasicBlock::iterator I(&MI);
3214 const DebugLoc &DL = MI.getDebugLoc();
3215
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003216 if (UseGPRIdxMode) {
3217 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003218 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3219 .add(*Val)
3220 .addReg(Dst, RegState::ImplicitDefine)
3221 .addReg(SrcVec->getReg(), RegState::Implicit)
3222 .addReg(AMDGPU::M0, RegState::Implicit);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003223
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003224 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3225 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003226 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003227
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003228 BuildMI(MBB, I, DL, MovRelDesc)
3229 .addReg(Dst, RegState::Define)
3230 .addReg(SrcVec->getReg())
Diana Picus116bbab2017-01-13 09:58:52 +00003231 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003232 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003233 }
3234
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003235 MI.eraseFromParent();
3236 return &MBB;
3237 }
3238
3239 if (Val->isReg())
3240 MRI.clearKillFlags(Val->getReg());
3241
3242 const DebugLoc &DL = MI.getDebugLoc();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003243
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003244 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3245
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003246 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003247 Offset, UseGPRIdxMode, false);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003248 MachineBasicBlock *LoopBB = InsPt->getParent();
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003249
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003250 if (UseGPRIdxMode) {
3251 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003252 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3253 .add(*Val) // src0
3254 .addReg(Dst, RegState::ImplicitDefine)
3255 .addReg(PhiReg, RegState::Implicit)
3256 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003257 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003258 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003259 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003260
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003261 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3262 .addReg(Dst, RegState::Define)
3263 .addReg(PhiReg)
Diana Picus116bbab2017-01-13 09:58:52 +00003264 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003265 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003266 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003267
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003268 MI.eraseFromParent();
3269
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003270 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003271}
3272
Matt Arsenault786724a2016-07-12 21:41:32 +00003273MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3274 MachineInstr &MI, MachineBasicBlock *BB) const {
Tom Stellard244891d2016-12-20 15:52:17 +00003275
3276 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3277 MachineFunction *MF = BB->getParent();
3278 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3279
3280 if (TII->isMIMG(MI)) {
Matt Arsenault905f3512017-12-29 17:18:14 +00003281 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3282 report_fatal_error("missing mem operand from MIMG instruction");
3283 }
Tom Stellard244891d2016-12-20 15:52:17 +00003284 // Add a memoperand for mimg instructions so that they aren't assumed to
3285    // be ordered memory instructions.
3286
Tom Stellard244891d2016-12-20 15:52:17 +00003287 return BB;
3288 }
3289
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003290 switch (MI.getOpcode()) {
Matt Arsenault301162c2017-11-15 21:51:43 +00003291 case AMDGPU::S_ADD_U64_PSEUDO:
3292 case AMDGPU::S_SUB_U64_PSEUDO: {
3293 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3294 const DebugLoc &DL = MI.getDebugLoc();
3295
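    // Expand the 64-bit scalar add/sub into 32-bit halves: a low add/sub that
    // sets carry and a high add/sub that consumes it, then recombine the
    // halves with a REG_SEQUENCE.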
3296 MachineOperand &Dest = MI.getOperand(0);
3297 MachineOperand &Src0 = MI.getOperand(1);
3298 MachineOperand &Src1 = MI.getOperand(2);
3299
3300 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3301 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3302
3303 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3304 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3305 &AMDGPU::SReg_32_XM0RegClass);
3306 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3307 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3308 &AMDGPU::SReg_32_XM0RegClass);
3309
3310 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3311 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3312 &AMDGPU::SReg_32_XM0RegClass);
3313 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3314 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3315 &AMDGPU::SReg_32_XM0RegClass);
3316
3317 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3318
3319 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3320 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3321 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3322 .add(Src0Sub0)
3323 .add(Src1Sub0);
3324 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3325 .add(Src0Sub1)
3326 .add(Src1Sub1);
3327 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3328 .addReg(DestSub0)
3329 .addImm(AMDGPU::sub0)
3330 .addReg(DestSub1)
3331 .addImm(AMDGPU::sub1);
3332 MI.eraseFromParent();
3333 return BB;
3334 }
3335 case AMDGPU::SI_INIT_M0: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003336 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00003337 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Diana Picus116bbab2017-01-13 09:58:52 +00003338 .add(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003339 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00003340 return BB;
Matt Arsenault301162c2017-11-15 21:51:43 +00003341 }
Marek Olsak2d825902017-04-28 20:21:58 +00003342 case AMDGPU::SI_INIT_EXEC:
3343 // This should be before all vector instructions.
3344 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3345 AMDGPU::EXEC)
3346 .addImm(MI.getOperand(0).getImm());
3347 MI.eraseFromParent();
3348 return BB;
3349
3350 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3351 // Extract the thread count from an SGPR input and set EXEC accordingly.
3352 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3353 //
3354 // S_BFE_U32 count, input, {shift, 7}
3355 // S_BFM_B64 exec, count, 0
3356 // S_CMP_EQ_U32 count, 64
3357 // S_CMOV_B64 exec, -1
3358 MachineInstr *FirstMI = &*BB->begin();
3359 MachineRegisterInfo &MRI = MF->getRegInfo();
3360 unsigned InputReg = MI.getOperand(0).getReg();
3361 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3362 bool Found = false;
3363
3364 // Move the COPY of the input reg to the beginning, so that we can use it.
3365 for (auto I = BB->begin(); I != &MI; I++) {
3366 if (I->getOpcode() != TargetOpcode::COPY ||
3367 I->getOperand(0).getReg() != InputReg)
3368 continue;
3369
3370 if (I == FirstMI) {
3371 FirstMI = &*++BB->begin();
3372 } else {
3373 I->removeFromParent();
3374 BB->insert(FirstMI, &*I);
3375 }
3376 Found = true;
3377 break;
3378 }
3379 assert(Found);
Davide Italiano0dcc0152017-05-11 19:58:52 +00003380 (void)Found;
Marek Olsak2d825902017-04-28 20:21:58 +00003381
3382 // This should be before all vector instructions.
3383 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3384 .addReg(InputReg)
3385 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3386 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3387 AMDGPU::EXEC)
3388 .addReg(CountReg)
3389 .addImm(0);
3390 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3391 .addReg(CountReg, RegState::Kill)
3392 .addImm(64);
3393 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3394 AMDGPU::EXEC)
3395 .addImm(-1);
3396 MI.eraseFromParent();
3397 return BB;
3398 }
3399
Changpeng Fang01f60622016-03-15 17:28:44 +00003400 case AMDGPU::GET_GROUPSTATICSIZE: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003401 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00003402 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
Diana Picus116bbab2017-01-13 09:58:52 +00003403 .add(MI.getOperand(0))
3404 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003405 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00003406 return BB;
3407 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003408 case AMDGPU::SI_INDIRECT_SRC_V1:
3409 case AMDGPU::SI_INDIRECT_SRC_V2:
3410 case AMDGPU::SI_INDIRECT_SRC_V4:
3411 case AMDGPU::SI_INDIRECT_SRC_V8:
3412 case AMDGPU::SI_INDIRECT_SRC_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003413 return emitIndirectSrc(MI, *BB, *getSubtarget());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003414 case AMDGPU::SI_INDIRECT_DST_V1:
3415 case AMDGPU::SI_INDIRECT_DST_V2:
3416 case AMDGPU::SI_INDIRECT_DST_V4:
3417 case AMDGPU::SI_INDIRECT_DST_V8:
3418 case AMDGPU::SI_INDIRECT_DST_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003419 return emitIndirectDst(MI, *BB, *getSubtarget());
Marek Olsakce76ea02017-10-24 10:27:13 +00003420 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3421 case AMDGPU::SI_KILL_I1_PSEUDO:
Matt Arsenault786724a2016-07-12 21:41:32 +00003422 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00003423 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3424 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Matt Arsenault22e41792016-08-27 01:00:37 +00003425
3426 unsigned Dst = MI.getOperand(0).getReg();
3427 unsigned Src0 = MI.getOperand(1).getReg();
3428 unsigned Src1 = MI.getOperand(2).getReg();
3429 const DebugLoc &DL = MI.getDebugLoc();
3430 unsigned SrcCond = MI.getOperand(3).getReg();
3431
3432 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3433 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003434 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenault22e41792016-08-27 01:00:37 +00003435
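    // Select each 32-bit half with V_CNDMASK_B32 and rejoin the halves with a
    // REG_SEQUENCE.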
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003436 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3437 .addReg(SrcCond);
Matt Arsenault22e41792016-08-27 01:00:37 +00003438 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3439 .addReg(Src0, 0, AMDGPU::sub0)
3440 .addReg(Src1, 0, AMDGPU::sub0)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003441 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003442 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3443 .addReg(Src0, 0, AMDGPU::sub1)
3444 .addReg(Src1, 0, AMDGPU::sub1)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003445 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003446
3447 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3448 .addReg(DstLo)
3449 .addImm(AMDGPU::sub0)
3450 .addReg(DstHi)
3451 .addImm(AMDGPU::sub1);
3452 MI.eraseFromParent();
3453 return BB;
3454 }
Matt Arsenault327188a2016-12-15 21:57:11 +00003455 case AMDGPU::SI_BR_UNDEF: {
3456 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3457 const DebugLoc &DL = MI.getDebugLoc();
3458 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
Diana Picus116bbab2017-01-13 09:58:52 +00003459 .add(MI.getOperand(0));
Matt Arsenault327188a2016-12-15 21:57:11 +00003460 Br->getOperand(1).setIsUndef(true); // read undef SCC
3461 MI.eraseFromParent();
3462 return BB;
3463 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003464 case AMDGPU::ADJCALLSTACKUP:
3465 case AMDGPU::ADJCALLSTACKDOWN: {
3466 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3467 MachineInstrBuilder MIB(*MF, &MI);
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003468
3469 // Add an implicit use of the frame offset reg to prevent the restore copy
3470 // inserted after the call from being reordered after stack operations in
3471 // the caller's frame.
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003472 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003473 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3474 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003475 return BB;
3476 }
Scott Linderd19d1972019-02-04 20:00:07 +00003477 case AMDGPU::SI_CALL_ISEL: {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003478 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3479 const DebugLoc &DL = MI.getDebugLoc();
Scott Linderd19d1972019-02-04 20:00:07 +00003480
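    // Expand the pseudo into a real SI_CALL defining the return address
    // register, forwarding all operands and memory references from the pseudo.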
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003481 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003482
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003483 MachineInstrBuilder MIB;
Scott Linderd19d1972019-02-04 20:00:07 +00003484 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003485
Scott Linderd19d1972019-02-04 20:00:07 +00003486 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003487 MIB.add(MI.getOperand(I));
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003488
Chandler Carruthc73c0302018-08-16 21:30:05 +00003489 MIB.cloneMemRefs(MI);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003490 MI.eraseFromParent();
3491 return BB;
3492 }
Changpeng Fang01f60622016-03-15 17:28:44 +00003493 default:
3494 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00003495 }
Tom Stellard75aadc22012-12-11 21:25:42 +00003496}
3497
Matt Arsenaulte11d8ac2017-10-13 21:10:22 +00003498bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3499 return isTypeLegal(VT.getScalarType());
3500}
3501
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003502bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3503 // This currently forces unfolding various combinations of fsub into fma with
3504 // free fneg'd operands. As long as we have fast FMA (controlled by
3505 // isFMAFasterThanFMulAndFAdd), we should perform these.
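  // For example, (fsub (fmul a, b), c) can then be formed as (fma a, b, (fneg c))
  // because the fneg of the operand is free.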
3506
3507 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3508 // most of these combines appear to be cycle neutral but save on instruction
3509 // count / code size.
3510 return true;
3511}
3512
Mehdi Amini44ede332015-07-09 02:09:04 +00003513EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3514 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00003515 if (!VT.isVector()) {
3516 return MVT::i1;
3517 }
Matt Arsenault8596f712014-11-28 22:51:38 +00003518 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00003519}
3520
Matt Arsenault94163282016-12-22 16:36:25 +00003521MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3522 // TODO: Should i16 be used always if legal? For now it would force VALU
3523 // shifts.
3524 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
Christian Konig082a14a2013-03-18 11:34:05 +00003525}
3526
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003527// Answering this is somewhat tricky and depends on the specific device, which
3528// may have different rates for fma or all f64 operations.
3529//
3530// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3531// regardless of which device (although the number of cycles differs between
3532// devices), so it is always profitable for f64.
3533//
3534// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3535// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3536// which we can always do even without fused FP ops since it returns the same
3537// result as the separate operations and since it is always full
3538// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3539// however does not support denormals, so we do report fma as faster if we have
3540// a fast fma device and require denormals.
3541//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003542bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3543 VT = VT.getScalarType();
3544
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003545 switch (VT.getSimpleVT().SimpleTy) {
Matt Arsenault0084adc2018-04-30 19:08:16 +00003546 case MVT::f32: {
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003547 // FMA is as fast as mad on some subtargets. However, we always have full rate
3548 // f32 mad available, which returns the same result as the separate operations
Matt Arsenault8d630032015-02-20 22:10:41 +00003549 // and which we should prefer over fma. We can't use mad if we want to support
3550 // denormals, so only report fma as faster in that case.
Matt Arsenault0084adc2018-04-30 19:08:16 +00003551 if (Subtarget->hasFP32Denormals())
3552 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3553
3554 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3555 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3556 }
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003557 case MVT::f64:
3558 return true;
Matt Arsenault9e22bc22016-12-22 03:21:48 +00003559 case MVT::f16:
3560 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003561 default:
3562 break;
3563 }
3564
3565 return false;
3566}
3567
Tom Stellard75aadc22012-12-11 21:25:42 +00003568//===----------------------------------------------------------------------===//
3569// Custom DAG Lowering Operations
3570//===----------------------------------------------------------------------===//
3571
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003572// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3573// wider vector type is legal.
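// e.g. a v4f16 operation is split into two v2f16 operations whose results are
// concatenated back into a v4f16.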
3574SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3575 SelectionDAG &DAG) const {
3576 unsigned Opc = Op.getOpcode();
3577 EVT VT = Op.getValueType();
3578 assert(VT == MVT::v4f16);
3579
3580 SDValue Lo, Hi;
3581 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3582
3583 SDLoc SL(Op);
3584 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3585 Op->getFlags());
3586 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3587 Op->getFlags());
3588
3589 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3590}
3591
3592// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3593// wider vector type is legal.
3594SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3595 SelectionDAG &DAG) const {
3596 unsigned Opc = Op.getOpcode();
3597 EVT VT = Op.getValueType();
3598 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3599
3600 SDValue Lo0, Hi0;
3601 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3602 SDValue Lo1, Hi1;
3603 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3604
3605 SDLoc SL(Op);
3606
3607 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3608 Op->getFlags());
3609 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3610 Op->getFlags());
3611
3612 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3613}
3614
Tom Stellard75aadc22012-12-11 21:25:42 +00003615SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3616 switch (Op.getOpcode()) {
3617 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00003618 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00003619 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00003620 SDValue Result = LowerLOAD(Op, DAG);
3621 assert((!Result.getNode() ||
3622 Result.getNode()->getNumValues() == 2) &&
3623 "Load should return a value and a chain");
3624 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00003625 }
Tom Stellardaf775432013-10-23 00:44:32 +00003626
Matt Arsenaultad14ce82014-07-19 18:44:39 +00003627 case ISD::FSIN:
3628 case ISD::FCOS:
3629 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00003630 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00003631 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00003632 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00003633 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003634 case ISD::GlobalAddress: {
3635 MachineFunction &MF = DAG.getMachineFunction();
3636 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3637 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00003638 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003639 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003640 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003641 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00003642 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault3aef8092017-01-23 23:09:58 +00003643 case ISD::INSERT_VECTOR_ELT:
3644 return lowerINSERT_VECTOR_ELT(Op, DAG);
3645 case ISD::EXTRACT_VECTOR_ELT:
3646 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
Matt Arsenault67a98152018-05-16 11:47:30 +00003647 case ISD::BUILD_VECTOR:
3648 return lowerBUILD_VECTOR(Op, DAG);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00003649 case ISD::FP_ROUND:
3650 return lowerFP_ROUND(Op, DAG);
Matt Arsenault3e025382017-04-24 17:49:13 +00003651 case ISD::TRAP:
Matt Arsenault3e025382017-04-24 17:49:13 +00003652 return lowerTRAP(Op, DAG);
Tony Tye43259df2018-05-16 16:19:34 +00003653 case ISD::DEBUGTRAP:
3654 return lowerDEBUGTRAP(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003655 case ISD::FABS:
3656 case ISD::FNEG:
Matt Arsenault36cdcfa2018-08-02 13:43:42 +00003657 case ISD::FCANONICALIZE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003658 return splitUnaryVectorOp(Op, DAG);
Matt Arsenault687ec752018-10-22 16:27:27 +00003659 case ISD::FMINNUM:
3660 case ISD::FMAXNUM:
3661 return lowerFMINNUM_FMAXNUM(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003662 case ISD::SHL:
3663 case ISD::SRA:
3664 case ISD::SRL:
3665 case ISD::ADD:
3666 case ISD::SUB:
3667 case ISD::MUL:
3668 case ISD::SMIN:
3669 case ISD::SMAX:
3670 case ISD::UMIN:
3671 case ISD::UMAX:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003672 case ISD::FADD:
3673 case ISD::FMUL:
Matt Arsenault687ec752018-10-22 16:27:27 +00003674 case ISD::FMINNUM_IEEE:
3675 case ISD::FMAXNUM_IEEE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003676 return splitBinaryVectorOp(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00003677 }
3678 return SDValue();
3679}
3680
Matt Arsenault1349a042018-05-22 06:32:10 +00003681static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3682 const SDLoc &DL,
3683 SelectionDAG &DAG, bool Unpacked) {
3684 if (!LoadVT.isVector())
3685 return Result;
3686
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003687 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3688 // Truncate to v2i16/v4i16.
3689 EVT IntLoadVT = LoadVT.changeTypeToInteger();
Matt Arsenault1349a042018-05-22 06:32:10 +00003690
3691 // Work around the legalizer not scalarizing the truncate after vector op
3692 // legalization by not creating an intermediate vector trunc.
3693 SmallVector<SDValue, 4> Elts;
3694 DAG.ExtractVectorElements(Result, Elts);
3695 for (SDValue &Elt : Elts)
3696 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3697
3698 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3699
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003700 // Bitcast to original type (v2f16/v4f16).
Matt Arsenault1349a042018-05-22 06:32:10 +00003701 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003702 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003703
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003704 // Cast back to the original packed type.
3705 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3706}
3707
Matt Arsenault1349a042018-05-22 06:32:10 +00003708SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3709 MemSDNode *M,
3710 SelectionDAG &DAG,
Tim Renouf366a49d2018-08-02 23:33:01 +00003711 ArrayRef<SDValue> Ops,
Matt Arsenault1349a042018-05-22 06:32:10 +00003712 bool IsIntrinsic) const {
3713 SDLoc DL(M);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003714
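  // On subtargets with unpacked D16 memory instructions each 16-bit element is
  // returned in its own 32-bit dword, so a v2f16/v4f16 result is loaded as
  // v2i32/v4i32 and converted back below.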
3715 bool Unpacked = Subtarget->hasUnpackedD16VMem();
Matt Arsenault1349a042018-05-22 06:32:10 +00003716 EVT LoadVT = M->getValueType(0);
3717
Matt Arsenault1349a042018-05-22 06:32:10 +00003718 EVT EquivLoadVT = LoadVT;
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003719 if (Unpacked && LoadVT.isVector()) {
3720 EquivLoadVT = LoadVT.isVector() ?
3721 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3722 LoadVT.getVectorNumElements()) : LoadVT;
Matt Arsenault1349a042018-05-22 06:32:10 +00003723 }
3724
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003725 // Change from v4f16/v2f16 to EquivLoadVT.
3726 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3727
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003728 SDValue Load
3729 = DAG.getMemIntrinsicNode(
3730 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3731 VTList, Ops, M->getMemoryVT(),
3732 M->getMemOperand());
3733 if (!Unpacked) // Just adjusted the opcode.
3734 return Load;
Changpeng Fang4737e892018-01-18 22:08:53 +00003735
Matt Arsenault1349a042018-05-22 06:32:10 +00003736 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
Changpeng Fang4737e892018-01-18 22:08:53 +00003737
Matt Arsenault1349a042018-05-22 06:32:10 +00003738 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003739}
3740
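// Lower an integer compare intrinsic (llvm.amdgcn.icmp) to AMDGPUISD::SETCC,
// taking the predicate from the immediate condition-code operand and promoting
// illegal i16 operands to i32.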
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003741static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3742 SDNode *N, SelectionDAG &DAG) {
3743 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003744 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003745 int CondCode = CD->getSExtValue();
3746 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3747 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3748 return DAG.getUNDEF(VT);
3749
3750 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3751
3752
3753 SDValue LHS = N->getOperand(1);
3754 SDValue RHS = N->getOperand(2);
3755
3756 SDLoc DL(N);
3757
3758 EVT CmpVT = LHS.getValueType();
3759 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3760 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3761 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3762 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3763 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3764 }
3765
3766 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3767
3768 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3769 DAG.getCondCode(CCOpcode));
3770}
3771
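// Floating-point analogue of the above: lower the compare to AMDGPUISD::SETCC,
// extending illegal f16 operands to f32.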
3772static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3773 SDNode *N, SelectionDAG &DAG) {
3774 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003775 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003776
3777 int CondCode = CD->getSExtValue();
3778 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3779 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3780 return DAG.getUNDEF(VT);
3781 }
3782
3783 SDValue Src0 = N->getOperand(1);
3784 SDValue Src1 = N->getOperand(2);
3785 EVT CmpVT = Src0.getValueType();
3786 SDLoc SL(N);
3787
3788 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3789 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3790 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3791 }
3792
3793 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3794 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3795 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3796 Src1, DAG.getCondCode(CCOpcode));
3797}
3798
Matt Arsenault3aef8092017-01-23 23:09:58 +00003799void SITargetLowering::ReplaceNodeResults(SDNode *N,
3800 SmallVectorImpl<SDValue> &Results,
3801 SelectionDAG &DAG) const {
3802 switch (N->getOpcode()) {
3803 case ISD::INSERT_VECTOR_ELT: {
3804 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3805 Results.push_back(Res);
3806 return;
3807 }
3808 case ISD::EXTRACT_VECTOR_ELT: {
3809 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3810 Results.push_back(Res);
3811 return;
3812 }
Matt Arsenault1f17c662017-02-22 00:27:34 +00003813 case ISD::INTRINSIC_WO_CHAIN: {
3814 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Marek Olsak13e47412018-01-31 20:18:04 +00003815 switch (IID) {
3816 case Intrinsic::amdgcn_cvt_pkrtz: {
Matt Arsenault1f17c662017-02-22 00:27:34 +00003817 SDValue Src0 = N->getOperand(1);
3818 SDValue Src1 = N->getOperand(2);
3819 SDLoc SL(N);
3820 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3821 Src0, Src1);
Matt Arsenault1f17c662017-02-22 00:27:34 +00003822 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3823 return;
3824 }
Marek Olsak13e47412018-01-31 20:18:04 +00003825 case Intrinsic::amdgcn_cvt_pknorm_i16:
3826 case Intrinsic::amdgcn_cvt_pknorm_u16:
3827 case Intrinsic::amdgcn_cvt_pk_i16:
3828 case Intrinsic::amdgcn_cvt_pk_u16: {
3829 SDValue Src0 = N->getOperand(1);
3830 SDValue Src1 = N->getOperand(2);
3831 SDLoc SL(N);
3832 unsigned Opcode;
3833
3834 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3835 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3836 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3837 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3838 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3839 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3840 else
3841 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3842
Matt Arsenault709374d2018-08-01 20:13:58 +00003843 EVT VT = N->getValueType(0);
3844 if (isTypeLegal(VT))
3845 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3846 else {
3847 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3848 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3849 }
Marek Olsak13e47412018-01-31 20:18:04 +00003850 return;
3851 }
3852 }
Simon Pilgrimd362d272017-07-08 19:50:03 +00003853 break;
Matt Arsenault1f17c662017-02-22 00:27:34 +00003854 }
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003855 case ISD::INTRINSIC_W_CHAIN: {
Matt Arsenault1349a042018-05-22 06:32:10 +00003856 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003857 Results.push_back(Res);
Matt Arsenault1349a042018-05-22 06:32:10 +00003858 Results.push_back(Res.getValue(1));
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003859 return;
3860 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003861
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003862 break;
3863 }
Matt Arsenault4a486232017-04-19 20:53:07 +00003864 case ISD::SELECT: {
3865 SDLoc SL(N);
3866 EVT VT = N->getValueType(0);
3867 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3868 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3869 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3870
3871 EVT SelectVT = NewVT;
3872 if (NewVT.bitsLT(MVT::i32)) {
3873 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3874 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3875 SelectVT = MVT::i32;
3876 }
3877
3878 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3879 N->getOperand(0), LHS, RHS);
3880
3881 if (NewVT != SelectVT)
3882 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3883 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3884 return;
3885 }
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003886 case ISD::FNEG: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003887 if (N->getValueType(0) != MVT::v2f16)
3888 break;
3889
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003890 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003891 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3892
3893 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3894 BC,
3895 DAG.getConstant(0x80008000, SL, MVT::i32));
3896 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3897 return;
3898 }
3899 case ISD::FABS: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003900 if (N->getValueType(0) != MVT::v2f16)
3901 break;
3902
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003903 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003904 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3905
3906 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3907 BC,
3908 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3909 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3910 return;
3911 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00003912 default:
3913 break;
3914 }
3915}
3916
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00003917/// Helper function for LowerBRCOND
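/// Returns the first user of \p Value whose opcode is \p Opcode, or nullptr if
/// there is none.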
Tom Stellardf8794352012-12-19 22:10:31 +00003918static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00003919
Tom Stellardf8794352012-12-19 22:10:31 +00003920 SDNode *Parent = Value.getNode();
3921 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3922 I != E; ++I) {
3923
3924 if (I.getUse().get() != Value)
3925 continue;
3926
3927 if (I->getOpcode() == Opcode)
3928 return *I;
3929 }
Craig Topper062a2ba2014-04-25 05:30:21 +00003930 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00003931}
3932
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003933unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault6408c912016-09-16 22:11:18 +00003934 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3935 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003936 case Intrinsic::amdgcn_if:
3937 return AMDGPUISD::IF;
3938 case Intrinsic::amdgcn_else:
3939 return AMDGPUISD::ELSE;
3940 case Intrinsic::amdgcn_loop:
3941 return AMDGPUISD::LOOP;
3942 case Intrinsic::amdgcn_end_cf:
3943 llvm_unreachable("should not occur");
Matt Arsenault6408c912016-09-16 22:11:18 +00003944 default:
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003945 return 0;
Matt Arsenault6408c912016-09-16 22:11:18 +00003946 }
Tom Stellardbc4497b2016-02-12 23:45:29 +00003947 }
Matt Arsenault6408c912016-09-16 22:11:18 +00003948
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003949 // break, if_break, else_break are all only used as inputs to loop, not
3950 // directly as branch conditions.
3951 return 0;
Tom Stellardbc4497b2016-02-12 23:45:29 +00003952}
3953
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003954bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3955 const Triple &TT = getTargetMachine().getTargetTriple();
Matt Arsenault0da63502018-08-31 05:49:54 +00003956 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3957 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003958 AMDGPU::shouldEmitConstantsToTextSection(TT);
3959}
3960
3961bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
Scott Linderd19d1972019-02-04 20:00:07 +00003962 // FIXME: Either avoid relying on address space here or change the default
3963 // address space for functions to avoid the explicit check.
3964 return (GV->getValueType()->isFunctionTy() ||
3965 GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
Matt Arsenault0da63502018-08-31 05:49:54 +00003966 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3967 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003968 !shouldEmitFixup(GV) &&
3969 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3970}
3971
3972bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3973 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3974}
3975
Tom Stellardf8794352012-12-19 22:10:31 +00003976/// This transforms the control flow intrinsics to get the branch destination as
3977/// the last parameter, and also switches the branch target with BR if the need arises.
3978SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3979 SelectionDAG &DAG) const {
Andrew Trickef9de2a2013-05-25 02:42:55 +00003980 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00003981
3982 SDNode *Intr = BRCOND.getOperand(1).getNode();
3983 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00003984 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00003985 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00003986
3987 if (Intr->getOpcode() == ISD::SETCC) {
3988 // As long as we negate the condition everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00003989 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00003990 Intr = SetCC->getOperand(0).getNode();
3991
3992 } else {
3993 // Get the target from BR if we don't negate the condition
3994 BR = findUser(BRCOND, ISD::BR);
3995 Target = BR->getOperand(1);
3996 }
3997
Matt Arsenault6408c912016-09-16 22:11:18 +00003998 // FIXME: This changes the types of the intrinsics instead of introducing new
3999 // nodes with the correct types.
4000 // e.g. llvm.amdgcn.loop
4001
4002 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4003 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4004
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004005 unsigned CFNode = isCFIntrinsic(Intr);
4006 if (CFNode == 0) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00004007 // This is a uniform branch so we don't need to legalize.
4008 return BRCOND;
4009 }
4010
Matt Arsenault6408c912016-09-16 22:11:18 +00004011 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4012 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4013
Tom Stellardbc4497b2016-02-12 23:45:29 +00004014 assert(!SetCC ||
4015 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00004016 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4017 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00004018
Tom Stellardf8794352012-12-19 22:10:31 +00004019 // operands of the new intrinsic call
4020 SmallVector<SDValue, 4> Ops;
Matt Arsenault6408c912016-09-16 22:11:18 +00004021 if (HaveChain)
4022 Ops.push_back(BRCOND.getOperand(0));
4023
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004024 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00004025 Ops.push_back(Target);
4026
Matt Arsenault6408c912016-09-16 22:11:18 +00004027 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4028
Tom Stellardf8794352012-12-19 22:10:31 +00004029 // build the new intrinsic call
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004030 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004031
Matt Arsenault6408c912016-09-16 22:11:18 +00004032 if (!HaveChain) {
4033 SDValue Ops[] = {
4034 SDValue(Result, 0),
4035 BRCOND.getOperand(0)
4036 };
4037
4038 Result = DAG.getMergeValues(Ops, DL).getNode();
4039 }
4040
Tom Stellardf8794352012-12-19 22:10:31 +00004041 if (BR) {
4042 // Give the branch instruction our target
4043 SDValue Ops[] = {
4044 BR->getOperand(0),
4045 BRCOND.getOperand(2)
4046 };
Chandler Carruth356665a2014-08-01 22:09:43 +00004047 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4048 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4049 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004050 }
4051
4052 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4053
4054 // Copy the intrinsic results to registers
4055 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4056 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4057 if (!CopyToReg)
4058 continue;
4059
4060 Chain = DAG.getCopyToReg(
4061 Chain, DL,
4062 CopyToReg->getOperand(1),
4063 SDValue(Result, i - 1),
4064 SDValue());
4065
4066 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4067 }
4068
4069 // Remove the old intrinsic from the chain
4070 DAG.ReplaceAllUsesOfValueWith(
4071 SDValue(Intr, Intr->getNumValues() - 1),
4072 Intr->getOperand(0));
4073
4074 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00004075}
4076
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00004077SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4078 SDValue Op,
4079 const SDLoc &DL,
4080 EVT VT) const {
4081 return Op.getValueType().bitsLE(VT) ?
4082 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4083 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4084}
4085
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004086SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004087 assert(Op.getValueType() == MVT::f16 &&
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004088 "Do not know how to custom lower FP_ROUND for non-f16 type");
4089
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004090 SDValue Src = Op.getOperand(0);
4091 EVT SrcVT = Src.getValueType();
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004092 if (SrcVT != MVT::f64)
4093 return Op;
4094
4095 SDLoc DL(Op);
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004096
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004097 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4098 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
Mandeep Singh Grang5e1697e2017-06-06 05:08:36 +00004099 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004100}
4101
Matt Arsenault687ec752018-10-22 16:27:27 +00004102SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4103 SelectionDAG &DAG) const {
4104 EVT VT = Op.getValueType();
4105 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4106
4107 // FIXME: Assert during selection that this is only selected for
4108 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4109 // mode functions, but this happens to be OK since it's only done in cases
4110 // where there is known to be no sNaN.
4111 if (IsIEEEMode)
4112 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4113
4114 if (VT == MVT::v4f16)
4115 return splitBinaryVectorOp(Op, DAG);
4116 return Op;
4117}
4118
Matt Arsenault3e025382017-04-24 17:49:13 +00004119SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4120 SDLoc SL(Op);
Matt Arsenault3e025382017-04-24 17:49:13 +00004121 SDValue Chain = Op.getOperand(0);
4122
Tom Stellard5bfbae52018-07-11 20:59:01 +00004123 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004124 !Subtarget->isTrapHandlerEnabled())
Matt Arsenault3e025382017-04-24 17:49:13 +00004125 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
Tony Tye43259df2018-05-16 16:19:34 +00004126
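  // For the HSA trap handler the queue pointer is passed in SGPR0/SGPR1 and the
  // trap ID is given as an immediate operand of the TRAP node.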
4127 MachineFunction &MF = DAG.getMachineFunction();
4128 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4129 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4130 assert(UserSGPR != AMDGPU::NoRegister);
4131 SDValue QueuePtr = CreateLiveInRegister(
4132 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4133 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4134 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4135 QueuePtr, SDValue());
4136 SDValue Ops[] = {
4137 ToReg,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004138 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
Tony Tye43259df2018-05-16 16:19:34 +00004139 SGPR01,
4140 ToReg.getValue(1)
4141 };
4142 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4143}
4144
4145SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4146 SDLoc SL(Op);
4147 SDValue Chain = Op.getOperand(0);
4148 MachineFunction &MF = DAG.getMachineFunction();
4149
Tom Stellard5bfbae52018-07-11 20:59:01 +00004150 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004151 !Subtarget->isTrapHandlerEnabled()) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004152 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
Matt Arsenault3e025382017-04-24 17:49:13 +00004153 "debugtrap handler not supported",
4154 Op.getDebugLoc(),
4155 DS_Warning);
Matthias Braunf1caa282017-12-15 22:22:58 +00004156 LLVMContext &Ctx = MF.getFunction().getContext();
Matt Arsenault3e025382017-04-24 17:49:13 +00004157 Ctx.diagnose(NoTrap);
4158 return Chain;
4159 }
Matt Arsenault3e025382017-04-24 17:49:13 +00004160
Tony Tye43259df2018-05-16 16:19:34 +00004161 SDValue Ops[] = {
4162 Chain,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004163 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
Tony Tye43259df2018-05-16 16:19:34 +00004164 };
4165 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
Matt Arsenault3e025382017-04-24 17:49:13 +00004166}
4167
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004168SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
Matt Arsenault99c14522016-04-25 19:27:24 +00004169 SelectionDAG &DAG) const {
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004170 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4171 if (Subtarget->hasApertureRegs()) {
Matt Arsenault0da63502018-08-31 05:49:54 +00004172 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004173 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4174 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
Matt Arsenault0da63502018-08-31 05:49:54 +00004175 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004176 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4177 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4178 unsigned Encoding =
4179 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4180 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4181 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Matt Arsenaulte823d922017-02-18 18:29:53 +00004182
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004183 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4184 SDValue ApertureReg = SDValue(
4185 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4186 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4187 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
Matt Arsenaulte823d922017-02-18 18:29:53 +00004188 }
4189
Matt Arsenault99c14522016-04-25 19:27:24 +00004190 MachineFunction &MF = DAG.getMachineFunction();
4191 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004192 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4193 assert(UserSGPR != AMDGPU::NoRegister);
4194
Matt Arsenault99c14522016-04-25 19:27:24 +00004195 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004196 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00004197
4198 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4199 // private_segment_aperture_base_hi.
Matt Arsenault0da63502018-08-31 05:49:54 +00004200 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
Matt Arsenault99c14522016-04-25 19:27:24 +00004201
Matt Arsenaultb655fa92017-11-29 01:25:12 +00004202 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
Matt Arsenault99c14522016-04-25 19:27:24 +00004203
4204 // TODO: Use custom target PseudoSourceValue.
4205 // TODO: We should use the value from the IR intrinsic call, but it might not
4206 // be available and how do we get it?
4207 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
Matt Arsenault0da63502018-08-31 05:49:54 +00004208 AMDGPUAS::CONSTANT_ADDRESS));
Matt Arsenault99c14522016-04-25 19:27:24 +00004209
4210 MachinePointerInfo PtrInfo(V, StructOffset);
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004211 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
Justin Lebar9c375812016-07-15 18:27:10 +00004212 MinAlign(64, StructOffset),
Justin Lebaradbf09e2016-09-11 01:38:58 +00004213 MachineMemOperand::MODereferenceable |
4214 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00004215}
4216
4217SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4218 SelectionDAG &DAG) const {
4219 SDLoc SL(Op);
4220 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4221
4222 SDValue Src = ASC->getOperand(0);
Matt Arsenault99c14522016-04-25 19:27:24 +00004223 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4224
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004225 const AMDGPUTargetMachine &TM =
4226 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4227
Matt Arsenault99c14522016-04-25 19:27:24 +00004228 // flat -> local/private
Matt Arsenault0da63502018-08-31 05:49:54 +00004229 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004230 unsigned DestAS = ASC->getDestAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004231
Matt Arsenault0da63502018-08-31 05:49:54 +00004232 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4233 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004234 unsigned NullVal = TM.getNullPointerValue(DestAS);
4235 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault99c14522016-04-25 19:27:24 +00004236 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4237 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4238
4239 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4240 NonNull, Ptr, SegmentNullPtr);
4241 }
4242 }
4243
4244 // local/private -> flat
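  // A 32-bit segment pointer is widened to 64 bits by pairing it with the
  // segment's aperture base as the high half, unless it is the segment null
  // value, which is mapped to the flat null pointer.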
Matt Arsenault0da63502018-08-31 05:49:54 +00004245 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004246 unsigned SrcAS = ASC->getSrcAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004247
Matt Arsenault0da63502018-08-31 05:49:54 +00004248 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4249 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004250 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4251 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault971c85e2017-03-13 19:47:31 +00004252
Matt Arsenault99c14522016-04-25 19:27:24 +00004253 SDValue NonNull
4254 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4255
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004256 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00004257 SDValue CvtPtr
4258 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4259
4260 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4261 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4262 FlatNullPtr);
4263 }
4264 }
4265
4266 // global <-> flat are no-ops and never emitted.
4267
4268 const MachineFunction &MF = DAG.getMachineFunction();
4269 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Matthias Braunf1caa282017-12-15 22:22:58 +00004270 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
Matt Arsenault99c14522016-04-25 19:27:24 +00004271 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4272
4273 return DAG.getUNDEF(ASC->getValueType(0));
4274}
4275
Matt Arsenault3aef8092017-01-23 23:09:58 +00004276SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4277 SelectionDAG &DAG) const {
Matt Arsenault67a98152018-05-16 11:47:30 +00004278 SDValue Vec = Op.getOperand(0);
4279 SDValue InsVal = Op.getOperand(1);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004280 SDValue Idx = Op.getOperand(2);
Matt Arsenault67a98152018-05-16 11:47:30 +00004281 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004282 EVT EltVT = VecVT.getVectorElementType();
4283 unsigned VecSize = VecVT.getSizeInBits();
4284 unsigned EltSize = EltVT.getSizeInBits();
Matt Arsenault67a98152018-05-16 11:47:30 +00004285
Matt Arsenault9224c002018-06-05 19:52:46 +00004286
4287 assert(VecSize <= 64);
Matt Arsenault67a98152018-05-16 11:47:30 +00004288
4289 unsigned NumElts = VecVT.getVectorNumElements();
4290 SDLoc SL(Op);
4291 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4292
Matt Arsenault9224c002018-06-05 19:52:46 +00004293 if (NumElts == 4 && EltSize == 16 && KIdx) {
Matt Arsenault67a98152018-05-16 11:47:30 +00004294 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4295
4296 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4297 DAG.getConstant(0, SL, MVT::i32));
4298 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4299 DAG.getConstant(1, SL, MVT::i32));
4300
4301 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4302 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4303
4304 unsigned Idx = KIdx->getZExtValue();
4305 bool InsertLo = Idx < 2;
4306 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4307 InsertLo ? LoVec : HiVec,
4308 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4309 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4310
4311 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4312
4313 SDValue Concat = InsertLo ?
4314 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4315 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4316
4317 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4318 }
4319
Matt Arsenault3aef8092017-01-23 23:09:58 +00004320 if (isa<ConstantSDNode>(Idx))
4321 return SDValue();
4322
Matt Arsenault9224c002018-06-05 19:52:46 +00004323 MVT IntVT = MVT::getIntegerVT(VecSize);
Matt Arsenault67a98152018-05-16 11:47:30 +00004324
Matt Arsenault3aef8092017-01-23 23:09:58 +00004325 // Avoid stack access for dynamic indexing.
Matt Arsenault3aef8092017-01-23 23:09:58 +00004326 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
Tim Corringhamfa3e4e52019-02-01 16:51:09 +00004327
4328 // Create a congruent vector with the target value in each element so that
4329 // the required element can be masked and ORed into the target vector.
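  // For example, for a dynamic insert into v4i16 the bit-index is idx * 16, the
  // mask is 0xffff shifted left by that amount, and the result is
  // (mask & splat(val)) | (~mask & vec) computed on the integer view of the vector.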
4330 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4331 DAG.getSplatBuildVector(VecVT, SL, InsVal));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004332
Matt Arsenault9224c002018-06-05 19:52:46 +00004333 assert(isPowerOf2_32(EltSize));
4334 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4335
Matt Arsenault3aef8092017-01-23 23:09:58 +00004336 // Convert vector index to bit-index.
Matt Arsenault9224c002018-06-05 19:52:46 +00004337 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004338
Matt Arsenault67a98152018-05-16 11:47:30 +00004339 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4340 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4341 DAG.getConstant(0xffff, SL, IntVT),
Matt Arsenault3aef8092017-01-23 23:09:58 +00004342 ScaledIdx);
4343
Matt Arsenault67a98152018-05-16 11:47:30 +00004344 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4345 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4346 DAG.getNOT(SL, BFM, IntVT), BCVec);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004347
Matt Arsenault67a98152018-05-16 11:47:30 +00004348 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4349 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004350}
4351
4352SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4353 SelectionDAG &DAG) const {
4354 SDLoc SL(Op);
4355
4356 EVT ResultVT = Op.getValueType();
4357 SDValue Vec = Op.getOperand(0);
4358 SDValue Idx = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004359 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004360 unsigned VecSize = VecVT.getSizeInBits();
4361 EVT EltVT = VecVT.getVectorElementType();
4362 assert(VecSize <= 64);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004363
Matt Arsenault98f29462017-05-17 20:30:58 +00004364 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4365
Hiroshi Inoue372ffa12018-04-13 11:37:06 +00004366 // Make sure we do any optimizations that will make it easier to fold
Matt Arsenault98f29462017-05-17 20:30:58 +00004367 // source modifiers before obscuring it with bit operations.
4368
4369 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4370 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4371 return Combined;
4372
Matt Arsenault9224c002018-06-05 19:52:46 +00004373 unsigned EltSize = EltVT.getSizeInBits();
4374 assert(isPowerOf2_32(EltSize));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004375
Matt Arsenault9224c002018-06-05 19:52:46 +00004376 MVT IntVT = MVT::getIntegerVT(VecSize);
4377 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4378
4379 // Convert vector index to bit-index (* EltSize)
4380 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004381
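  // Treat the whole vector as a single integer; shifting it right by the
  // bit-index leaves the requested element in the low bits, ready to truncate.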
Matt Arsenault67a98152018-05-16 11:47:30 +00004382 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4383 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004384
Matt Arsenault67a98152018-05-16 11:47:30 +00004385 if (ResultVT == MVT::f16) {
4386 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4387 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4388 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004389
Matt Arsenault67a98152018-05-16 11:47:30 +00004390 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4391}
4392
4393SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4394 SelectionDAG &DAG) const {
4395 SDLoc SL(Op);
4396 EVT VT = Op.getValueType();
Matt Arsenault67a98152018-05-16 11:47:30 +00004397
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004398 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4399 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4400
4401 // Turn into pair of packed build_vectors.
4402 // TODO: Special case for constants that can be materialized with s_mov_b64.
4403 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4404 { Op.getOperand(0), Op.getOperand(1) });
4405 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4406 { Op.getOperand(2), Op.getOperand(3) });
4407
4408 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4409 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4410
4411 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4412 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4413 }
4414
Matt Arsenault1349a042018-05-22 06:32:10 +00004415 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004416 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
Matt Arsenault67a98152018-05-16 11:47:30 +00004417
Matt Arsenault1349a042018-05-22 06:32:10 +00004418 SDValue Lo = Op.getOperand(0);
4419 SDValue Hi = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004420
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004421 // Avoid adding defined bits with the zero_extend.
4422 if (Hi.isUndef()) {
4423 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4424 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4425 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4426 }
Matt Arsenault67a98152018-05-16 11:47:30 +00004427
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004428 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004429 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4430
4431 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4432 DAG.getConstant(16, SL, MVT::i32));
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004433 if (Lo.isUndef())
4434 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4435
4436 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4437 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Matt Arsenault1349a042018-05-22 06:32:10 +00004438
4439 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004440 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004441}
4442
Tom Stellard418beb72016-07-13 14:23:33 +00004443bool
4444SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4445 // We can fold offsets for anything that doesn't require a GOT relocation.
Matt Arsenault0da63502018-08-31 05:49:54 +00004446 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4447 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4448 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004449 !shouldEmitGOTReloc(GA->getGlobal());
Tom Stellard418beb72016-07-13 14:23:33 +00004450}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004451
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004452static SDValue
4453buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4454 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4455 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004456 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4457 // lowered to the following code sequence:
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004458 //
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004459 // For constant address space:
4460 // s_getpc_b64 s[0:1]
4461 // s_add_u32 s0, s0, $symbol
4462 // s_addc_u32 s1, s1, 0
4463 //
4464 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4465 // a fixup or relocation is emitted to replace $symbol with a literal
4466 // constant, which is a pc-relative offset from the encoding of the $symbol
4467 // operand to the global variable.
4468 //
4469 // For global address space:
4470 // s_getpc_b64 s[0:1]
4471 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4472 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4473 //
4474 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4475 // fixups or relocations are emitted to replace $symbol@*@lo and
4476 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4477 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4478 // operand to the global variable.
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004479 //
4480 // What we want here is an offset from the value returned by s_getpc
4481 // (which is the address of the s_add_u32 instruction) to the global
4482 // variable, but since the encoding of $symbol starts 4 bytes after the start
4483 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4484 // small. This requires us to add 4 to the global variable offset in order to
4485 // compute the correct address.
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004486 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4487 GAFlags);
4488 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4489 GAFlags == SIInstrInfo::MO_NONE ?
4490 GAFlags : GAFlags + 1);
4491 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004492}
4493
Tom Stellard418beb72016-07-13 14:23:33 +00004494SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4495 SDValue Op,
4496 SelectionDAG &DAG) const {
4497 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00004498 const GlobalValue *GV = GSD->getGlobal();
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004499 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4500 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4501 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
Tom Stellard418beb72016-07-13 14:23:33 +00004502 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4503
4504 SDLoc DL(GSD);
Tom Stellard418beb72016-07-13 14:23:33 +00004505 EVT PtrVT = Op.getValueType();
4506
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004507 // FIXME: Should not make address space based decisions here.
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004508 if (shouldEmitFixup(GV))
Tom Stellard418beb72016-07-13 14:23:33 +00004509 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004510 else if (shouldEmitPCReloc(GV))
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004511 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4512 SIInstrInfo::MO_REL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004513
4514 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004515 SIInstrInfo::MO_GOTPCREL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004516
4517 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00004518 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Tom Stellard418beb72016-07-13 14:23:33 +00004519 const DataLayout &DataLayout = DAG.getDataLayout();
4520 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
Matt Arsenaultd77fcc22018-09-10 02:23:39 +00004521 MachinePointerInfo PtrInfo
4522 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
Tom Stellard418beb72016-07-13 14:23:33 +00004523
Justin Lebar9c375812016-07-15 18:27:10 +00004524 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
Justin Lebaradbf09e2016-09-11 01:38:58 +00004525 MachineMemOperand::MODereferenceable |
4526 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00004527}
4528
Benjamin Kramerbdc49562016-06-12 15:39:02 +00004529SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4530 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004531 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4532 // the destination register.
4533 //
Tom Stellardfc92e772015-05-12 14:18:14 +00004534 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4535 // so we will end up with redundant moves to m0.
4536 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004537 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4538
4539 // A Null SDValue creates a glue result.
4540 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4541 V, Chain);
4542 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00004543}
4544
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004545SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4546 SDValue Op,
4547 MVT VT,
4548 unsigned Offset) const {
4549 SDLoc SL(Op);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00004550 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00004551 DAG.getEntryNode(), Offset, 4, false);
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004552 // The local size values will have the hi 16-bits as zero.
4553 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4554 DAG.getValueType(VT));
4555}
4556
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004557static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4558 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004559 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004560 "non-hsa intrinsic with hsa target",
4561 DL.getDebugLoc());
4562 DAG.getContext()->diagnose(BadIntrin);
4563 return DAG.getUNDEF(VT);
4564}
4565
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004566static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4567 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004568 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004569 "intrinsic not supported on subtarget",
4570 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00004571 DAG.getContext()->diagnose(BadIntrin);
4572 return DAG.getUNDEF(VT);
4573}
4574
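// Bitcast the given SDValues to f32 and pad with undef up to the next MIMG
// vdata width (1, 2, 4, 8 or 16 dwords), returning a single f32 for one
// element or a build_vector otherwise.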
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004575static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4576 ArrayRef<SDValue> Elts) {
4577 assert(!Elts.empty());
4578 MVT Type;
4579 unsigned NumElts;
4580
4581 if (Elts.size() == 1) {
4582 Type = MVT::f32;
4583 NumElts = 1;
4584 } else if (Elts.size() == 2) {
4585 Type = MVT::v2f32;
4586 NumElts = 2;
4587 } else if (Elts.size() <= 4) {
4588 Type = MVT::v4f32;
4589 NumElts = 4;
4590 } else if (Elts.size() <= 8) {
4591 Type = MVT::v8f32;
4592 NumElts = 8;
4593 } else {
4594 assert(Elts.size() <= 16);
4595 Type = MVT::v16f32;
4596 NumElts = 16;
4597 }
4598
4599 SmallVector<SDValue, 16> VecElts(NumElts);
4600 for (unsigned i = 0; i < Elts.size(); ++i) {
4601 SDValue Elt = Elts[i];
4602 if (Elt.getValueType() != MVT::f32)
4603 Elt = DAG.getBitcast(MVT::f32, Elt);
4604 VecElts[i] = Elt;
4605 }
4606 for (unsigned i = Elts.size(); i < NumElts; ++i)
4607 VecElts[i] = DAG.getUNDEF(MVT::f32);
4608
4609 if (NumElts == 1)
4610 return VecElts[0];
4611 return DAG.getBuildVector(Type, DL, VecElts);
4612}
4613
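// Split the cachepolicy immediate into glc/slc target constants. Returns true
// only if no bits other than the requested ones were set.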
4614static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4615 SDValue *GLC, SDValue *SLC) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004616 auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004617
4618 uint64_t Value = CachePolicyConst->getZExtValue();
4619 SDLoc DL(CachePolicy);
4620 if (GLC) {
4621 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4622 Value &= ~(uint64_t)0x1;
4623 }
4624 if (SLC) {
4625 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4626 Value &= ~(uint64_t)0x2;
4627 }
4628
4629 return Value == 0;
4630}
4631
David Stuttardf77079f2019-01-14 11:55:24 +00004632// Re-construct the required return value for an image load intrinsic.
4633// This is more complicated due to the optional use of TexFailCtrl, which means
4634// the required return type is an aggregate.
4635static SDValue constructRetValue(SelectionDAG &DAG,
4636 MachineSDNode *Result,
4637 ArrayRef<EVT> ResultTypes,
4638 bool IsTexFail, bool Unpacked, bool IsD16,
4639 int DMaskPop, int NumVDataDwords,
4640 const SDLoc &DL, LLVMContext &Context) {
4641  // Determine the required return type. This is the same regardless of the IsTexFail flag.
4642 EVT ReqRetVT = ResultTypes[0];
4643 EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4644 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4645 EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4646 EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4647 : AdjEltVT
4648 : ReqRetVT;
4649
4650  // Extract the data part of the result.
4651  // Bitcast the result to the same type as the required return type.
4652 int NumElts;
4653 if (IsD16 && !Unpacked)
4654 NumElts = NumVDataDwords << 1;
4655 else
4656 NumElts = NumVDataDwords;
4657
4658 EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4659 : AdjEltVT;
4660
4661 // Special case for v8f16. Rather than add support for this, use v4i32 to
4662 // extract the data elements
4663 bool V8F16Special = false;
4664 if (CastVT == MVT::v8f16) {
4665 CastVT = MVT::v4i32;
4666 DMaskPop >>= 1;
4667 ReqRetNumElts >>= 1;
4668 V8F16Special = true;
4669 AdjVT = MVT::v2i32;
4670 }
4671
4672 SDValue N = SDValue(Result, 0);
4673 SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4674
4675 // Iterate over the result
4676 SmallVector<SDValue, 4> BVElts;
4677
4678 if (CastVT.isVector()) {
4679 DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4680 } else {
4681 BVElts.push_back(CastRes);
4682 }
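  // Pad with undef if the dmask selected fewer elements than the intrinsic's
  // declared return type requires.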
4683 int ExtraElts = ReqRetNumElts - DMaskPop;
4684  while (ExtraElts--)
4685 BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4686
4687 SDValue PreTFCRes;
4688 if (ReqRetNumElts > 1) {
4689 SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4690 if (IsD16 && Unpacked)
4691 PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4692 else
4693 PreTFCRes = NewVec;
4694 } else {
4695 PreTFCRes = BVElts[0];
4696 }
4697
4698 if (V8F16Special)
4699 PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4700
4701 if (!IsTexFail) {
4702 if (Result->getNumValues() > 1)
4703 return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4704 else
4705 return PreTFCRes;
4706 }
4707
4708 // Extract the TexFail result and insert into aggregate return
4709 SmallVector<SDValue, 1> TFCElt;
4710 DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4711 SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4712 return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4713}
4714
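// Decode the texfailctrl immediate into tfe/lwe target constants, recording in
// IsTexFail whether any TFC behaviour was requested. Returns true only if no
// unknown bits were set.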
4715static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4716 SDValue *LWE, bool &IsTexFail) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004717 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
David Stuttardf77079f2019-01-14 11:55:24 +00004718
4719 uint64_t Value = TexFailCtrlConst->getZExtValue();
4720 if (Value) {
4721 IsTexFail = true;
4722 }
4723
4724 SDLoc DL(TexFailCtrlConst);
4725 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4726 Value &= ~(uint64_t)0x1;
4727 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4728 Value &= ~(uint64_t)0x2;
4729
4730 return Value == 0;
4731}
4732
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004733SDValue SITargetLowering::lowerImage(SDValue Op,
4734 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4735 SelectionDAG &DAG) const {
4736 SDLoc DL(Op);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004737 MachineFunction &MF = DAG.getMachineFunction();
4738 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004739 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4740 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4741 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004742 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4743 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4744 unsigned IntrOpcode = Intr->BaseOpcode;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004745
David Stuttardf77079f2019-01-14 11:55:24 +00004746 SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4747 SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004748 bool IsD16 = false;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004749 bool IsA16 = false;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004750 SDValue VData;
4751 int NumVDataDwords;
David Stuttardf77079f2019-01-14 11:55:24 +00004752 bool AdjustRetType = false;
4753
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004754 unsigned AddrIdx; // Index of first address argument
4755 unsigned DMask;
David Stuttardf77079f2019-01-14 11:55:24 +00004756 unsigned DMaskLanes = 0;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004757
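  // For atomics the data operand(s) supply the value(s) used by the atomic
  // operation; the X2 variants carry two data operands which are packed into a
  // single vector here.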
4758 if (BaseOpcode->Atomic) {
4759 VData = Op.getOperand(2);
4760
4761 bool Is64Bit = VData.getValueType() == MVT::i64;
4762 if (BaseOpcode->AtomicX2) {
4763 SDValue VData2 = Op.getOperand(3);
4764 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4765 {VData, VData2});
4766 if (Is64Bit)
4767 VData = DAG.getBitcast(MVT::v4i32, VData);
4768
4769 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4770 DMask = Is64Bit ? 0xf : 0x3;
4771 NumVDataDwords = Is64Bit ? 4 : 2;
4772 AddrIdx = 4;
4773 } else {
4774 DMask = Is64Bit ? 0x3 : 0x1;
4775 NumVDataDwords = Is64Bit ? 2 : 1;
4776 AddrIdx = 3;
4777 }
4778 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004779 unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004780 auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
David Stuttardf77079f2019-01-14 11:55:24 +00004781 DMask = DMaskConst->getZExtValue();
4782 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004783
4784 if (BaseOpcode->Store) {
4785 VData = Op.getOperand(2);
4786
4787 MVT StoreVT = VData.getSimpleValueType();
4788 if (StoreVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004789 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004790 !BaseOpcode->HasD16)
4791 return Op; // D16 is unsupported for this instruction
4792
4793 IsD16 = true;
4794 VData = handleD16VData(VData, DAG);
4795 }
4796
4797 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004798 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004799      // Work out the number of dwords based on the dmask popcount, the
4800      // underlying type, and whether packing is supported.
4801 MVT LoadVT = ResultTypes[0].getSimpleVT();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004802 if (LoadVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004803 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004804 !BaseOpcode->HasD16)
4805 return Op; // D16 is unsupported for this instruction
4806
4807 IsD16 = true;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004808 }
4809
David Stuttardf77079f2019-01-14 11:55:24 +00004810 // Confirm that the return type is large enough for the dmask specified
4811 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4812 (!LoadVT.isVector() && DMaskLanes > 1))
4813 return Op;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004814
David Stuttardf77079f2019-01-14 11:55:24 +00004815 if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4816 NumVDataDwords = (DMaskLanes + 1) / 2;
4817 else
4818 NumVDataDwords = DMaskLanes;
4819
4820 AdjustRetType = true;
4821 }
David Stuttardc6603862018-11-29 20:14:17 +00004822
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004823 AddrIdx = DMaskIdx + 1;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004824 }
4825
Ryan Taylor1f334d02018-08-28 15:07:30 +00004826 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4827 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4828 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4829 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4830 NumCoords + NumLCM;
4831 unsigned NumMIVAddrs = NumVAddrs;
4832
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004833 SmallVector<SDValue, 4> VAddrs;
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004834
4835  // Optimize _L intrinsics to _LZ when the LOD argument is a constant zero (or negative).
4836 if (LZMappingInfo) {
4837 if (auto ConstantLod =
Ryan Taylor1f334d02018-08-28 15:07:30 +00004838 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004839 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4840 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
Ryan Taylor1f334d02018-08-28 15:07:30 +00004841 NumMIVAddrs--; // remove 'lod'
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004842 }
4843 }
4844 }
4845
Ryan Taylor1f334d02018-08-28 15:07:30 +00004846  // Check for 16-bit addresses and pack them into pairs if so.
4847 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4848 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
Neil Henning63718b22018-10-31 10:34:48 +00004849 const MVT VAddrScalarVT = VAddrVT.getScalarType();
4850 if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
Ryan Taylor1f334d02018-08-28 15:07:30 +00004851 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4852 IsA16 = true;
Neil Henning63718b22018-10-31 10:34:48 +00004853 const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004854 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4855 SDValue AddrLo, AddrHi;
4856 // Push back extra arguments.
4857 if (i < DimIdx) {
4858 AddrLo = Op.getOperand(i);
4859 } else {
4860 AddrLo = Op.getOperand(i);
4861 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4862 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4863 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004864 ((NumGradients / 2) % 2 == 1 &&
4865 (i == DimIdx + (NumGradients / 2) - 1 ||
Ryan Taylor1f334d02018-08-28 15:07:30 +00004866 i == DimIdx + NumGradients - 1))) {
4867 AddrHi = DAG.getUNDEF(MVT::f16);
4868 } else {
4869 AddrHi = Op.getOperand(i + 1);
4870 i++;
4871 }
Neil Henning63718b22018-10-31 10:34:48 +00004872 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
Ryan Taylor1f334d02018-08-28 15:07:30 +00004873 {AddrLo, AddrHi});
4874 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4875 }
4876 VAddrs.push_back(AddrLo);
4877 }
4878 } else {
4879 for (unsigned i = 0; i < NumMIVAddrs; ++i)
4880 VAddrs.push_back(Op.getOperand(AddrIdx + i));
4881 }
4882
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004883 SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4884
4885 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4886 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4887 unsigned CtrlIdx; // Index of texfailctrl argument
4888 SDValue Unorm;
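  // Sampler operations carry an explicit unorm operand and a sampler argument;
  // for non-sampler operations unorm is implicitly enabled.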
4889 if (!BaseOpcode->Sampler) {
4890 Unorm = True;
4891 CtrlIdx = AddrIdx + NumVAddrs + 1;
4892 } else {
4893 auto UnormConst =
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004894 cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004895
4896 Unorm = UnormConst->getZExtValue() ? True : False;
4897 CtrlIdx = AddrIdx + NumVAddrs + 3;
4898 }
4899
David Stuttardf77079f2019-01-14 11:55:24 +00004900 SDValue TFE;
4901 SDValue LWE;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004902 SDValue TexFail = Op.getOperand(CtrlIdx);
David Stuttardf77079f2019-01-14 11:55:24 +00004903 bool IsTexFail = false;
4904 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004905 return Op;
4906
David Stuttardf77079f2019-01-14 11:55:24 +00004907 if (IsTexFail) {
4908 if (!DMaskLanes) {
4909      // Expecting to get an error flag since TFC is on and dmask is 0.
4910      // Force dmask to be at least 1, otherwise the instruction will fail.
4911 DMask = 0x1;
4912 DMaskLanes = 1;
4913 NumVDataDwords = 1;
4914 }
4915 NumVDataDwords += 1;
4916 AdjustRetType = true;
4917 }
4918
4919  // Something earlier has tagged the return type as needing adjustment.
4920  // This happens if the instruction is a load or has TexFailCtrl flags set.
4921 if (AdjustRetType) {
4922 // NumVDataDwords reflects the true number of dwords required in the return type
4923 if (DMaskLanes == 0 && !BaseOpcode->Store) {
4924 // This is a no-op load. This can be eliminated
4925 SDValue Undef = DAG.getUNDEF(Op.getValueType());
4926 if (isa<MemSDNode>(Op))
4927 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4928 return Undef;
4929 }
4930
4931 // Have to use a power of 2 number of dwords
4932 NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
4933
4934 EVT NewVT = NumVDataDwords > 1 ?
4935 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
4936 : MVT::f32;
4937
4938 ResultTypes[0] = NewVT;
4939 if (ResultTypes.size() == 3) {
4940      // The original result was an aggregate type used for the TexFailCtrl result.
4941      // The actual instruction returns a vector type, which has now been
4942      // created. Remove the aggregate result.
4943 ResultTypes.erase(&ResultTypes[1]);
4944 }
4945 }
4946
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004947 SDValue GLC;
4948 SDValue SLC;
4949 if (BaseOpcode->Atomic) {
4950 GLC = True; // TODO no-return optimization
4951 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4952 return Op;
4953 } else {
4954 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4955 return Op;
4956 }
4957
4958 SmallVector<SDValue, 14> Ops;
4959 if (BaseOpcode->Store || BaseOpcode->Atomic)
4960 Ops.push_back(VData); // vdata
4961 Ops.push_back(VAddr);
4962 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4963 if (BaseOpcode->Sampler)
4964 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4965 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4966 Ops.push_back(Unorm);
4967 Ops.push_back(GLC);
4968 Ops.push_back(SLC);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004969 Ops.push_back(IsA16 && // a16 or r128
4970 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
David Stuttardf77079f2019-01-14 11:55:24 +00004971 Ops.push_back(TFE); // tfe
4972 Ops.push_back(LWE); // lwe
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004973 Ops.push_back(DimInfo->DA ? True : False);
4974 if (BaseOpcode->HasD16)
4975 Ops.push_back(IsD16 ? True : False);
4976 if (isa<MemSDNode>(Op))
4977 Ops.push_back(Op.getOperand(0)); // chain
4978
4979 int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4980 int Opcode = -1;
4981
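  // Prefer the gfx8 encoding when the subtarget supports it, otherwise fall
  // back to the gfx6 encoding.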
Tom Stellard5bfbae52018-07-11 20:59:01 +00004982 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004983 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004984 NumVDataDwords, NumVAddrDwords);
4985 if (Opcode == -1)
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004986 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004987 NumVDataDwords, NumVAddrDwords);
4988 assert(Opcode != -1);
4989
4990 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4991 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
Chandler Carruth66654b72018-08-14 23:30:32 +00004992 MachineMemOperand *MemRef = MemOp->getMemOperand();
4993 DAG.setNodeMemRefs(NewNode, {MemRef});
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004994 }
4995
4996 if (BaseOpcode->AtomicX2) {
4997 SmallVector<SDValue, 1> Elt;
4998 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
4999 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
David Stuttardf77079f2019-01-14 11:55:24 +00005000 } else if (!BaseOpcode->Store) {
5001 return constructRetValue(DAG, NewNode,
5002 OrigResultTypes, IsTexFail,
5003 Subtarget->hasUnpackedD16VMem(), IsD16,
5004 DMaskLanes, NumVDataDwords, DL,
5005 *DAG.getContext());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005006 }
5007
5008 return SDValue(NewNode, 0);
5009}
5010
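// Lower an s_buffer load. A uniform offset is selected to a single SMEM
// (SBUFFER_LOAD) node; a divergent offset is expanded into one or more MUBUF
// buffer loads of up to four dwords each.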
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005011SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5012 SDValue Offset, SDValue GLC,
5013 SelectionDAG &DAG) const {
5014 MachineFunction &MF = DAG.getMachineFunction();
5015 MachineMemOperand *MMO = MF.getMachineMemOperand(
5016 MachinePointerInfo(),
5017 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5018 MachineMemOperand::MOInvariant,
5019 VT.getStoreSize(), VT.getStoreSize());
5020
5021 if (!Offset->isDivergent()) {
5022 SDValue Ops[] = {
5023 Rsrc,
5024 Offset, // Offset
5025 GLC // glc
5026 };
5027 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5028 DAG.getVTList(VT), Ops, VT, MMO);
5029 }
5030
5031 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5032 // assume that the buffer is unswizzled.
5033 SmallVector<SDValue, 4> Loads;
5034 unsigned NumLoads = 1;
5035 MVT LoadVT = VT.getSimpleVT();
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005036 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
Simon Pilgrim44dfd812018-12-07 21:44:25 +00005037 assert((LoadVT.getScalarType() == MVT::i32 ||
5038 LoadVT.getScalarType() == MVT::f32) &&
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005039 isPowerOf2_32(NumElts));
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005040
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005041 if (NumElts == 8 || NumElts == 16) {
5042 NumLoads = NumElts == 16 ? 4 : 2;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005043 LoadVT = MVT::v4i32;
5044 }
5045
5046 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5047 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5048 SDValue Ops[] = {
5049 DAG.getEntryNode(), // Chain
5050 Rsrc, // rsrc
5051 DAG.getConstant(0, DL, MVT::i32), // vindex
5052 {}, // voffset
5053 {}, // soffset
5054 {}, // offset
5055 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5056 DAG.getConstant(0, DL, MVT::i1), // idxen
5057 };
5058
5059 // Use the alignment to ensure that the required offsets will fit into the
5060 // immediate offsets.
5061 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5062
5063 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5064 for (unsigned i = 0; i < NumLoads; ++i) {
5065 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5066 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5067 Ops, LoadVT, MMO));
5068 }
5069
5070 if (VT == MVT::v8i32 || VT == MVT::v16i32)
5071 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5072
5073 return Loads[0];
5074}
5075
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005076SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5077 SelectionDAG &DAG) const {
5078 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00005079 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005080
5081 EVT VT = Op.getValueType();
5082 SDLoc DL(Op);
5083 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5084
Sanjay Patela2607012015-09-16 16:31:21 +00005085 // TODO: Should this propagate fast-math-flags?
5086
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005087 switch (IntrinsicID) {
Tom Stellard2f3f9852017-01-25 01:25:13 +00005088 case Intrinsic::amdgcn_implicit_buffer_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005089 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
Matt Arsenault10fc0622017-06-26 03:01:31 +00005090 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005091 return getPreloadedValue(DAG, *MFI, VT,
5092 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
Tom Stellard2f3f9852017-01-25 01:25:13 +00005093 }
Tom Stellard48f29f22015-11-26 00:43:29 +00005094 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00005095 case Intrinsic::amdgcn_queue_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005096 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005097 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005098 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005099 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00005100 DAG.getContext()->diagnose(BadIntrin);
5101 return DAG.getUNDEF(VT);
5102 }
5103
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005104 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5105 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5106 return getPreloadedValue(DAG, *MFI, VT, RegID);
Matt Arsenault48ab5262016-04-25 19:27:18 +00005107 }
Jan Veselyfea814d2016-06-21 20:46:20 +00005108 case Intrinsic::amdgcn_implicitarg_ptr: {
Matt Arsenault9166ce82017-07-28 15:52:08 +00005109 if (MFI->isEntryFunction())
5110 return getImplicitArgPtr(DAG, DL);
Matt Arsenault817c2532017-08-03 23:12:44 +00005111 return getPreloadedValue(DAG, *MFI, VT,
5112 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
Jan Veselyfea814d2016-06-21 20:46:20 +00005113 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005114 case Intrinsic::amdgcn_kernarg_segment_ptr: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005115 return getPreloadedValue(DAG, *MFI, VT,
5116 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005117 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005118 case Intrinsic::amdgcn_dispatch_id: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005119 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005120 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005121 case Intrinsic::amdgcn_rcp:
5122 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5123 case Intrinsic::amdgcn_rsq:
5124 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005125 case Intrinsic::amdgcn_rsq_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005126 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005127 return emitRemovedIntrinsicError(DAG, DL, VT);
5128
5129 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005130 case Intrinsic::amdgcn_rcp_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005131 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault32fc5272016-07-26 16:45:45 +00005132 return emitRemovedIntrinsicError(DAG, DL, VT);
5133 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00005134 case Intrinsic::amdgcn_rsq_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005135 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00005136 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00005137
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005138 Type *Type = VT.getTypeForEVT(*DAG.getContext());
5139 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5140 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5141
5142 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5143 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5144 DAG.getConstantFP(Max, DL, VT));
5145 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5146 DAG.getConstantFP(Min, DL, VT));
5147 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005148 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005149 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005150 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005151
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005152 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005153 SI::KernelInputOffsets::NGROUPS_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005154 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005155 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005156 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005157
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005158 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005159 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005160 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005161 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005162 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005163
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005164 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005165 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005166 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005167 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005168 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005169
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005170 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005171 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005172 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005173 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005174 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005175
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005176 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005177 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005178 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005179 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005180 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005181
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005182 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005183 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005184 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005185 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005186 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005187
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005188 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5189 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005190 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005191 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005192 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005193
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005194 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5195 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005196 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005197 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005198 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005199
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005200 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5201 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00005202 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005203 case Intrinsic::r600_read_tgid_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005204 return getPreloadedValue(DAG, *MFI, VT,
5205 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Matt Arsenault43976df2016-01-30 04:25:19 +00005206 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005207 case Intrinsic::r600_read_tgid_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005208 return getPreloadedValue(DAG, *MFI, VT,
5209 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
Matt Arsenault43976df2016-01-30 04:25:19 +00005210 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005211 case Intrinsic::r600_read_tgid_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005212 return getPreloadedValue(DAG, *MFI, VT,
5213 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Reid Kleckner4dc0b1a2018-11-01 19:54:45 +00005214 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005215 case Intrinsic::r600_read_tidig_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005216 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5217 SDLoc(DAG.getEntryNode()),
5218 MFI->getArgInfo().WorkItemIDX);
Matt Arsenault43976df2016-01-30 04:25:19 +00005219 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005220 case Intrinsic::r600_read_tidig_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005221 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5222 SDLoc(DAG.getEntryNode()),
5223 MFI->getArgInfo().WorkItemIDY);
Matt Arsenault43976df2016-01-30 04:25:19 +00005224 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005225 case Intrinsic::r600_read_tidig_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005226 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5227 SDLoc(DAG.getEntryNode()),
5228 MFI->getArgInfo().WorkItemIDZ);
Tim Renouf904343f2018-08-25 14:53:17 +00005229 case Intrinsic::amdgcn_s_buffer_load: {
5230 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005231 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5232 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005233 }
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00005234 case Intrinsic::amdgcn_fdiv_fast:
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00005235 return lowerFDIV_FAST(Op, DAG);
Tom Stellard2187bb82016-12-06 23:52:13 +00005236 case Intrinsic::amdgcn_interp_mov: {
5237 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5238 SDValue Glue = M0.getValue(1);
5239 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5240 Op.getOperand(2), Op.getOperand(3), Glue);
5241 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00005242 case Intrinsic::amdgcn_interp_p1: {
5243 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5244 SDValue Glue = M0.getValue(1);
5245 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5246 Op.getOperand(2), Op.getOperand(3), Glue);
5247 }
5248 case Intrinsic::amdgcn_interp_p2: {
5249 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5250 SDValue Glue = SDValue(M0.getNode(), 1);
5251 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5252 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5253 Glue);
5254 }
Tim Corringham824ca3f2019-01-28 13:48:59 +00005255 case Intrinsic::amdgcn_interp_p1_f16: {
5256 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5257 SDValue Glue = M0.getValue(1);
5258 if (getSubtarget()->getLDSBankCount() == 16) {
5259 // 16 bank LDS
5260 SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5261 DAG.getConstant(2, DL, MVT::i32), // P0
5262 Op.getOperand(2), // Attrchan
5263 Op.getOperand(3), // Attr
5264 Glue);
5265 SDValue Ops[] = {
5266 Op.getOperand(1), // Src0
5267 Op.getOperand(2), // Attrchan
5268 Op.getOperand(3), // Attr
5269 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5270 S, // Src2 - holds two f16 values selected by high
5271 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5272 Op.getOperand(4), // high
5273 DAG.getConstant(0, DL, MVT::i1), // $clamp
5274 DAG.getConstant(0, DL, MVT::i32) // $omod
5275 };
5276 return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5277 } else {
5278 // 32 bank LDS
5279 SDValue Ops[] = {
5280 Op.getOperand(1), // Src0
5281 Op.getOperand(2), // Attrchan
5282 Op.getOperand(3), // Attr
5283 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5284 Op.getOperand(4), // high
5285 DAG.getConstant(0, DL, MVT::i1), // $clamp
5286 DAG.getConstant(0, DL, MVT::i32), // $omod
5287 Glue
5288 };
5289 return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5290 }
5291 }
5292 case Intrinsic::amdgcn_interp_p2_f16: {
5293 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5294 SDValue Glue = SDValue(M0.getNode(), 1);
5295 SDValue Ops[] = {
5296 Op.getOperand(2), // Src0
5297 Op.getOperand(3), // Attrchan
5298 Op.getOperand(4), // Attr
5299 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5300 Op.getOperand(1), // Src2
5301 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5302 Op.getOperand(5), // high
5303 DAG.getConstant(0, DL, MVT::i1), // $clamp
5304 Glue
5305 };
5306 return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5307 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005308 case Intrinsic::amdgcn_sin:
5309 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5310
5311 case Intrinsic::amdgcn_cos:
5312 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5313
5314 case Intrinsic::amdgcn_log_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005315 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005316 return SDValue();
5317
5318 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005319 MF.getFunction(), "intrinsic not supported on subtarget",
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005320 DL.getDebugLoc());
5321 DAG.getContext()->diagnose(BadIntrin);
5322 return DAG.getUNDEF(VT);
5323 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005324 case Intrinsic::amdgcn_ldexp:
5325 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5326 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00005327
5328 case Intrinsic::amdgcn_fract:
5329 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5330
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005331 case Intrinsic::amdgcn_class:
5332 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5333 Op.getOperand(1), Op.getOperand(2));
5334 case Intrinsic::amdgcn_div_fmas:
5335 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5336 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5337 Op.getOperand(4));
5338
5339 case Intrinsic::amdgcn_div_fixup:
5340 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5341 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5342
5343 case Intrinsic::amdgcn_trig_preop:
5344 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5345 Op.getOperand(1), Op.getOperand(2));
5346 case Intrinsic::amdgcn_div_scale: {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005347 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005348
5349 // Translate to the operands expected by the machine instruction. The
5350 // first parameter must be the same as the first instruction.
5351 SDValue Numerator = Op.getOperand(1);
5352 SDValue Denominator = Op.getOperand(2);
5353
5354 // Note this order is opposite of the machine instruction's operations,
5355 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5356 // intrinsic has the numerator as the first operand to match a normal
5357 // division operation.
5358
5359 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5360
5361 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5362 Denominator, Numerator);
5363 }
Wei Ding07e03712016-07-28 16:42:13 +00005364 case Intrinsic::amdgcn_icmp: {
Marek Olsak33eb4d92019-01-15 02:13:18 +00005365 // There is a Pat that handles this variant, so return it as-is.
5366 if (Op.getOperand(1).getValueType() == MVT::i1 &&
5367 Op.getConstantOperandVal(2) == 0 &&
5368 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5369 return Op;
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005370 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005371 }
5372 case Intrinsic::amdgcn_fcmp: {
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005373 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005374 }
Matt Arsenaultf84e5d92017-01-31 03:07:46 +00005375 case Intrinsic::amdgcn_fmed3:
5376 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5377 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Farhana Aleenc370d7b2018-07-16 18:19:59 +00005378 case Intrinsic::amdgcn_fdot2:
5379 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00005380 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5381 Op.getOperand(4));
Matt Arsenault32fc5272016-07-26 16:45:45 +00005382 case Intrinsic::amdgcn_fmul_legacy:
5383 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5384 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005385 case Intrinsic::amdgcn_sffbh:
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005386 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaultf5262252017-02-22 23:04:58 +00005387 case Intrinsic::amdgcn_sbfe:
5388 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5389 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5390 case Intrinsic::amdgcn_ubfe:
5391 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5392 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Marek Olsak13e47412018-01-31 20:18:04 +00005393 case Intrinsic::amdgcn_cvt_pkrtz:
5394 case Intrinsic::amdgcn_cvt_pknorm_i16:
5395 case Intrinsic::amdgcn_cvt_pknorm_u16:
5396 case Intrinsic::amdgcn_cvt_pk_i16:
5397 case Intrinsic::amdgcn_cvt_pk_u16: {
5398 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
Matt Arsenault1f17c662017-02-22 00:27:34 +00005399 EVT VT = Op.getValueType();
Marek Olsak13e47412018-01-31 20:18:04 +00005400 unsigned Opcode;
5401
5402 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5403 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5404 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5405 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5406 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5407 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5408 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5409 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5410 else
5411 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5412
Matt Arsenault709374d2018-08-01 20:13:58 +00005413 if (isTypeLegal(VT))
5414 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5415
Marek Olsak13e47412018-01-31 20:18:04 +00005416 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Matt Arsenault1f17c662017-02-22 00:27:34 +00005417 Op.getOperand(1), Op.getOperand(2));
5418 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5419 }
Connor Abbott8c217d02017-08-04 18:36:49 +00005420 case Intrinsic::amdgcn_wqm: {
5421 SDValue Src = Op.getOperand(1);
5422 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5423 0);
5424 }
Connor Abbott92638ab2017-08-04 18:36:52 +00005425 case Intrinsic::amdgcn_wwm: {
5426 SDValue Src = Op.getOperand(1);
5427 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5428 0);
5429 }
Stanislav Mekhanoshindacda792018-06-26 20:04:19 +00005430 case Intrinsic::amdgcn_fmad_ftz:
5431 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5432 Op.getOperand(2), Op.getOperand(3));
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005433 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005434 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5435 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5436 return lowerImage(Op, ImageDimIntr, DAG);
5437
Matt Arsenault754dd3e2017-04-03 18:08:08 +00005438 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005439 }
5440}
5441
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005442SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5443 SelectionDAG &DAG) const {
5444 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Tom Stellard6f9ef142016-12-20 17:19:44 +00005445 SDLoc DL(Op);
David Stuttard70e8bc12017-06-22 16:29:22 +00005446
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005447 switch (IntrID) {
Marek Olsakc5cec5e2019-01-16 15:43:53 +00005448 case Intrinsic::amdgcn_ds_ordered_add:
5449 case Intrinsic::amdgcn_ds_ordered_swap: {
5450 MemSDNode *M = cast<MemSDNode>(Op);
5451 SDValue Chain = M->getOperand(0);
5452 SDValue M0 = M->getOperand(2);
5453 SDValue Value = M->getOperand(3);
5454 unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5455 unsigned WaveRelease = M->getConstantOperandVal(8);
5456 unsigned WaveDone = M->getConstantOperandVal(9);
5457 unsigned ShaderType;
5458 unsigned Instruction;
5459
5460 switch (IntrID) {
5461 case Intrinsic::amdgcn_ds_ordered_add:
5462 Instruction = 0;
5463 break;
5464 case Intrinsic::amdgcn_ds_ordered_swap:
5465 Instruction = 1;
5466 break;
5467 }
5468
5469 if (WaveDone && !WaveRelease)
5470 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5471
5472 switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5473 case CallingConv::AMDGPU_CS:
5474 case CallingConv::AMDGPU_KERNEL:
5475 ShaderType = 0;
5476 break;
5477 case CallingConv::AMDGPU_PS:
5478 ShaderType = 1;
5479 break;
5480 case CallingConv::AMDGPU_VS:
5481 ShaderType = 2;
5482 break;
5483 case CallingConv::AMDGPU_GS:
5484 ShaderType = 3;
5485 break;
5486 default:
5487 report_fatal_error("ds_ordered_count unsupported for this calling conv");
5488 }
5489
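    // Pack the ordered-count operands into the 16-bit DS offset field: offset0
    // holds the dword-scaled ordered-count index, offset1 holds the
    // wave_release/wave_done bits, the shader type and the instruction selector.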
5490 unsigned Offset0 = OrderedCountIndex << 2;
5491 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5492 (Instruction << 4);
5493 unsigned Offset = Offset0 | (Offset1 << 8);
5494
5495 SDValue Ops[] = {
5496 Chain,
5497 Value,
5498 DAG.getTargetConstant(Offset, DL, MVT::i16),
5499 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5500 };
5501 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5502 M->getVTList(), Ops, M->getMemoryVT(),
5503 M->getMemOperand());
5504 }
Matt Arsenaulta5840c32019-01-22 18:36:06 +00005505 case Intrinsic::amdgcn_ds_fadd: {
5506 MemSDNode *M = cast<MemSDNode>(Op);
5507 unsigned Opc;
5508 switch (IntrID) {
5509 case Intrinsic::amdgcn_ds_fadd:
5510 Opc = ISD::ATOMIC_LOAD_FADD;
5511 break;
5512 }
5513
5514 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5515 M->getOperand(0), M->getOperand(2), M->getOperand(3),
5516 M->getMemOperand());
5517 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005518 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005519 case Intrinsic::amdgcn_atomic_dec:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005520 case Intrinsic::amdgcn_ds_fmin:
5521 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005522 MemSDNode *M = cast<MemSDNode>(Op);
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005523 unsigned Opc;
5524 switch (IntrID) {
5525 case Intrinsic::amdgcn_atomic_inc:
5526 Opc = AMDGPUISD::ATOMIC_INC;
5527 break;
5528 case Intrinsic::amdgcn_atomic_dec:
5529 Opc = AMDGPUISD::ATOMIC_DEC;
5530 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005531 case Intrinsic::amdgcn_ds_fmin:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005532 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5533 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005534 case Intrinsic::amdgcn_ds_fmax:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005535 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5536 break;
5537 default:
5538 llvm_unreachable("Unknown intrinsic!");
5539 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005540 SDValue Ops[] = {
5541 M->getOperand(0), // Chain
5542 M->getOperand(2), // Ptr
5543 M->getOperand(3) // Value
5544 };
5545
5546 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5547 M->getMemoryVT(), M->getMemOperand());
5548 }
Tom Stellard6f9ef142016-12-20 17:19:44 +00005549 case Intrinsic::amdgcn_buffer_load:
5550 case Intrinsic::amdgcn_buffer_load_format: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005551 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5552 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5553 unsigned IdxEn = 1;
5554 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5555 IdxEn = Idx->getZExtValue() != 0;
Tom Stellard6f9ef142016-12-20 17:19:44 +00005556 SDValue Ops[] = {
5557 Op.getOperand(0), // Chain
5558 Op.getOperand(2), // rsrc
5559 Op.getOperand(3), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005560 SDValue(), // voffset -- will be set by setBufferOffsets
5561 SDValue(), // soffset -- will be set by setBufferOffsets
5562 SDValue(), // offset -- will be set by setBufferOffsets
5563 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5564 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Tom Stellard6f9ef142016-12-20 17:19:44 +00005565 };
Tom Stellard6f9ef142016-12-20 17:19:44 +00005566
Tim Renouf4f703f52018-08-21 11:07:10 +00005567 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005568 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5569 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
Tim Renouf4f703f52018-08-21 11:07:10 +00005570
5571 EVT VT = Op.getValueType();
5572 EVT IntVT = VT.changeTypeToInteger();
5573 auto *M = cast<MemSDNode>(Op);
5574 EVT LoadVT = Op.getValueType();
5575
5576 if (LoadVT.getScalarType() == MVT::f16)
5577 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5578 M, DAG, Ops);
5579 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5580 M->getMemOperand());
5581 }
5582 case Intrinsic::amdgcn_raw_buffer_load:
5583 case Intrinsic::amdgcn_raw_buffer_load_format: {
5584 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5585 SDValue Ops[] = {
5586 Op.getOperand(0), // Chain
5587 Op.getOperand(2), // rsrc
5588 DAG.getConstant(0, DL, MVT::i32), // vindex
5589 Offsets.first, // voffset
5590 Op.getOperand(4), // soffset
5591 Offsets.second, // offset
5592 Op.getOperand(5), // cachepolicy
5593 DAG.getConstant(0, DL, MVT::i1), // idxen
5594 };
5595
5596 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5597 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5598
5599 EVT VT = Op.getValueType();
5600 EVT IntVT = VT.changeTypeToInteger();
5601 auto *M = cast<MemSDNode>(Op);
5602 EVT LoadVT = Op.getValueType();
5603
5604 if (LoadVT.getScalarType() == MVT::f16)
5605 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5606 M, DAG, Ops);
5607 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5608 M->getMemOperand());
5609 }
5610 case Intrinsic::amdgcn_struct_buffer_load:
5611 case Intrinsic::amdgcn_struct_buffer_load_format: {
5612 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5613 SDValue Ops[] = {
5614 Op.getOperand(0), // Chain
5615 Op.getOperand(2), // rsrc
5616 Op.getOperand(3), // vindex
5617 Offsets.first, // voffset
5618 Op.getOperand(5), // soffset
5619 Offsets.second, // offset
5620 Op.getOperand(6), // cachepolicy
5621 DAG.getConstant(1, DL, MVT::i1), // idxen
5622 };
5623
5624 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5625 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5626
Tom Stellard6f9ef142016-12-20 17:19:44 +00005627 EVT VT = Op.getValueType();
5628 EVT IntVT = VT.changeTypeToInteger();
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005629 auto *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005630 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005631
Tim Renouf366a49d2018-08-02 23:33:01 +00005632 if (LoadVT.getScalarType() == MVT::f16)
5633 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5634 M, DAG, Ops);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005635 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5636 M->getMemOperand());
Tom Stellard6f9ef142016-12-20 17:19:44 +00005637 }
David Stuttard70e8bc12017-06-22 16:29:22 +00005638 case Intrinsic::amdgcn_tbuffer_load: {
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005639 MemSDNode *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005640 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005641
Tim Renouf35484c92018-08-21 11:06:05 +00005642 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5643 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5644 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5645 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5646 unsigned IdxEn = 1;
5647 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5648 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00005649 SDValue Ops[] = {
5650 Op.getOperand(0), // Chain
5651 Op.getOperand(2), // rsrc
5652 Op.getOperand(3), // vindex
5653 Op.getOperand(4), // voffset
5654 Op.getOperand(5), // soffset
5655 Op.getOperand(6), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00005656 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5657 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5658 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5659 };
5660
5661 if (LoadVT.getScalarType() == MVT::f16)
5662 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5663 M, DAG, Ops);
5664 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5665 Op->getVTList(), Ops, LoadVT,
5666 M->getMemOperand());
5667 }
5668 case Intrinsic::amdgcn_raw_tbuffer_load: {
5669 MemSDNode *M = cast<MemSDNode>(Op);
5670 EVT LoadVT = Op.getValueType();
5671 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5672
5673 SDValue Ops[] = {
5674 Op.getOperand(0), // Chain
5675 Op.getOperand(2), // rsrc
5676 DAG.getConstant(0, DL, MVT::i32), // vindex
5677 Offsets.first, // voffset
5678 Op.getOperand(4), // soffset
5679 Offsets.second, // offset
5680 Op.getOperand(5), // format
5681 Op.getOperand(6), // cachepolicy
5682 DAG.getConstant(0, DL, MVT::i1), // idxen
5683 };
5684
5685 if (LoadVT.getScalarType() == MVT::f16)
5686 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5687 M, DAG, Ops);
5688 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5689 Op->getVTList(), Ops, LoadVT,
5690 M->getMemOperand());
5691 }
5692 case Intrinsic::amdgcn_struct_tbuffer_load: {
5693 MemSDNode *M = cast<MemSDNode>(Op);
5694 EVT LoadVT = Op.getValueType();
5695 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5696
5697 SDValue Ops[] = {
5698 Op.getOperand(0), // Chain
5699 Op.getOperand(2), // rsrc
5700 Op.getOperand(3), // vindex
5701 Offsets.first, // voffset
5702 Op.getOperand(5), // soffset
5703 Offsets.second, // offset
5704 Op.getOperand(6), // format
5705 Op.getOperand(7), // cachepolicy
5706 DAG.getConstant(1, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00005707 };
5708
Tim Renouf366a49d2018-08-02 23:33:01 +00005709 if (LoadVT.getScalarType() == MVT::f16)
5710 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5711 M, DAG, Ops);
David Stuttard70e8bc12017-06-22 16:29:22 +00005712 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Matt Arsenault1349a042018-05-22 06:32:10 +00005713 Op->getVTList(), Ops, LoadVT,
5714 M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00005715 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005716 case Intrinsic::amdgcn_buffer_atomic_swap:
5717 case Intrinsic::amdgcn_buffer_atomic_add:
5718 case Intrinsic::amdgcn_buffer_atomic_sub:
5719 case Intrinsic::amdgcn_buffer_atomic_smin:
5720 case Intrinsic::amdgcn_buffer_atomic_umin:
5721 case Intrinsic::amdgcn_buffer_atomic_smax:
5722 case Intrinsic::amdgcn_buffer_atomic_umax:
5723 case Intrinsic::amdgcn_buffer_atomic_and:
5724 case Intrinsic::amdgcn_buffer_atomic_or:
5725 case Intrinsic::amdgcn_buffer_atomic_xor: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005726 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5727 unsigned IdxEn = 1;
5728 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5729 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005730 SDValue Ops[] = {
5731 Op.getOperand(0), // Chain
5732 Op.getOperand(2), // vdata
5733 Op.getOperand(3), // rsrc
5734 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005735 SDValue(), // voffset -- will be set by setBufferOffsets
5736 SDValue(), // soffset -- will be set by setBufferOffsets
5737 SDValue(), // offset -- will be set by setBufferOffsets
5738 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5739 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005740 };
Tim Renouf4f703f52018-08-21 11:07:10 +00005741 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005742 EVT VT = Op.getValueType();
5743
5744 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005745 unsigned Opcode = 0;
5746
5747 switch (IntrID) {
5748 case Intrinsic::amdgcn_buffer_atomic_swap:
5749 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5750 break;
5751 case Intrinsic::amdgcn_buffer_atomic_add:
5752 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5753 break;
5754 case Intrinsic::amdgcn_buffer_atomic_sub:
5755 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5756 break;
5757 case Intrinsic::amdgcn_buffer_atomic_smin:
5758 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5759 break;
5760 case Intrinsic::amdgcn_buffer_atomic_umin:
5761 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5762 break;
5763 case Intrinsic::amdgcn_buffer_atomic_smax:
5764 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5765 break;
5766 case Intrinsic::amdgcn_buffer_atomic_umax:
5767 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5768 break;
5769 case Intrinsic::amdgcn_buffer_atomic_and:
5770 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5771 break;
5772 case Intrinsic::amdgcn_buffer_atomic_or:
5773 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5774 break;
5775 case Intrinsic::amdgcn_buffer_atomic_xor:
5776 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5777 break;
5778 default:
5779 llvm_unreachable("unhandled atomic opcode");
5780 }
5781
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005782 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5783 M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005784 }
Tim Renouf4f703f52018-08-21 11:07:10 +00005785 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5786 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5787 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5788 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5789 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5790 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5791 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5792 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5793 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5794 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5795 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5796 SDValue Ops[] = {
5797 Op.getOperand(0), // Chain
5798 Op.getOperand(2), // vdata
5799 Op.getOperand(3), // rsrc
5800 DAG.getConstant(0, DL, MVT::i32), // vindex
5801 Offsets.first, // voffset
5802 Op.getOperand(5), // soffset
5803 Offsets.second, // offset
5804 Op.getOperand(6), // cachepolicy
5805 DAG.getConstant(0, DL, MVT::i1), // idxen
5806 };
5807 EVT VT = Op.getValueType();
Marek Olsak5cec6412017-11-09 01:52:48 +00005808
Tim Renouf4f703f52018-08-21 11:07:10 +00005809 auto *M = cast<MemSDNode>(Op);
5810 unsigned Opcode = 0;
5811
5812 switch (IntrID) {
5813 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5814 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5815 break;
5816 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5817 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5818 break;
5819 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5820 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5821 break;
5822 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5823 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5824 break;
5825 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5826 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5827 break;
5828 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5829 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5830 break;
5831 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5832 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5833 break;
5834 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5835 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5836 break;
5837 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5838 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5839 break;
5840 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5841 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5842 break;
5843 default:
5844 llvm_unreachable("unhandled atomic opcode");
5845 }
5846
5847 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5848 M->getMemOperand());
5849 }
5850 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5851 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5852 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5853 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5854 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5855 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5856 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5857 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5858 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5859 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5860 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5861 SDValue Ops[] = {
5862 Op.getOperand(0), // Chain
5863 Op.getOperand(2), // vdata
5864 Op.getOperand(3), // rsrc
5865 Op.getOperand(4), // vindex
5866 Offsets.first, // voffset
5867 Op.getOperand(6), // soffset
5868 Offsets.second, // offset
5869 Op.getOperand(7), // cachepolicy
5870 DAG.getConstant(1, DL, MVT::i1), // idxen
5871 };
5872 EVT VT = Op.getValueType();
5873
5874 auto *M = cast<MemSDNode>(Op);
5875 unsigned Opcode = 0;
5876
5877 switch (IntrID) {
5878 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5879 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5880 break;
5881 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5882 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5883 break;
5884 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5885 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5886 break;
5887 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5888 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5889 break;
5890 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5891 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5892 break;
5893 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5894 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5895 break;
5896 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5897 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5898 break;
5899 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5900 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5901 break;
5902 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5903 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5904 break;
5905 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5906 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5907 break;
5908 default:
5909 llvm_unreachable("unhandled atomic opcode");
5910 }
5911
5912 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5913 M->getMemOperand());
5914 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005915 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005916 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5917 unsigned IdxEn = 1;
5918 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5919 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005920 SDValue Ops[] = {
5921 Op.getOperand(0), // Chain
5922 Op.getOperand(2), // src
5923 Op.getOperand(3), // cmp
5924 Op.getOperand(4), // rsrc
5925 Op.getOperand(5), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005926 SDValue(), // voffset -- will be set by setBufferOffsets
5927 SDValue(), // soffset -- will be set by setBufferOffsets
5928 SDValue(), // offset -- will be set by setBufferOffsets
5929 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5930 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5931 };
5932 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5933 EVT VT = Op.getValueType();
5934 auto *M = cast<MemSDNode>(Op);
5935
5936 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5937 Op->getVTList(), Ops, VT, M->getMemOperand());
5938 }
5939 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5940 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5941 SDValue Ops[] = {
5942 Op.getOperand(0), // Chain
5943 Op.getOperand(2), // src
5944 Op.getOperand(3), // cmp
5945 Op.getOperand(4), // rsrc
5946 DAG.getConstant(0, DL, MVT::i32), // vindex
5947 Offsets.first, // voffset
5948 Op.getOperand(6), // soffset
5949 Offsets.second, // offset
5950 Op.getOperand(7), // cachepolicy
5951 DAG.getConstant(0, DL, MVT::i1), // idxen
5952 };
5953 EVT VT = Op.getValueType();
5954 auto *M = cast<MemSDNode>(Op);
5955
5956 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5957 Op->getVTList(), Ops, VT, M->getMemOperand());
5958 }
5959 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5960 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5961 SDValue Ops[] = {
5962 Op.getOperand(0), // Chain
5963 Op.getOperand(2), // src
5964 Op.getOperand(3), // cmp
5965 Op.getOperand(4), // rsrc
5966 Op.getOperand(5), // vindex
5967 Offsets.first, // voffset
5968 Op.getOperand(7), // soffset
5969 Offsets.second, // offset
5970 Op.getOperand(8), // cachepolicy
5971 DAG.getConstant(1, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005972 };
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005973 EVT VT = Op.getValueType();
5974 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005975
5976 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005977 Op->getVTList(), Ops, VT, M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005978 }
5979
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005980 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005981 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5982 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5983 return lowerImage(Op, ImageDimIntr, DAG);
Matt Arsenault1349a042018-05-22 06:32:10 +00005984
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005985 return SDValue();
5986 }
5987}
5988
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005989SDValue SITargetLowering::handleD16VData(SDValue VData,
5990 SelectionDAG &DAG) const {
5991 EVT StoreVT = VData.getValueType();
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005992
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005993 // No change for f16 and legal vector D16 types.
Matt Arsenault1349a042018-05-22 06:32:10 +00005994 if (!StoreVT.isVector())
5995 return VData;
5996
5997 SDLoc DL(VData);
5998 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5999
6000 if (Subtarget->hasUnpackedD16VMem()) {
6001 // We need to unpack the packed data to store.
6002 EVT IntStoreVT = StoreVT.changeTypeToInteger();
6003 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6004
6005 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6006 StoreVT.getVectorNumElements());
6007 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6008 return DAG.UnrollVectorOp(ZExt.getNode());
6009 }
6010
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006011 assert(isTypeLegal(StoreVT));
6012 return VData;
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006013}
6014
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006015SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6016 SelectionDAG &DAG) const {
Tom Stellardfc92e772015-05-12 14:18:14 +00006017 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006018 SDValue Chain = Op.getOperand(0);
6019 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
David Stuttard70e8bc12017-06-22 16:29:22 +00006020 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006021
6022 switch (IntrinsicID) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00006023 case Intrinsic::amdgcn_exp: {
Matt Arsenault4165efd2017-01-17 07:26:53 +00006024 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6025 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6026 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6027 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6028
6029 const SDValue Ops[] = {
6030 Chain,
6031 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6032 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6033 Op.getOperand(4), // src0
6034 Op.getOperand(5), // src1
6035 Op.getOperand(6), // src2
6036 Op.getOperand(7), // src3
6037 DAG.getTargetConstant(0, DL, MVT::i1), // compr
6038 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6039 };
6040
6041 unsigned Opc = Done->isNullValue() ?
6042 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6043 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6044 }
6045 case Intrinsic::amdgcn_exp_compr: {
6046 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6047 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6048 SDValue Src0 = Op.getOperand(4);
6049 SDValue Src1 = Op.getOperand(5);
6050 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6051 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6052
6053 SDValue Undef = DAG.getUNDEF(MVT::f32);
6054 const SDValue Ops[] = {
6055 Chain,
6056 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6057 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6058 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6059 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6060 Undef, // src2
6061 Undef, // src3
6062 DAG.getTargetConstant(1, DL, MVT::i1), // compr
6063 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6064 };
6065
6066 unsigned Opc = Done->isNullValue() ?
6067 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6068 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6069 }
6070 case Intrinsic::amdgcn_s_sendmsg:
Matt Arsenaultd3e5cb72017-02-16 02:01:17 +00006071 case Intrinsic::amdgcn_s_sendmsghalt: {
6072 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6073 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Tom Stellardfc92e772015-05-12 14:18:14 +00006074 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6075 SDValue Glue = Chain.getValue(1);
Matt Arsenaulta78ca622017-02-15 22:17:09 +00006076 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Jan Veselyd48445d2017-01-04 18:06:55 +00006077 Op.getOperand(2), Glue);
6078 }
Marek Olsak2d825902017-04-28 20:21:58 +00006079 case Intrinsic::amdgcn_init_exec: {
6080 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6081 Op.getOperand(2));
6082 }
6083 case Intrinsic::amdgcn_init_exec_from_input: {
6084 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6085 Op.getOperand(2), Op.getOperand(3));
6086 }
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006087 case Intrinsic::amdgcn_s_barrier: {
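      // When the workgroup is known to fit within a single wave (and we are
      // optimizing), the s_barrier is not needed for cross-wave synchronization,
      // so it is relaxed to WAVE_BARRIER, which only constrains code motion.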
6088 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00006089 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matthias Braunf1caa282017-12-15 22:22:58 +00006090 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006091 if (WGSize <= ST.getWavefrontSize())
6092 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6093 Op.getOperand(0)), 0);
6094 }
6095 return SDValue();
6096  }
David Stuttard70e8bc12017-06-22 16:29:22 +00006097 case Intrinsic::amdgcn_tbuffer_store: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006098 SDValue VData = Op.getOperand(2);
6099 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6100 if (IsD16)
6101 VData = handleD16VData(VData, DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006102 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6103 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6104 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6105 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6106 unsigned IdxEn = 1;
6107 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6108 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00006109 SDValue Ops[] = {
6110 Chain,
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006111 VData, // vdata
David Stuttard70e8bc12017-06-22 16:29:22 +00006112 Op.getOperand(3), // rsrc
6113 Op.getOperand(4), // vindex
6114 Op.getOperand(5), // voffset
6115 Op.getOperand(6), // soffset
6116 Op.getOperand(7), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006117 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6118 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6119      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6120 };
6121 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6122 AMDGPUISD::TBUFFER_STORE_FORMAT;
6123 MemSDNode *M = cast<MemSDNode>(Op);
6124 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6125 M->getMemoryVT(), M->getMemOperand());
6126 }
6127
6128 case Intrinsic::amdgcn_struct_tbuffer_store: {
6129 SDValue VData = Op.getOperand(2);
6130 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6131 if (IsD16)
6132 VData = handleD16VData(VData, DAG);
6133 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6134 SDValue Ops[] = {
6135 Chain,
6136 VData, // vdata
6137 Op.getOperand(3), // rsrc
6138 Op.getOperand(4), // vindex
6139 Offsets.first, // voffset
6140 Op.getOperand(6), // soffset
6141 Offsets.second, // offset
6142 Op.getOperand(7), // format
6143 Op.getOperand(8), // cachepolicy
6144      DAG.getConstant(1, DL, MVT::i1), // idxen
6145 };
6146 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6147 AMDGPUISD::TBUFFER_STORE_FORMAT;
6148 MemSDNode *M = cast<MemSDNode>(Op);
6149 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6150 M->getMemoryVT(), M->getMemOperand());
6151 }
6152
6153 case Intrinsic::amdgcn_raw_tbuffer_store: {
6154 SDValue VData = Op.getOperand(2);
6155 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6156 if (IsD16)
6157 VData = handleD16VData(VData, DAG);
6158 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6159 SDValue Ops[] = {
6160 Chain,
6161 VData, // vdata
6162 Op.getOperand(3), // rsrc
6163 DAG.getConstant(0, DL, MVT::i32), // vindex
6164 Offsets.first, // voffset
6165 Op.getOperand(5), // soffset
6166 Offsets.second, // offset
6167 Op.getOperand(6), // format
6168 Op.getOperand(7), // cachepolicy
6169      DAG.getConstant(0, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006170 };
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006171 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6172 AMDGPUISD::TBUFFER_STORE_FORMAT;
6173 MemSDNode *M = cast<MemSDNode>(Op);
6174 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6175 M->getMemoryVT(), M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00006176 }
6177
Marek Olsak5cec6412017-11-09 01:52:48 +00006178 case Intrinsic::amdgcn_buffer_store:
6179 case Intrinsic::amdgcn_buffer_store_format: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006180 SDValue VData = Op.getOperand(2);
6181 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6182 if (IsD16)
6183 VData = handleD16VData(VData, DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00006184 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6185 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6186 unsigned IdxEn = 1;
6187 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6188 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006189 SDValue Ops[] = {
6190 Chain,
Tim Renouf4f703f52018-08-21 11:07:10 +00006191 VData,
Marek Olsak5cec6412017-11-09 01:52:48 +00006192 Op.getOperand(3), // rsrc
6193 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006194 SDValue(), // voffset -- will be set by setBufferOffsets
6195 SDValue(), // soffset -- will be set by setBufferOffsets
6196 SDValue(), // offset -- will be set by setBufferOffsets
6197 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6198 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006199 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006200 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006201 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6202 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6203 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6204 MemSDNode *M = cast<MemSDNode>(Op);
6205 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6206 M->getMemoryVT(), M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006207 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006208
6209 case Intrinsic::amdgcn_raw_buffer_store:
6210 case Intrinsic::amdgcn_raw_buffer_store_format: {
6211 SDValue VData = Op.getOperand(2);
6212 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6213 if (IsD16)
6214 VData = handleD16VData(VData, DAG);
6215 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6216 SDValue Ops[] = {
6217 Chain,
6218 VData,
6219 Op.getOperand(3), // rsrc
6220 DAG.getConstant(0, DL, MVT::i32), // vindex
6221 Offsets.first, // voffset
6222 Op.getOperand(5), // soffset
6223 Offsets.second, // offset
6224 Op.getOperand(6), // cachepolicy
6225 DAG.getConstant(0, DL, MVT::i1), // idxen
6226 };
6227 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6228 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6229 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6230 MemSDNode *M = cast<MemSDNode>(Op);
6231 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6232 M->getMemoryVT(), M->getMemOperand());
6233 }
6234
6235 case Intrinsic::amdgcn_struct_buffer_store:
6236 case Intrinsic::amdgcn_struct_buffer_store_format: {
6237 SDValue VData = Op.getOperand(2);
6238 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6239 if (IsD16)
6240 VData = handleD16VData(VData, DAG);
6241 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6242 SDValue Ops[] = {
6243 Chain,
6244 VData,
6245 Op.getOperand(3), // rsrc
6246 Op.getOperand(4), // vindex
6247 Offsets.first, // voffset
6248 Op.getOperand(6), // soffset
6249 Offsets.second, // offset
6250 Op.getOperand(7), // cachepolicy
6251 DAG.getConstant(1, DL, MVT::i1), // idxen
6252 };
6253 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6254 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6255 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6256 MemSDNode *M = cast<MemSDNode>(Op);
6257 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6258 M->getMemoryVT(), M->getMemOperand());
6259 }
6260
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006261 default: {
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006262 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6263 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6264 return lowerImage(Op, ImageDimIntr, DAG);
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006265
Matt Arsenault754dd3e2017-04-03 18:08:08 +00006266 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006267 }
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006268 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006269}
6270
Tim Renouf4f703f52018-08-21 11:07:10 +00006271// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6272// offset (the offset that is included in bounds checking and swizzling, to be
6273// split between the instruction's voffset and immoffset fields) and soffset
6274// (the offset that is excluded from bounds checking and swizzling, to go in
6275// the instruction's soffset field). This function takes the first kind of
6276// offset and figures out how to split it between voffset and immoffset.
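// For example (a sketch using the 4095-byte immediate limit assumed below): a
// constant combined offset of 5000 is returned as voffset = 4096 and
// immoffset = 904, so the 4096 copy/add can be CSEd with neighbouring accesses.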
Tim Renouf35484c92018-08-21 11:06:05 +00006277std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6278 SDValue Offset, SelectionDAG &DAG) const {
6279 SDLoc DL(Offset);
6280 const unsigned MaxImm = 4095;
6281 SDValue N0 = Offset;
6282 ConstantSDNode *C1 = nullptr;
Piotr Sobczak378131b2019-01-02 09:47:41 +00006283
6284 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
Tim Renouf35484c92018-08-21 11:06:05 +00006285 N0 = SDValue();
Piotr Sobczak378131b2019-01-02 09:47:41 +00006286 else if (DAG.isBaseWithConstantOffset(N0)) {
6287 C1 = cast<ConstantSDNode>(N0.getOperand(1));
6288 N0 = N0.getOperand(0);
6289 }
Tim Renouf35484c92018-08-21 11:06:05 +00006290
6291 if (C1) {
6292 unsigned ImmOffset = C1->getZExtValue();
6293    // If the immediate value is too big for the immoffset field, keep only the
Tim Renoufa37679d2018-10-03 10:29:43 +00006294    // low part (value & 4095) in the immoffset field, so that the value that is
Tim Renouf35484c92018-08-21 11:06:05 +00006295    // copied/added for the voffset field is a multiple of 4096, and it stands
6296    // more chance of being CSEd with the copy/add for another similar load/store.
Tim Renoufa37679d2018-10-03 10:29:43 +00006297 // However, do not do that rounding down to a multiple of 4096 if that is a
6298 // negative number, as it appears to be illegal to have a negative offset
6299 // in the vgpr, even if adding the immediate offset makes it positive.
Tim Renouf35484c92018-08-21 11:06:05 +00006300 unsigned Overflow = ImmOffset & ~MaxImm;
6301 ImmOffset -= Overflow;
Tim Renoufa37679d2018-10-03 10:29:43 +00006302 if ((int32_t)Overflow < 0) {
6303 Overflow += ImmOffset;
6304 ImmOffset = 0;
6305 }
Tim Renouf35484c92018-08-21 11:06:05 +00006306 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6307 if (Overflow) {
6308 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6309 if (!N0)
6310 N0 = OverflowVal;
6311 else {
6312 SDValue Ops[] = { N0, OverflowVal };
6313 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6314 }
6315 }
6316 }
6317 if (!N0)
6318 N0 = DAG.getConstant(0, DL, MVT::i32);
6319 if (!C1)
6320 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6321 return {N0, SDValue(C1, 0)};
6322}
6323
Tim Renouf4f703f52018-08-21 11:07:10 +00006324// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6325// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6326// pointed to by Offsets.
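// Roughly: a constant offset accepted by splitMUBUFOffset becomes
// {0, soffset, instoffset}, a base-plus-constant becomes
// {base, soffset, instoffset}, and anything else is passed through unsplit as
// {offset, 0, 0}.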
6327void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006328 SelectionDAG &DAG, SDValue *Offsets,
6329 unsigned Align) const {
Tim Renouf4f703f52018-08-21 11:07:10 +00006330 SDLoc DL(CombinedOffset);
6331 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6332 uint32_t Imm = C->getZExtValue();
6333 uint32_t SOffset, ImmOffset;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006334 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006335 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6336 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6337 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6338 return;
6339 }
6340 }
6341 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6342 SDValue N0 = CombinedOffset.getOperand(0);
6343 SDValue N1 = CombinedOffset.getOperand(1);
6344 uint32_t SOffset, ImmOffset;
6345 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006346 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6347 Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006348 Offsets[0] = N0;
6349 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6350 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6351 return;
6352 }
6353 }
6354 Offsets[0] = CombinedOffset;
6355 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6356 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6357}
6358
Matt Arsenault90083d32018-06-07 09:54:49 +00006359static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6360 ISD::LoadExtType ExtType, SDValue Op,
6361 const SDLoc &SL, EVT VT) {
6362 if (VT.bitsLT(Op.getValueType()))
6363 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6364
6365 switch (ExtType) {
6366 case ISD::SEXTLOAD:
6367 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6368 case ISD::ZEXTLOAD:
6369 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6370 case ISD::EXTLOAD:
6371 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6372 case ISD::NON_EXTLOAD:
6373 return Op;
6374 }
6375
6376 llvm_unreachable("invalid ext type");
6377}
6378
6379SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6380 SelectionDAG &DAG = DCI.DAG;
6381 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6382 return SDValue();
6383
6384 // FIXME: Constant loads should all be marked invariant.
6385 unsigned AS = Ld->getAddressSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00006386 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6387 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Matt Arsenault90083d32018-06-07 09:54:49 +00006388 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6389 return SDValue();
6390
6391 // Don't do this early, since it may interfere with adjacent load merging for
6392 // illegal types. We can avoid losing alignment information for exotic types
6393 // pre-legalize.
6394 EVT MemVT = Ld->getMemoryVT();
6395 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6396 MemVT.getSizeInBits() >= 32)
6397 return SDValue();
6398
6399 SDLoc SL(Ld);
6400
6401 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6402 "unexpected vector extload");
6403
6404 // TODO: Drop only high part of range.
6405 SDValue Ptr = Ld->getBasePtr();
6406 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6407 MVT::i32, SL, Ld->getChain(), Ptr,
6408 Ld->getOffset(),
6409 Ld->getPointerInfo(), MVT::i32,
6410 Ld->getAlignment(),
6411 Ld->getMemOperand()->getFlags(),
6412 Ld->getAAInfo(),
6413 nullptr); // Drop ranges
6414
6415 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6416 if (MemVT.isFloatingPoint()) {
6417 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6418 "unexpected fp extload");
6419 TruncVT = MemVT.changeTypeToInteger();
6420 }
6421
6422 SDValue Cvt = NewLoad;
6423 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6424 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6425 DAG.getValueType(TruncVT));
6426 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6427 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6428 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6429 } else {
6430 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6431 }
6432
6433 EVT VT = Ld->getValueType(0);
6434 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6435
6436 DCI.AddToWorklist(Cvt.getNode());
6437
6438 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6439 // the appropriate extension from the 32-bit load.
6440 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6441 DCI.AddToWorklist(Cvt.getNode());
6442
6443 // Handle conversion back to floating point if necessary.
6444 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6445
6446 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6447}
6448
Tom Stellard81d871d2013-11-13 23:36:50 +00006449SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6450 SDLoc DL(Op);
6451 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006452 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00006453 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00006454
Matt Arsenaulta1436412016-02-10 18:21:45 +00006455 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
Matt Arsenault65ca292a2017-09-07 05:37:34 +00006456 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6457 return SDValue();
6458
Matt Arsenault6dfda962016-02-10 18:21:39 +00006459 // FIXME: Copied from PPC
6460 // First, load into 32 bits, then truncate to 1 bit.
6461    // First, load into 32 bits, then truncate to the in-memory type (1 or 16 bits).
6462 SDValue Chain = Load->getChain();
6463 SDValue BasePtr = Load->getBasePtr();
6464 MachineMemOperand *MMO = Load->getMemOperand();
6465
Tom Stellard115a6152016-11-10 16:02:37 +00006466 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6467
Matt Arsenault6dfda962016-02-10 18:21:39 +00006468 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
Tom Stellard115a6152016-11-10 16:02:37 +00006469 BasePtr, RealMemVT, MMO);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006470
6471 SDValue Ops[] = {
Matt Arsenaulta1436412016-02-10 18:21:45 +00006472 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
Matt Arsenault6dfda962016-02-10 18:21:39 +00006473 NewLD.getValue(1)
6474 };
6475
6476 return DAG.getMergeValues(Ops, DL);
6477 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006478
Matt Arsenaulta1436412016-02-10 18:21:45 +00006479 if (!MemVT.isVector())
6480 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006481
Matt Arsenaulta1436412016-02-10 18:21:45 +00006482 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6483 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006484
Farhana Aleen89196642018-03-07 17:09:18 +00006485 unsigned Alignment = Load->getAlignment();
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006486 unsigned AS = Load->getAddressSpace();
6487 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
Farhana Aleen89196642018-03-07 17:09:18 +00006488 AS, Alignment)) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006489 SDValue Ops[2];
6490 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6491 return DAG.getMergeValues(Ops, DL);
6492 }
6493
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006494 MachineFunction &MF = DAG.getMachineFunction();
6495 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6496  // If there is a possibility that flat instructions access scratch memory
6497 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006498 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006499 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006500 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006501
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006502 unsigned NumElements = MemVT.getVectorNumElements();
Matt Arsenault6c041a32018-03-29 19:59:28 +00006503
Matt Arsenault0da63502018-08-31 05:49:54 +00006504 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6505 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +00006506 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006507 return SDValue();
6508 // Non-uniform loads will be selected to MUBUF instructions, so they
Alexander Timofeev18009562016-12-08 17:28:47 +00006509 // have the same legalization requirements as global and private
Matt Arsenaulta1436412016-02-10 18:21:45 +00006510 // loads.
6511 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006512 }
Matt Arsenault6c041a32018-03-29 19:59:28 +00006513
Matt Arsenault0da63502018-08-31 05:49:54 +00006514 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6515 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6516 AS == AMDGPUAS::GLOBAL_ADDRESS) {
Alexander Timofeev2e5eece2018-03-05 15:12:21 +00006517 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Farhana Aleen89196642018-03-07 17:09:18 +00006518 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +00006519 Alignment >= 4 && NumElements < 32)
Alexander Timofeev18009562016-12-08 17:28:47 +00006520 return SDValue();
6521 // Non-uniform loads will be selected to MUBUF instructions, so they
6522 // have the same legalization requirements as global and private
6523 // loads.
6524 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006525 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006526 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6527 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6528 AS == AMDGPUAS::GLOBAL_ADDRESS ||
6529 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006530 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006531 return SplitVectorLoad(Op, DAG);
6532 // v4 loads are supported for private and global memory.
6533 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006534 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006535 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006536 // Depending on the setting of the private_element_size field in the
6537 // resource descriptor, we can only make private accesses up to a certain
6538 // size.
6539 switch (Subtarget->getMaxPrivateElementSize()) {
6540 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006541 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006542 case 8:
6543 if (NumElements > 2)
6544 return SplitVectorLoad(Op, DAG);
6545 return SDValue();
6546 case 16:
6547 // Same as global/flat
6548 if (NumElements > 4)
6549 return SplitVectorLoad(Op, DAG);
6550 return SDValue();
6551 default:
6552 llvm_unreachable("unsupported private_element_size");
6553 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006554 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleena7cb3112018-03-09 17:41:39 +00006555 // Use ds_read_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006556 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
Farhana Aleena7cb3112018-03-09 17:41:39 +00006557 MemVT.getStoreSize() == 16)
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006558 return SDValue();
6559
Farhana Aleena7cb3112018-03-09 17:41:39 +00006560 if (NumElements > 2)
6561 return SplitVectorLoad(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006562
6563    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6564 // address is negative, then the instruction is incorrectly treated as
6565 // out-of-bounds even if base + offsets is in bounds. Split vectorized
6566 // loads here to avoid emitting ds_read2_b32. We may re-combine the
6567 // load later in the SILoadStoreOptimizer.
6568 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6569 NumElements == 2 && MemVT.getStoreSize() == 8 &&
6570 Load->getAlignment() < 8) {
6571 return SplitVectorLoad(Op, DAG);
6572 }
Tom Stellarde9373602014-01-22 19:24:14 +00006573 }
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006574 return SDValue();
Tom Stellard81d871d2013-11-13 23:36:50 +00006575}
6576
Tom Stellard0ec134f2014-02-04 17:18:40 +00006577SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006578 EVT VT = Op.getValueType();
6579 assert(VT.getSizeInBits() == 64);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006580
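  // A 64-bit select is lowered as two 32-bit selects on the low and high
  // halves of the operands; the halves are then reassembled via BUILD_VECTOR
  // and a bitcast back to the original type.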
6581 SDLoc DL(Op);
6582 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006583
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006584 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6585 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006586
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006587 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6588 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6589
6590 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6591 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006592
6593 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6594
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006595 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6596 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006597
6598 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6599
Ahmed Bougacha128f8732016-04-26 21:15:30 +00006600 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006601 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006602}
6603
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006604// Catch division cases where we can use shortcuts with rcp and rsq
6605// instructions.
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006606SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6607 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006608 SDLoc SL(Op);
6609 SDValue LHS = Op.getOperand(0);
6610 SDValue RHS = Op.getOperand(1);
6611 EVT VT = Op.getValueType();
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006612 const SDNodeFlags Flags = Op->getFlags();
Michael Berg7acc81b2018-05-04 18:48:20 +00006613 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006614
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006615 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6616 return SDValue();
6617
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006618 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006619 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
Matt Arsenault979902b2016-08-02 22:25:04 +00006620 if (CLHS->isExactlyValue(1.0)) {
6621 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6622        // the CI documentation have a worst-case error of 1 ulp.
6623 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6624 // use it as long as we aren't trying to use denormals.
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006625 //
6626 // v_rcp_f16 and v_rsq_f16 DO support denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006627
Matt Arsenault979902b2016-08-02 22:25:04 +00006628 // 1.0 / sqrt(x) -> rsq(x)
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006629
Matt Arsenault979902b2016-08-02 22:25:04 +00006630 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6631 // error seems really high at 2^29 ULP.
6632 if (RHS.getOpcode() == ISD::FSQRT)
6633 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6634
6635 // 1.0 / x -> rcp(x)
6636 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6637 }
6638
6639 // Same as for 1.0, but expand the sign out of the constant.
6640 if (CLHS->isExactlyValue(-1.0)) {
6641 // -1.0 / x -> rcp (fneg x)
6642 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6643 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6644 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006645 }
6646 }
6647
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006648 if (Unsafe) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006649 // Turn into multiply by the reciprocal.
6650 // x / y -> x * (1.0 / y)
6651 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006652 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006653 }
6654
6655 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006656}
6657
Tom Stellard8485fa02016-12-07 02:42:15 +00006658static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6659 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6660 if (GlueChain->getNumValues() <= 1) {
6661 return DAG.getNode(Opcode, SL, VT, A, B);
6662 }
6663
6664 assert(GlueChain->getNumValues() == 3);
6665
6666 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6667 switch (Opcode) {
6668 default: llvm_unreachable("no chain equivalent for opcode");
6669 case ISD::FMUL:
6670 Opcode = AMDGPUISD::FMUL_W_CHAIN;
6671 break;
6672 }
6673
6674 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6675 GlueChain.getValue(2));
6676}
6677
6678static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6679 EVT VT, SDValue A, SDValue B, SDValue C,
6680 SDValue GlueChain) {
6681 if (GlueChain->getNumValues() <= 1) {
6682 return DAG.getNode(Opcode, SL, VT, A, B, C);
6683 }
6684
6685 assert(GlueChain->getNumValues() == 3);
6686
6687 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6688 switch (Opcode) {
6689 default: llvm_unreachable("no chain equivalent for opcode");
6690 case ISD::FMA:
6691 Opcode = AMDGPUISD::FMA_W_CHAIN;
6692 break;
6693 }
6694
6695 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6696 GlueChain.getValue(2));
6697}
6698
Matt Arsenault4052a572016-12-22 03:05:41 +00006699SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006700 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6701 return FastLowered;
6702
Matt Arsenault4052a572016-12-22 03:05:41 +00006703 SDLoc SL(Op);
6704 SDValue Src0 = Op.getOperand(0);
6705 SDValue Src1 = Op.getOperand(1);
6706
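  // Sketch of the path below: extend both f16 operands to f32, form
  // src0 * rcp(src1) there, round the quotient back to f16, and let DIV_FIXUP
  // patch up the special cases.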
6707 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6708 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6709
6710 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6711 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6712
6713 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6714 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6715
6716 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6717}
6718
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006719// Faster 2.5 ULP division that does not support denormals.
6720SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6721 SDLoc SL(Op);
6722 SDValue LHS = Op.getOperand(1);
6723 SDValue RHS = Op.getOperand(2);
6724
6725 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6726
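  // BitsToFloat(0x6f800000) is 2^96 and BitsToFloat(0x2f800000) is 2^-32.
  // When |RHS| exceeds 2^96 the denominator is pre-scaled by 2^-32 (and the
  // quotient rescaled by the same r3 factor below), keeping the rcp input in
  // range.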
6727 const APFloat K0Val(BitsToFloat(0x6f800000));
6728 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6729
6730 const APFloat K1Val(BitsToFloat(0x2f800000));
6731 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6732
6733 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6734
6735 EVT SetCCVT =
6736 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6737
6738 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6739
6740 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6741
6742 // TODO: Should this propagate fast-math-flags?
6743 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6744
6745 // rcp does not support denormals.
6746 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6747
6748 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6749
6750 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6751}
6752
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006753SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006754 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00006755 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006756
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006757 SDLoc SL(Op);
6758 SDValue LHS = Op.getOperand(0);
6759 SDValue RHS = Op.getOperand(1);
6760
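  // Roughly, the f32 division expansion: scale the operands with DIV_SCALE,
  // take an RCP estimate of the denominator, refine the reciprocal and the
  // quotient with the FMA steps below (toggling the denormal mode via SETREG
  // when f32 denormals are off), then let DIV_FMAS / DIV_FIXUP apply the scale
  // and fix up special cases.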
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006761 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006762
Wei Dinged0f97f2016-06-09 19:17:15 +00006763 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006764
Tom Stellard8485fa02016-12-07 02:42:15 +00006765 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6766 RHS, RHS, LHS);
6767 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6768 LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006769
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00006770 // Denominator is scaled to not be denormal, so using rcp is ok.
Tom Stellard8485fa02016-12-07 02:42:15 +00006771 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6772 DenominatorScaled);
6773 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6774 DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006775
Tom Stellard8485fa02016-12-07 02:42:15 +00006776 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6777 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
6778 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006779
Tom Stellard8485fa02016-12-07 02:42:15 +00006780 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006781
Tom Stellard8485fa02016-12-07 02:42:15 +00006782 if (!Subtarget->hasFP32Denormals()) {
6783 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6784 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6785 SL, MVT::i32);
6786 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6787 DAG.getEntryNode(),
6788 EnableDenormValue, BitField);
6789 SDValue Ops[3] = {
6790 NegDivScale0,
6791 EnableDenorm.getValue(0),
6792 EnableDenorm.getValue(1)
6793 };
Matt Arsenault37fefd62016-06-10 02:18:02 +00006794
Tom Stellard8485fa02016-12-07 02:42:15 +00006795 NegDivScale0 = DAG.getMergeValues(Ops, SL);
6796 }
6797
6798 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6799 ApproxRcp, One, NegDivScale0);
6800
6801 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6802 ApproxRcp, Fma0);
6803
6804 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6805 Fma1, Fma1);
6806
6807 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6808 NumeratorScaled, Mul);
6809
6810 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6811
6812 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6813 NumeratorScaled, Fma3);
6814
6815 if (!Subtarget->hasFP32Denormals()) {
6816 const SDValue DisableDenormValue =
6817 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6818 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6819 Fma4.getValue(1),
6820 DisableDenormValue,
6821 BitField,
6822 Fma4.getValue(2));
6823
6824 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6825 DisableDenorm, DAG.getRoot());
6826 DAG.setRoot(OutputChain);
6827 }
Matt Arsenault37fefd62016-06-10 02:18:02 +00006828
Wei Dinged0f97f2016-06-09 19:17:15 +00006829 SDValue Scale = NumeratorScaled.getValue(1);
Tom Stellard8485fa02016-12-07 02:42:15 +00006830 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6831 Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006832
Wei Dinged0f97f2016-06-09 19:17:15 +00006833 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006834}
6835
6836SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006837 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006838 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006839
6840 SDLoc SL(Op);
6841 SDValue X = Op.getOperand(0);
6842 SDValue Y = Op.getOperand(1);
6843
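  // The f64 path mirrors the f32 expansion: DIV_SCALE, an RCP estimate,
  // FMA-based refinement, then DIV_FMAS / DIV_FIXUP. The block guarded by the
  // SOUTHERN_ISLANDS check below works around the unusable DIV_SCALE condition
  // output on SI.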
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006844 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006845
6846 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6847
6848 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6849
6850 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6851
6852 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6853
6854 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6855
6856 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6857
6858 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6859
6860 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6861
6862 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6863 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6864
6865 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6866 NegDivScale0, Mul, DivScale1);
6867
6868 SDValue Scale;
6869
Tom Stellard5bfbae52018-07-11 20:59:01 +00006870 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006871 // Workaround a hardware bug on SI where the condition output from div_scale
6872 // is not usable.
6873
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006874 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006875
6876    // Figure out which scale to use for div_fmas.
6877 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6878 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6879 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6880 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6881
6882 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6883 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6884
6885 SDValue Scale0Hi
6886 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6887 SDValue Scale1Hi
6888 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6889
6890 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6891 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6892 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6893 } else {
6894 Scale = DivScale1.getValue(1);
6895 }
6896
6897 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6898 Fma4, Fma3, Mul, Scale);
6899
6900 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006901}
6902
6903SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6904 EVT VT = Op.getValueType();
6905
6906 if (VT == MVT::f32)
6907 return LowerFDIV32(Op, DAG);
6908
6909 if (VT == MVT::f64)
6910 return LowerFDIV64(Op, DAG);
6911
Matt Arsenault4052a572016-12-22 03:05:41 +00006912 if (VT == MVT::f16)
6913 return LowerFDIV16(Op, DAG);
6914
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006915 llvm_unreachable("Unexpected type for fdiv");
6916}
6917
Tom Stellard81d871d2013-11-13 23:36:50 +00006918SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6919 SDLoc DL(Op);
6920 StoreSDNode *Store = cast<StoreSDNode>(Op);
6921 EVT VT = Store->getMemoryVT();
6922
Matt Arsenault95245662016-02-11 05:32:46 +00006923 if (VT == MVT::i1) {
6924 return DAG.getTruncStore(Store->getChain(), DL,
6925 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6926 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00006927 }
6928
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006929 assert(VT.isVector() &&
6930 Store->getValue().getValueType().getScalarType() == MVT::i32);
6931
6932 unsigned AS = Store->getAddressSpace();
6933 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6934 AS, Store->getAlignment())) {
6935 return expandUnalignedStore(Store, DAG);
6936 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006937
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006938 MachineFunction &MF = DAG.getMachineFunction();
6939 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6940  // If there is a possibility that flat instructions access scratch memory
6941 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006942 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006943 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006944 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006945
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006946 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenault0da63502018-08-31 05:49:54 +00006947 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6948 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006949 if (NumElements > 4)
6950 return SplitVectorStore(Op, DAG);
6951 return SDValue();
Matt Arsenault0da63502018-08-31 05:49:54 +00006952 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006953 switch (Subtarget->getMaxPrivateElementSize()) {
6954 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006955 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006956 case 8:
6957 if (NumElements > 2)
6958 return SplitVectorStore(Op, DAG);
6959 return SDValue();
6960 case 16:
6961 if (NumElements > 4)
6962 return SplitVectorStore(Op, DAG);
6963 return SDValue();
6964 default:
6965 llvm_unreachable("unsupported private_element_size");
6966 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006967 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006968 // Use ds_write_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006969 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006970 VT.getStoreSize() == 16)
6971 return SDValue();
6972
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006973 if (NumElements > 2)
6974 return SplitVectorStore(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006975
6976    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6977    // address is negative, then the instruction is incorrectly treated as
6978    // out-of-bounds even if base + offset is in bounds. Split vectorized
6979 // stores here to avoid emitting ds_write2_b32. We may re-combine the
6980 // store later in the SILoadStoreOptimizer.
6981 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6982 NumElements == 2 && VT.getStoreSize() == 8 &&
6983 Store->getAlignment() < 8) {
6984 return SplitVectorStore(Op, DAG);
6985 }
6986
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006987 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006988 } else {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006989 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00006990 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006991}
6992
Matt Arsenaultad14ce82014-07-19 18:44:39 +00006993SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006994 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00006995 EVT VT = Op.getValueType();
6996 SDValue Arg = Op.getOperand(0);
David Stuttard20de3e92018-09-14 10:27:19 +00006997 SDValue TrigVal;
6998
Sanjay Patela2607012015-09-16 16:31:21 +00006999 // TODO: Should this propagate fast-math-flags?
David Stuttard20de3e92018-09-14 10:27:19 +00007000
7001 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7002
7003 if (Subtarget->hasTrigReducedRange()) {
7004 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7005 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7006 } else {
7007 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7008 }
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007009
7010 switch (Op.getOpcode()) {
7011 case ISD::FCOS:
David Stuttard20de3e92018-09-14 10:27:19 +00007012 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007013 case ISD::FSIN:
David Stuttard20de3e92018-09-14 10:27:19 +00007014 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007015 default:
7016 llvm_unreachable("Wrong trig opcode");
7017 }
7018}
7019
Tom Stellard354a43c2016-04-01 18:27:37 +00007020SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7021 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7022 assert(AtomicNode->isCompareAndSwap());
7023 unsigned AS = AtomicNode->getAddressSpace();
7024
7025 // No custom lowering required for local address space
Matt Arsenault0da63502018-08-31 05:49:54 +00007026 if (!isFlatGlobalAddrSpace(AS))
Tom Stellard354a43c2016-04-01 18:27:37 +00007027 return Op;
7028
7029  // Non-local address spaces require custom lowering for atomic compare and
7030  // swap; the compare and swap values are packed into a v2i32 (v2i64 for _X2).
7031 SDLoc DL(Op);
7032 SDValue ChainIn = Op.getOperand(0);
7033 SDValue Addr = Op.getOperand(1);
7034 SDValue Old = Op.getOperand(2);
7035 SDValue New = Op.getOperand(3);
7036 EVT VT = Op.getValueType();
7037 MVT SimpleVT = VT.getSimpleVT();
7038 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7039
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007040 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00007041 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00007042
7043 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7044 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00007045}
7046
Tom Stellard75aadc22012-12-11 21:25:42 +00007047//===----------------------------------------------------------------------===//
7048// Custom DAG optimizations
7049//===----------------------------------------------------------------------===//
7050
Matt Arsenault364a6742014-06-11 17:50:44 +00007051SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00007052 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00007053 EVT VT = N->getValueType(0);
7054 EVT ScalarVT = VT.getScalarType();
7055 if (ScalarVT != MVT::f32)
7056 return SDValue();
7057
7058 SelectionDAG &DAG = DCI.DAG;
7059 SDLoc DL(N);
7060
7061 SDValue Src = N->getOperand(0);
7062 EVT SrcVT = Src.getValueType();
7063
7064 // TODO: We could try to match extracting the higher bytes, which would be
7065 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7066 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7067 // about in practice.
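  // For example, if the source is (and i32:x, 0xff), its upper 24 bits are
  // known zero, so a uint_to_fp of it becomes (cvt_f32_ubyte0 (and x, 0xff)).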
Craig Topper80d3bb32018-03-06 19:44:52 +00007068 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
Matt Arsenault364a6742014-06-11 17:50:44 +00007069 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7070 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7071 DCI.AddToWorklist(Cvt.getNode());
7072 return Cvt;
7073 }
7074 }
7075
Matt Arsenault364a6742014-06-11 17:50:44 +00007076 return SDValue();
7077}
7078
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007079// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7080
7081// This is a variant of
7082// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7083//
7084// The normal DAG combiner will do this, but only if the add has one use since
7085// that would increase the number of instructions.
7086//
7087// This prevents us from seeing a constant offset that can be folded into a
7088// memory instruction's addressing mode. If we know the resulting add offset of
7089// a pointer can be folded into an addressing offset, we can replace the pointer
7090// operand with the add of the new constant offset. This eliminates one of the uses,
7091// and may allow the remaining use to also be simplified.
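// For example, (shl (add x, 16), 2) used as a load/store address becomes
// (add (shl x, 2), 64), and the constant 64 can then be folded into the
// instruction's immediate offset when isLegalAddressingMode allows it.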
7092//
7093SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7094 unsigned AddrSpace,
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007095 EVT MemVT,
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007096 DAGCombinerInfo &DCI) const {
7097 SDValue N0 = N->getOperand(0);
7098 SDValue N1 = N->getOperand(1);
7099
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007100 // We only do this to handle cases where it's profitable when there are
7101 // multiple uses of the add, so defer to the standard combine.
Matt Arsenaultc8903122017-11-14 23:46:42 +00007102 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7103 N0->hasOneUse())
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007104 return SDValue();
7105
7106 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7107 if (!CN1)
7108 return SDValue();
7109
7110 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7111 if (!CAdd)
7112 return SDValue();
7113
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007114 // If the resulting offset is too large, we can't fold it into the addressing
7115 // mode offset.
7116 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007117 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7118
7119 AddrMode AM;
7120 AM.HasBaseReg = true;
7121 AM.BaseOffs = Offset.getSExtValue();
7122 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007123 return SDValue();
7124
7125 SelectionDAG &DAG = DCI.DAG;
7126 SDLoc SL(N);
7127 EVT VT = N->getValueType(0);
7128
7129 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007130 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007131
Matt Arsenaulte5e0c742017-11-13 05:33:35 +00007132 SDNodeFlags Flags;
7133 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7134 (N0.getOpcode() == ISD::OR ||
7135 N0->getFlags().hasNoUnsignedWrap()));
7136
7137 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007138}
7139
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007140SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7141 DAGCombinerInfo &DCI) const {
7142 SDValue Ptr = N->getBasePtr();
7143 SelectionDAG &DAG = DCI.DAG;
7144 SDLoc SL(N);
7145
7146 // TODO: We could also do this for multiplies.
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007147 if (Ptr.getOpcode() == ISD::SHL) {
7148 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7149 N->getMemoryVT(), DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007150 if (NewPtr) {
7151 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7152
7153 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7154 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7155 }
7156 }
7157
7158 return SDValue();
7159}
7160
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007161static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7162 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7163 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7164 (Opc == ISD::XOR && Val == 0);
7165}
7166
7167// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
7168// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7169// integer combine opportunities since most 64-bit operations are decomposed
7170// this way. TODO: We won't want this for SALU especially if it is an inline
7171// immediate.
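// For example, (and i64:x, 0x00000000ffff0000) splits into a 32-bit 'and' of
// lo_32(x) with 0xffff0000 and an 'and' of hi_32(x) with 0, which simply folds
// to zero.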
7172SDValue SITargetLowering::splitBinaryBitConstantOp(
7173 DAGCombinerInfo &DCI,
7174 const SDLoc &SL,
7175 unsigned Opc, SDValue LHS,
7176 const ConstantSDNode *CRHS) const {
7177 uint64_t Val = CRHS->getZExtValue();
7178 uint32_t ValLo = Lo_32(Val);
7179 uint32_t ValHi = Hi_32(Val);
7180 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7181
7182 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7183 bitOpWithConstantIsReducible(Opc, ValHi)) ||
7184 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7185 // If we need to materialize a 64-bit immediate, it will be split up later
7186 // anyway. Avoid creating the harder to understand 64-bit immediate
7187 // materialization.
7188 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7189 }
7190
7191 return SDValue();
7192}
7193
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007194// Returns true if the argument is a boolean value which is not serialized into
7195// memory or an argument and does not require v_cndmask_b32 to be deserialized.
7196static bool isBoolSGPR(SDValue V) {
7197 if (V.getValueType() != MVT::i1)
7198 return false;
7199 switch (V.getOpcode()) {
7200 default: break;
7201 case ISD::SETCC:
7202 case ISD::AND:
7203 case ISD::OR:
7204 case ISD::XOR:
7205 case AMDGPUISD::FP_CLASS:
7206 return true;
7207 }
7208 return false;
7209}
7210
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007211// If a constant has all zeroes or all ones within each byte return it.
7212// Otherwise return 0.
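// For example, 0x00ff00ff is returned unchanged since every byte is either all
// zeroes or all ones, while 0x00f000ff returns 0 because its second byte is
// only partially selected.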
7213static uint32_t getConstantPermuteMask(uint32_t C) {
7214 // 0xff for any zero byte in the mask
7215 uint32_t ZeroByteMask = 0;
7216 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7217 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7218 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7219 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7220 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7221 if ((NonZeroByteMask & C) != NonZeroByteMask)
7222 return 0; // Partial bytes selected.
7223 return C;
7224}
7225
7226// Check if a node selects whole bytes from its operand 0 starting at a byte
7227// boundary while masking the rest. Returns the select mask as used by the
7228// v_perm_b32 instruction, or ~0 if it does not.
7229// Note byte select encoding:
7230// value 0-3 selects corresponding source byte;
7231// value 0xc selects zero;
7232// value 0xff selects 0xff.
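// For example, (and x, 0x0000ffff) gives the mask 0x0c0c0100 (keep bytes 1:0
// of x, zero bytes 3:2), and (srl x, 16) gives 0x0c0c0302.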
7233static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7234 assert(V.getValueSizeInBits() == 32);
7235
7236 if (V.getNumOperands() != 2)
7237 return ~0;
7238
7239 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7240 if (!N1)
7241 return ~0;
7242
7243 uint32_t C = N1->getZExtValue();
7244
7245 switch (V.getOpcode()) {
7246 default:
7247 break;
7248 case ISD::AND:
7249 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7250 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7251 }
7252 break;
7253
7254 case ISD::OR:
7255 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7256 return (0x03020100 & ~ConstMask) | ConstMask;
7257 }
7258 break;
7259
7260 case ISD::SHL:
7261 if (C % 8)
7262 return ~0;
7263
7264 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7265
7266 case ISD::SRL:
7267 if (C % 8)
7268 return ~0;
7269
7270 return uint32_t(0x0c0c0c0c03020100ull >> C);
7271 }
7272
7273 return ~0;
7274}
7275
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007276SDValue SITargetLowering::performAndCombine(SDNode *N,
7277 DAGCombinerInfo &DCI) const {
7278 if (DCI.isBeforeLegalize())
7279 return SDValue();
7280
7281 SelectionDAG &DAG = DCI.DAG;
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007282 EVT VT = N->getValueType(0);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007283 SDValue LHS = N->getOperand(0);
7284 SDValue RHS = N->getOperand(1);
7285
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007286
Stanislav Mekhanoshin53a21292017-05-23 19:54:48 +00007287 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7288 if (VT == MVT::i64 && CRHS) {
7289 if (SDValue Split
7290 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7291 return Split;
7292 }
7293
7294 if (CRHS && VT == MVT::i32) {
7295 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7296 // nb = number of trailing zeroes in mask
7297 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7298 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
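    // For example, (and (srl x, 8), 0xff00) has Shift = 8 and NB = 8, so
    // Offset = 16 and Bits = 8, and it becomes (shl (bfe x, 16, 8), 8).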
7299 uint64_t Mask = CRHS->getZExtValue();
7300 unsigned Bits = countPopulation(Mask);
7301 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7302 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7303 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7304 unsigned Shift = CShift->getZExtValue();
7305 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7306 unsigned Offset = NB + Shift;
7307 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7308 SDLoc SL(N);
7309 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7310 LHS->getOperand(0),
7311 DAG.getConstant(Offset, SL, MVT::i32),
7312 DAG.getConstant(Bits, SL, MVT::i32));
7313 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7314 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7315 DAG.getValueType(NarrowVT));
7316 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7317 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7318 return Shl;
7319 }
7320 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007321 }
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007322
7323 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
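    // For example, (and (perm x, y, 0x04030201), 0x0000ffff) becomes
    // (perm x, y, 0x0c0c0201): the two masked-out high bytes now select zero.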
7324 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7325 isa<ConstantSDNode>(LHS.getOperand(2))) {
7326 uint32_t Sel = getConstantPermuteMask(Mask);
7327 if (!Sel)
7328 return SDValue();
7329
7330 // Select 0xc for all zero bytes
7331 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7332 SDLoc DL(N);
7333 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7334 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7335 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007336 }
7337
7338 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7339 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7340 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007341 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7342 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7343
7344 SDValue X = LHS.getOperand(0);
7345 SDValue Y = RHS.getOperand(0);
7346 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7347 return SDValue();
7348
7349 if (LCC == ISD::SETO) {
7350 if (X != LHS.getOperand(1))
7351 return SDValue();
7352
7353 if (RCC == ISD::SETUNE) {
7354 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7355 if (!C1 || !C1->isInfinity() || C1->isNegative())
7356 return SDValue();
7357
7358 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7359 SIInstrFlags::N_SUBNORMAL |
7360 SIInstrFlags::N_ZERO |
7361 SIInstrFlags::P_ZERO |
7362 SIInstrFlags::P_SUBNORMAL |
7363 SIInstrFlags::P_NORMAL;
7364
7365 static_assert(((~(SIInstrFlags::S_NAN |
7366 SIInstrFlags::Q_NAN |
7367 SIInstrFlags::N_INFINITY |
7368 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7369 "mask not equal");
7370
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007371 SDLoc DL(N);
7372 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7373 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007374 }
7375 }
7376 }
7377
Matt Arsenault3dcf4ce2018-08-10 18:58:56 +00007378 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7379 std::swap(LHS, RHS);
7380
7381 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7382 RHS.hasOneUse()) {
7383 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7384 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7385 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7386 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7387 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7388 (RHS.getOperand(0) == LHS.getOperand(0) &&
7389 LHS.getOperand(0) == LHS.getOperand(1))) {
7390 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7391 unsigned NewMask = LCC == ISD::SETO ?
7392 Mask->getZExtValue() & ~OrdMask :
7393 Mask->getZExtValue() & OrdMask;
7394
7395 SDLoc DL(N);
7396 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7397 DAG.getConstant(NewMask, DL, MVT::i32));
7398 }
7399 }
7400
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007401 if (VT == MVT::i32 &&
7402 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7403 // and x, (sext cc from i1) => select cc, x, 0
7404 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7405 std::swap(LHS, RHS);
7406 if (isBoolSGPR(RHS.getOperand(0)))
7407 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7408 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7409 }
7410
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007411 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7412 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7413 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7414 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7415 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7416 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7417 if (LHSMask != ~0u && RHSMask != ~0u) {
7418 // Canonicalize the expression in an attempt to have fewer unique masks
7419 // and therefore fewer registers used to hold the masks.
7420 if (LHSMask > RHSMask) {
7421 std::swap(LHSMask, RHSMask);
7422 std::swap(LHS, RHS);
7423 }
7424
7425 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7426      // set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
7427 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7428 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7429
7430      // Check if we need to combine values from two sources within a byte.
7431 if (!(LHSUsedLanes & RHSUsedLanes) &&
7432 // If we select high and lower word keep it for SDWA.
7433 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7434 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7435 // Each byte in each mask is either selector mask 0-3, or has higher
7436 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
7437 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
7438 // mask which is not 0xff wins. By anding both masks we have a correct
7439        // the mask which is not 0xff wins. By anding both masks we have a correct
7440 uint32_t Mask = LHSMask & RHSMask;
7441 for (unsigned I = 0; I < 32; I += 8) {
7442 uint32_t ByteSel = 0xff << I;
7443 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7444 Mask &= (0x0c << I) & 0xffffffff;
7445 }
7446
7447 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7448 // or 0x0c.
7449 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7450 SDLoc DL(N);
7451
7452 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7453 LHS.getOperand(0), RHS.getOperand(0),
7454 DAG.getConstant(Sel, DL, MVT::i32));
7455 }
7456 }
7457 }
7458
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007459 return SDValue();
7460}
7461
Matt Arsenaultf2290332015-01-06 23:00:39 +00007462SDValue SITargetLowering::performOrCombine(SDNode *N,
7463 DAGCombinerInfo &DCI) const {
7464 SelectionDAG &DAG = DCI.DAG;
7465 SDValue LHS = N->getOperand(0);
7466 SDValue RHS = N->getOperand(1);
7467
Matt Arsenault3b082382016-04-12 18:24:38 +00007468 EVT VT = N->getValueType(0);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007469 if (VT == MVT::i1) {
7470 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7471 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7472 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7473 SDValue Src = LHS.getOperand(0);
7474 if (Src != RHS.getOperand(0))
7475 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007476
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007477 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7478 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7479 if (!CLHS || !CRHS)
7480 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007481
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007482 // Only 10 bits are used.
7483 static const uint32_t MaxMask = 0x3ff;
Matt Arsenault3b082382016-04-12 18:24:38 +00007484
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007485 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7486 SDLoc DL(N);
7487 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7488 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7489 }
Matt Arsenault3b082382016-04-12 18:24:38 +00007490
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007491 return SDValue();
7492 }
7493
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007494 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
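  // For example, (or (perm x, y, 0x0c0c0100), 0xffff0000) becomes
  // (perm x, y, 0xffff0100): the two high bytes now select the constant 0xff.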
7495 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7496 LHS.getOpcode() == AMDGPUISD::PERM &&
7497 isa<ConstantSDNode>(LHS.getOperand(2))) {
7498 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7499 if (!Sel)
7500 return SDValue();
7501
7502 Sel |= LHS.getConstantOperandVal(2);
7503 SDLoc DL(N);
7504 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7505 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7506 }
7507
7508 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
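  // For example, (or (and x, 0x000000ff), (shl y, 24)) becomes
  // (perm y, x, 0x040c0c00): byte 3 of the result is byte 0 of y, byte 0 is
  // byte 0 of x, and the two middle bytes are zero.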
7509 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7510 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7511 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7512 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7513 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7514 if (LHSMask != ~0u && RHSMask != ~0u) {
7515 // Canonicalize the expression in an attempt to have fewer unique masks
7516 // and therefore fewer registers used to hold the masks.
7517 if (LHSMask > RHSMask) {
7518 std::swap(LHSMask, RHSMask);
7519 std::swap(LHS, RHS);
7520 }
7521
7522 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7523      // set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
7524 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7525 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7526
7527      // Check if we need to combine values from two sources within a byte.
7528 if (!(LHSUsedLanes & RHSUsedLanes) &&
7529 // If we select high and lower word keep it for SDWA.
7530 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7531 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7532 // Kill zero bytes selected by other mask. Zero value is 0xc.
7533 LHSMask &= ~RHSUsedLanes;
7534 RHSMask &= ~LHSUsedLanes;
7535 // Add 4 to each active LHS lane
7536 LHSMask |= LHSUsedLanes & 0x04040404;
7537 // Combine masks
7538 uint32_t Sel = LHSMask | RHSMask;
7539 SDLoc DL(N);
7540
7541 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7542 LHS.getOperand(0), RHS.getOperand(0),
7543 DAG.getConstant(Sel, DL, MVT::i32));
7544 }
7545 }
7546 }
7547
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007548 if (VT != MVT::i64)
7549 return SDValue();
7550
7551 // TODO: This could be a generic combine with a predicate for extracting the
7552 // high half of an integer being free.
7553
7554 // (or i64:x, (zero_extend i32:y)) ->
7555 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7556 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7557 RHS.getOpcode() != ISD::ZERO_EXTEND)
7558 std::swap(LHS, RHS);
7559
7560 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7561 SDValue ExtSrc = RHS.getOperand(0);
7562 EVT SrcVT = ExtSrc.getValueType();
7563 if (SrcVT == MVT::i32) {
7564 SDLoc SL(N);
7565 SDValue LowLHS, HiBits;
7566 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7567 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7568
7569 DCI.AddToWorklist(LowOr.getNode());
7570 DCI.AddToWorklist(HiBits.getNode());
7571
7572 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7573 LowOr, HiBits);
7574 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
Matt Arsenault3b082382016-04-12 18:24:38 +00007575 }
7576 }
7577
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007578 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7579 if (CRHS) {
7580 if (SDValue Split
7581 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7582 return Split;
7583 }
Matt Arsenaultf2290332015-01-06 23:00:39 +00007584
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007585 return SDValue();
7586}
Matt Arsenaultf2290332015-01-06 23:00:39 +00007587
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007588SDValue SITargetLowering::performXorCombine(SDNode *N,
7589 DAGCombinerInfo &DCI) const {
7590 EVT VT = N->getValueType(0);
7591 if (VT != MVT::i64)
7592 return SDValue();
Matt Arsenaultf2290332015-01-06 23:00:39 +00007593
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007594 SDValue LHS = N->getOperand(0);
7595 SDValue RHS = N->getOperand(1);
7596
7597 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7598 if (CRHS) {
7599 if (SDValue Split
7600 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7601 return Split;
Matt Arsenaultf2290332015-01-06 23:00:39 +00007602 }
7603
7604 return SDValue();
7605}
7606
Matt Arsenault5cf42712017-04-06 20:58:30 +00007607// Instructions that will be lowered with a final instruction that zeros the
7608// high result bits.
7609// XXX - probably only need to list legal operations.
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007610static bool fp16SrcZerosHighBits(unsigned Opc) {
7611 switch (Opc) {
Matt Arsenault5cf42712017-04-06 20:58:30 +00007612 case ISD::FADD:
7613 case ISD::FSUB:
7614 case ISD::FMUL:
7615 case ISD::FDIV:
7616 case ISD::FREM:
7617 case ISD::FMA:
7618 case ISD::FMAD:
7619 case ISD::FCANONICALIZE:
7620 case ISD::FP_ROUND:
7621 case ISD::UINT_TO_FP:
7622 case ISD::SINT_TO_FP:
7623 case ISD::FABS:
7624 // Fabs is lowered to a bit operation, but it's an and which will clear the
7625 // high bits anyway.
7626 case ISD::FSQRT:
7627 case ISD::FSIN:
7628 case ISD::FCOS:
7629 case ISD::FPOWI:
7630 case ISD::FPOW:
7631 case ISD::FLOG:
7632 case ISD::FLOG2:
7633 case ISD::FLOG10:
7634 case ISD::FEXP:
7635 case ISD::FEXP2:
7636 case ISD::FCEIL:
7637 case ISD::FTRUNC:
7638 case ISD::FRINT:
7639 case ISD::FNEARBYINT:
7640 case ISD::FROUND:
7641 case ISD::FFLOOR:
7642 case ISD::FMINNUM:
7643 case ISD::FMAXNUM:
7644 case AMDGPUISD::FRACT:
7645 case AMDGPUISD::CLAMP:
7646 case AMDGPUISD::COS_HW:
7647 case AMDGPUISD::SIN_HW:
7648 case AMDGPUISD::FMIN3:
7649 case AMDGPUISD::FMAX3:
7650 case AMDGPUISD::FMED3:
7651 case AMDGPUISD::FMAD_FTZ:
7652 case AMDGPUISD::RCP:
7653 case AMDGPUISD::RSQ:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007654 case AMDGPUISD::RCP_IFLAG:
Matt Arsenault5cf42712017-04-06 20:58:30 +00007655 case AMDGPUISD::LDEXP:
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007656 return true;
Matt Arsenault5cf42712017-04-06 20:58:30 +00007657 default:
7658 // fcopysign, select and others may be lowered to 32-bit bit operations
7659 // which don't zero the high bits.
7660 return false;
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007661 }
7662}
7663
7664SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7665 DAGCombinerInfo &DCI) const {
7666 if (!Subtarget->has16BitInsts() ||
7667 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7668 return SDValue();
7669
7670 EVT VT = N->getValueType(0);
7671 if (VT != MVT::i32)
7672 return SDValue();
7673
7674 SDValue Src = N->getOperand(0);
7675 if (Src.getValueType() != MVT::i16)
7676 return SDValue();
7677
7678 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7679 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7680 if (Src.getOpcode() == ISD::BITCAST) {
7681 SDValue BCSrc = Src.getOperand(0);
7682 if (BCSrc.getValueType() == MVT::f16 &&
7683 fp16SrcZerosHighBits(BCSrc.getOpcode()))
7684 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7685 }
7686
7687 return SDValue();
7688}
7689
Matt Arsenaultf2290332015-01-06 23:00:39 +00007690SDValue SITargetLowering::performClassCombine(SDNode *N,
7691 DAGCombinerInfo &DCI) const {
7692 SelectionDAG &DAG = DCI.DAG;
7693 SDValue Mask = N->getOperand(1);
7694
7695 // fp_class x, 0 -> false
7696 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7697 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007698 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00007699 }
7700
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00007701 if (N->getOperand(0).isUndef())
7702 return DAG.getUNDEF(MVT::i1);
7703
Matt Arsenaultf2290332015-01-06 23:00:39 +00007704 return SDValue();
7705}
7706
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007707SDValue SITargetLowering::performRcpCombine(SDNode *N,
7708 DAGCombinerInfo &DCI) const {
7709 EVT VT = N->getValueType(0);
7710 SDValue N0 = N->getOperand(0);
7711
7712 if (N0.isUndef())
7713 return N0;
7714
7715 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7716 N0.getOpcode() == ISD::SINT_TO_FP)) {
7717 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7718 N->getFlags());
7719 }
7720
7721 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7722}
7723
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007724bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7725 unsigned MaxDepth) const {
7726 unsigned Opcode = Op.getOpcode();
7727 if (Opcode == ISD::FCANONICALIZE)
7728 return true;
7729
7730 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7731 auto F = CFP->getValueAPF();
7732 if (F.isNaN() && F.isSignaling())
7733 return false;
7734 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7735 }
7736
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007737 // If source is a result of another standard FP operation it is already in
7738 // canonical form.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007739 if (MaxDepth == 0)
7740 return false;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007741
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007742 switch (Opcode) {
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007743 // These will flush denorms if required.
7744 case ISD::FADD:
7745 case ISD::FSUB:
7746 case ISD::FMUL:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007747 case ISD::FCEIL:
7748 case ISD::FFLOOR:
7749 case ISD::FMA:
7750 case ISD::FMAD:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007751 case ISD::FSQRT:
7752 case ISD::FDIV:
7753 case ISD::FREM:
Matt Arsenaultce6d61f2018-08-06 21:51:52 +00007754 case ISD::FP_ROUND:
7755 case ISD::FP_EXTEND:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007756 case AMDGPUISD::FMUL_LEGACY:
7757 case AMDGPUISD::FMAD_FTZ:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007758 case AMDGPUISD::RCP:
7759 case AMDGPUISD::RSQ:
7760 case AMDGPUISD::RSQ_CLAMP:
7761 case AMDGPUISD::RCP_LEGACY:
7762 case AMDGPUISD::RSQ_LEGACY:
7763 case AMDGPUISD::RCP_IFLAG:
7764 case AMDGPUISD::TRIG_PREOP:
7765 case AMDGPUISD::DIV_SCALE:
7766 case AMDGPUISD::DIV_FMAS:
7767 case AMDGPUISD::DIV_FIXUP:
7768 case AMDGPUISD::FRACT:
7769 case AMDGPUISD::LDEXP:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007770 case AMDGPUISD::CVT_PKRTZ_F16_F32:
Matt Arsenault940e6072018-08-10 19:20:17 +00007771 case AMDGPUISD::CVT_F32_UBYTE0:
7772 case AMDGPUISD::CVT_F32_UBYTE1:
7773 case AMDGPUISD::CVT_F32_UBYTE2:
7774 case AMDGPUISD::CVT_F32_UBYTE3:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007775 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007776
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007777 // It can/will be lowered or combined as a bit operation.
7778 // Need to check their input recursively to handle.
7779 case ISD::FNEG:
7780 case ISD::FABS:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007781 case ISD::FCOPYSIGN:
7782 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007783
7784 case ISD::FSIN:
7785 case ISD::FCOS:
7786 case ISD::FSINCOS:
7787 return Op.getValueType().getScalarType() != MVT::f16;
7788
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007789 case ISD::FMINNUM:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007790 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00007791 case ISD::FMINNUM_IEEE:
7792 case ISD::FMAXNUM_IEEE:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007793 case AMDGPUISD::CLAMP:
7794 case AMDGPUISD::FMED3:
7795 case AMDGPUISD::FMAX3:
7796 case AMDGPUISD::FMIN3: {
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007797    // FIXME: Shouldn't treat the generic operations differently based on these.
Matt Arsenault687ec752018-10-22 16:27:27 +00007798    // However, we aren't really required to flush the result from
7799    // minnum/maxnum.
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007800
Matt Arsenault687ec752018-10-22 16:27:27 +00007801 // snans will be quieted, so we only need to worry about denormals.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007802 if (Subtarget->supportsMinMaxDenormModes() ||
Matt Arsenault687ec752018-10-22 16:27:27 +00007803 denormalsEnabledForType(Op.getValueType()))
7804 return true;
7805
7806 // Flushing may be required.
7807 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7808 // targets need to check their input recursively.
7809
7810 // FIXME: Does this apply with clamp? It's implemented with max.
7811 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
7812 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
7813 return false;
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007814 }
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007815
Matt Arsenault687ec752018-10-22 16:27:27 +00007816 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007817 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007818 case ISD::SELECT: {
7819 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7820 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007821 }
Matt Arsenaulte94ee832018-08-06 22:45:51 +00007822 case ISD::BUILD_VECTOR: {
7823 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7824 SDValue SrcOp = Op.getOperand(i);
7825 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7826 return false;
7827 }
7828
7829 return true;
7830 }
7831 case ISD::EXTRACT_VECTOR_ELT:
7832 case ISD::EXTRACT_SUBVECTOR: {
7833 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7834 }
7835 case ISD::INSERT_VECTOR_ELT: {
7836 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7837 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7838 }
7839 case ISD::UNDEF:
7840 // Could be anything.
7841 return false;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007842
Matt Arsenault687ec752018-10-22 16:27:27 +00007843 case ISD::BITCAST: {
7844    // Hack around the mess we make when legalizing extract_vector_elt
7845 SDValue Src = Op.getOperand(0);
7846 if (Src.getValueType() == MVT::i16 &&
7847 Src.getOpcode() == ISD::TRUNCATE) {
7848 SDValue TruncSrc = Src.getOperand(0);
7849 if (TruncSrc.getValueType() == MVT::i32 &&
7850 TruncSrc.getOpcode() == ISD::BITCAST &&
7851 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
7852 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
7853 }
7854 }
7855
7856 return false;
7857 }
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007858 case ISD::INTRINSIC_WO_CHAIN: {
7859 unsigned IntrinsicID
7860 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7861 // TODO: Handle more intrinsics
7862 switch (IntrinsicID) {
7863 case Intrinsic::amdgcn_cvt_pkrtz:
Matt Arsenault940e6072018-08-10 19:20:17 +00007864 case Intrinsic::amdgcn_cubeid:
7865 case Intrinsic::amdgcn_frexp_mant:
7866 case Intrinsic::amdgcn_fdot2:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007867 return true;
7868 default:
7869 break;
7870 }
Matt Arsenault5bb9d792018-08-10 17:57:12 +00007871
7872 LLVM_FALLTHROUGH;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007873 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007874 default:
7875 return denormalsEnabledForType(Op.getValueType()) &&
7876 DAG.isKnownNeverSNaN(Op);
7877 }
7878
7879 llvm_unreachable("invalid operation");
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007880}
7881
Matt Arsenault9cd90712016-04-14 01:42:16 +00007882// Constant fold canonicalize.
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007883SDValue SITargetLowering::getCanonicalConstantFP(
7884 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7885 // Flush denormals to 0 if not enabled.
7886 if (C.isDenormal() && !denormalsEnabledForType(VT))
7887 return DAG.getConstantFP(0.0, SL, VT);
7888
7889 if (C.isNaN()) {
7890 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7891 if (C.isSignaling()) {
7892 // Quiet a signaling NaN.
7893 // FIXME: Is this supposed to preserve payload bits?
7894 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7895 }
7896
7897 // Make sure it is the canonical NaN bitpattern.
7898 //
7899 // TODO: Can we use -1 as the canonical NaN value since it's an inline
7900 // immediate?
7901 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7902 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7903 }
7904
7905 // Already canonical.
7906 return DAG.getConstantFP(C, SL, VT);
7907}
7908
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007909static bool vectorEltWillFoldAway(SDValue Op) {
7910 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7911}
7912
Matt Arsenault9cd90712016-04-14 01:42:16 +00007913SDValue SITargetLowering::performFCanonicalizeCombine(
7914 SDNode *N,
7915 DAGCombinerInfo &DCI) const {
Matt Arsenault9cd90712016-04-14 01:42:16 +00007916 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007917 SDValue N0 = N->getOperand(0);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007918 EVT VT = N->getValueType(0);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007919
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007920 // fcanonicalize undef -> qnan
7921 if (N0.isUndef()) {
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007922 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7923 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7924 }
7925
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007926 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
Matt Arsenault9cd90712016-04-14 01:42:16 +00007927 EVT VT = N->getValueType(0);
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007928 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
Matt Arsenault9cd90712016-04-14 01:42:16 +00007929 }
7930
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007931 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7932 // (fcanonicalize k)
7933 //
7934 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7935
7936 // TODO: This could be better with wider vectors that will be split to v2f16,
7937 // and to consider uses since there aren't that many packed operations.
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007938 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7939 isTypeLegal(MVT::v2f16)) {
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007940 SDLoc SL(N);
7941 SDValue NewElts[2];
7942 SDValue Lo = N0.getOperand(0);
7943 SDValue Hi = N0.getOperand(1);
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007944 EVT EltVT = Lo.getValueType();
7945
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007946 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
7947 for (unsigned I = 0; I != 2; ++I) {
7948 SDValue Op = N0.getOperand(I);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007949 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7950 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7951 CFP->getValueAPF());
7952 } else if (Op.isUndef()) {
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007953 // Handled below based on what the other operand is.
7954 NewElts[I] = Op;
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007955 } else {
7956 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7957 }
7958 }
7959
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007960      // If one half is undef, and one is constant, prefer a splat vector rather
7961 // than the normal qNaN. If it's a register, prefer 0.0 since that's
7962 // cheaper to use and may be free with a packed operation.
7963 if (NewElts[0].isUndef()) {
7964        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
7965          NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
7967 }
7968
7969 if (NewElts[1].isUndef()) {
7970 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7971 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7972 }
7973
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007974 return DAG.getBuildVector(VT, SL, NewElts);
7975 }
7976 }
7977
Matt Arsenault687ec752018-10-22 16:27:27 +00007978 unsigned SrcOpc = N0.getOpcode();
7979
7980 // If it's free to do so, push canonicalizes further up the source, which may
7981 // find a canonical source.
7982 //
7983  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
7984 // sNaNs.
7985 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
7986 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
7987 if (CRHS && N0.hasOneUse()) {
7988 SDLoc SL(N);
7989 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
7990 N0.getOperand(0));
7991 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
7992 DCI.AddToWorklist(Canon0.getNode());
7993
7994 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
7995 }
7996 }
7997
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007998 return isCanonicalized(DAG, N0) ? N0 : SDValue();
Matt Arsenault9cd90712016-04-14 01:42:16 +00007999}
8000
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008001static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8002 switch (Opc) {
8003 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008004 case ISD::FMAXNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008005 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008006 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008007 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008008 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008009 return AMDGPUISD::UMAX3;
8010 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008011 case ISD::FMINNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008012 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008013 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008014 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008015 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008016 return AMDGPUISD::UMIN3;
8017 default:
8018 llvm_unreachable("Not a min/max opcode");
8019 }
8020}
8021
Matt Arsenault10268f92017-02-27 22:40:39 +00008022SDValue SITargetLowering::performIntMed3ImmCombine(
8023 SelectionDAG &DAG, const SDLoc &SL,
8024 SDValue Op0, SDValue Op1, bool Signed) const {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008025 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8026 if (!K1)
8027 return SDValue();
8028
8029 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8030 if (!K0)
8031 return SDValue();
8032
Matt Arsenaultf639c322016-01-28 20:53:42 +00008033 if (Signed) {
8034 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8035 return SDValue();
8036 } else {
8037 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8038 return SDValue();
8039 }
8040
8041 EVT VT = K0->getValueType(0);
Matt Arsenault10268f92017-02-27 22:40:39 +00008042 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8043 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8044 return DAG.getNode(Med3Opc, SL, VT,
8045 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8046 }
Tom Stellard115a6152016-11-10 16:02:37 +00008047
Matt Arsenault10268f92017-02-27 22:40:39 +00008048 // If there isn't a 16-bit med3 operation, convert to 32-bit.
Tom Stellard115a6152016-11-10 16:02:37 +00008049 MVT NVT = MVT::i32;
8050 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8051
Matt Arsenault10268f92017-02-27 22:40:39 +00008052 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8053 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8054 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
Tom Stellard115a6152016-11-10 16:02:37 +00008055
Matt Arsenault10268f92017-02-27 22:40:39 +00008056 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8057 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008058}
8059
Matt Arsenault6b114d22017-08-30 01:20:17 +00008060static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8061 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8062 return C;
8063
8064 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8065 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8066 return C;
8067 }
8068
8069 return nullptr;
8070}
8071
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008072SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8073 const SDLoc &SL,
8074 SDValue Op0,
8075 SDValue Op1) const {
Matt Arsenault6b114d22017-08-30 01:20:17 +00008076 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008077 if (!K1)
8078 return SDValue();
8079
Matt Arsenault6b114d22017-08-30 01:20:17 +00008080 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
Matt Arsenaultf639c322016-01-28 20:53:42 +00008081 if (!K0)
8082 return SDValue();
8083
8084 // Ordered >= (although NaN inputs should have folded away by now).
8085 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8086 if (Cmp == APFloat::cmpGreaterThan)
8087 return SDValue();
8088
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008089 // TODO: Check IEEE bit enabled?
Matt Arsenault6b114d22017-08-30 01:20:17 +00008090 EVT VT = Op0.getValueType();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008091 if (Subtarget->enableDX10Clamp()) {
8092 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8093 // hardware fmed3 behavior converting to a min.
8094 // FIXME: Should this be allowing -0.0?
8095 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8096 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8097 }
8098
Matt Arsenault6b114d22017-08-30 01:20:17 +00008099 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8100 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8101 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8102 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8103 // then give the other result, which is different from med3 with a NaN
8104 // input.
8105 SDValue Var = Op0.getOperand(0);
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +00008106 if (!DAG.isKnownNeverSNaN(Var))
Matt Arsenault6b114d22017-08-30 01:20:17 +00008107 return SDValue();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008108
Matt Arsenaultebf46142018-09-18 02:34:54 +00008109 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8110
8111 if ((!K0->hasOneUse() ||
8112 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8113 (!K1->hasOneUse() ||
8114 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8115 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8116 Var, SDValue(K0, 0), SDValue(K1, 0));
8117 }
Matt Arsenault6b114d22017-08-30 01:20:17 +00008118 }
Matt Arsenaultf639c322016-01-28 20:53:42 +00008119
Matt Arsenault6b114d22017-08-30 01:20:17 +00008120 return SDValue();
Matt Arsenaultf639c322016-01-28 20:53:42 +00008121}
8122
8123SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8124 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008125 SelectionDAG &DAG = DCI.DAG;
8126
Matt Arsenault79a45db2017-02-22 23:53:37 +00008127 EVT VT = N->getValueType(0);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008128 unsigned Opc = N->getOpcode();
8129 SDValue Op0 = N->getOperand(0);
8130 SDValue Op1 = N->getOperand(1);
8131
8132  // Only do this if the inner op has one use since this will just increase
8133 // register pressure for no benefit.
8134
Matt Arsenault79a45db2017-02-22 23:53:37 +00008135
8136 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
Farhana Aleene80aeac2018-04-03 23:00:30 +00008137 !VT.isVector() && VT != MVT::f64 &&
Matt Arsenaultee324ff2017-05-17 19:25:06 +00008138 ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
Matt Arsenault5b39b342016-01-28 20:53:48 +00008139 // max(max(a, b), c) -> max3(a, b, c)
8140 // min(min(a, b), c) -> min3(a, b, c)
8141 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8142 SDLoc DL(N);
8143 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8144 DL,
8145 N->getValueType(0),
8146 Op0.getOperand(0),
8147 Op0.getOperand(1),
8148 Op1);
8149 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008150
Matt Arsenault5b39b342016-01-28 20:53:48 +00008151 // Try commuted.
8152 // max(a, max(b, c)) -> max3(a, b, c)
8153 // min(a, min(b, c)) -> min3(a, b, c)
8154 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8155 SDLoc DL(N);
8156 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8157 DL,
8158 N->getValueType(0),
8159 Op0,
8160 Op1.getOperand(0),
8161 Op1.getOperand(1));
8162 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008163 }
8164
Matt Arsenaultf639c322016-01-28 20:53:42 +00008165 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
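  // For example, (smin (smax x, -5), 17) becomes (smed3 x, -5, 17), clamping
  // x to the range [-5, 17].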
8166 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8167 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8168 return Med3;
8169 }
8170
8171 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8172 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8173 return Med3;
8174 }
8175
8176 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
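  // For example, (fminnum (fmaxnum x, 2.0), 4.0) becomes (fmed3 x, 2.0, 4.0)
  // when x is known not to be a signaling NaN; the 0.0/1.0 pair is instead
  // turned into AMDGPUISD::CLAMP by performFPMed3ImmCombine when dx10_clamp
  // is enabled.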
Matt Arsenault5b39b342016-01-28 20:53:48 +00008177 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008178 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
Matt Arsenault5b39b342016-01-28 20:53:48 +00008179 (Opc == AMDGPUISD::FMIN_LEGACY &&
8180 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenault79a45db2017-02-22 23:53:37 +00008181 (VT == MVT::f32 || VT == MVT::f64 ||
Matt Arsenault6b114d22017-08-30 01:20:17 +00008182 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8183 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008184 Op0.hasOneUse()) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008185 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8186 return Res;
8187 }
8188
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008189 return SDValue();
8190}
8191
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008192static bool isClampZeroToOne(SDValue A, SDValue B) {
8193 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8194 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8195 // FIXME: Should this be allowing -0.0?
8196 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8197 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8198 }
8199 }
8200
8201 return false;
8202}
8203
8204// FIXME: Should only worry about snans for version with chain.
8205SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8206 DAGCombinerInfo &DCI) const {
8207 EVT VT = N->getValueType(0);
8208 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8209 // NaNs. With a NaN input, the order of the operands may change the result.
8210
8211 SelectionDAG &DAG = DCI.DAG;
8212 SDLoc SL(N);
8213
8214 SDValue Src0 = N->getOperand(0);
8215 SDValue Src1 = N->getOperand(1);
8216 SDValue Src2 = N->getOperand(2);
8217
8218 if (isClampZeroToOne(Src0, Src1)) {
8219 // const_a, const_b, x -> clamp is safe in all cases including signaling
8220 // nans.
8221 // FIXME: Should this be allowing -0.0?
8222 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8223 }
8224
8225 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8226 // handling no dx10-clamp?
8227 if (Subtarget->enableDX10Clamp()) {
8228 // If NaNs is clamped to 0, we are free to reorder the inputs.
8229
8230 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8231 std::swap(Src0, Src1);
8232
8233 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8234 std::swap(Src1, Src2);
8235
8236 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8237 std::swap(Src0, Src1);
8238
8239 if (isClampZeroToOne(Src1, Src2))
8240 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8241 }
8242
8243 return SDValue();
8244}
8245
Matt Arsenault1f17c662017-02-22 00:27:34 +00008246SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8247 DAGCombinerInfo &DCI) const {
8248 SDValue Src0 = N->getOperand(0);
8249 SDValue Src1 = N->getOperand(1);
8250 if (Src0.isUndef() && Src1.isUndef())
8251 return DCI.DAG.getUNDEF(N->getValueType(0));
8252 return SDValue();
8253}
8254
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008255SDValue SITargetLowering::performExtractVectorEltCombine(
8256 SDNode *N, DAGCombinerInfo &DCI) const {
8257 SDValue Vec = N->getOperand(0);
Matt Arsenault8cbb4882017-09-20 21:01:24 +00008258 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008259
8260 EVT VecVT = Vec.getValueType();
8261 EVT EltVT = VecVT.getVectorElementType();
8262
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008263 if ((Vec.getOpcode() == ISD::FNEG ||
8264 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008265 SDLoc SL(N);
8266 EVT EltVT = N->getValueType(0);
8267 SDValue Idx = N->getOperand(1);
8268 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8269 Vec.getOperand(0), Idx);
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008270 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008271 }
8272
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008273 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8274 // =>
8275 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8276 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8277 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008278 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008279 SDLoc SL(N);
8280 EVT EltVT = N->getValueType(0);
8281 SDValue Idx = N->getOperand(1);
8282 unsigned Opc = Vec.getOpcode();
8283
8284 switch(Opc) {
8285 default:
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008286 break;
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008287 // TODO: Support other binary operations.
8288 case ISD::FADD:
Matt Arsenaulta8160732018-08-15 21:34:06 +00008289 case ISD::FSUB:
8290 case ISD::FMUL:
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008291 case ISD::ADD:
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008292 case ISD::UMIN:
8293 case ISD::UMAX:
8294 case ISD::SMIN:
8295 case ISD::SMAX:
8296 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008297 case ISD::FMINNUM:
8298 case ISD::FMAXNUM_IEEE:
8299 case ISD::FMINNUM_IEEE: {
Matt Arsenaulta8160732018-08-15 21:34:06 +00008300 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8301 Vec.getOperand(0), Idx);
8302 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8303 Vec.getOperand(1), Idx);
8304
8305 DCI.AddToWorklist(Elt0.getNode());
8306 DCI.AddToWorklist(Elt1.getNode());
8307 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8308 }
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008309 }
8310 }
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008311
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008312 unsigned VecSize = VecVT.getSizeInBits();
8313 unsigned EltSize = EltVT.getSizeInBits();
8314
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008315 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8316 // This eliminates the non-constant index and the subsequent movrel or scratch access.
8317 // Sub-dword vectors of two dwords or less have a better implementation.
8318 // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8319 // instructions.
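  // Illustrative sketch (not taken from a test): for a v4i32 vector this turns
  //   extract_vector_elt %vec, %idx
  // into e0..e3 = extract_vector_elt %vec, 0..3 followed by
  //   select (%idx==3), e3, (select (%idx==2), e2, (select (%idx==1), e1, e0)).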
8320 if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8321 !isa<ConstantSDNode>(N->getOperand(1))) {
8322 SDLoc SL(N);
8323 SDValue Idx = N->getOperand(1);
8324 EVT IdxVT = Idx.getValueType();
8325 SDValue V;
8326 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8327 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8328 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8329 if (I == 0)
8330 V = Elt;
8331 else
8332 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8333 }
8334 return V;
8335 }
8336
8337 if (!DCI.isBeforeLegalize())
8338 return SDValue();
8339
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008340 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8341 // elements. This exposes more load reduction opportunities by replacing
8342 // multiple small extract_vector_elements with a single 32-bit extract.
8343 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
Matt Arsenaultbf07a502018-08-31 15:39:52 +00008344 if (isa<MemSDNode>(Vec) &&
8345 EltSize <= 16 &&
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008346 EltVT.isByteSized() &&
8347 VecSize > 32 &&
8348 VecSize % 32 == 0 &&
8349 Idx) {
8350 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8351
8352 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8353 unsigned EltIdx = BitIndex / 32;
8354 unsigned LeftoverBitIdx = BitIndex % 32;
8355 SDLoc SL(N);
8356
8357 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8358 DCI.AddToWorklist(Cast.getNode());
8359
8360 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8361 DAG.getConstant(EltIdx, SL, MVT::i32));
8362 DCI.AddToWorklist(Elt.getNode());
8363 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8364 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8365 DCI.AddToWorklist(Srl.getNode());
8366
8367 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8368 DCI.AddToWorklist(Trunc.getNode());
8369 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8370 }
8371
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008372 return SDValue();
8373}
8374
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008375SDValue
8376SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8377 DAGCombinerInfo &DCI) const {
8378 SDValue Vec = N->getOperand(0);
8379 SDValue Idx = N->getOperand(2);
8380 EVT VecVT = Vec.getValueType();
8381 EVT EltVT = VecVT.getVectorElementType();
8382 unsigned VecSize = VecVT.getSizeInBits();
8383 unsigned EltSize = EltVT.getSizeInBits();
8384
8385 // INSERT_VECTOR_ELT (<n x e>, var-idx)
8386 // => BUILD_VECTOR n x select (e, const-idx)
8387 // This eliminates the non-constant index and the subsequent movrel or scratch access.
8388 // Sub-dword vectors of two dwords or less have a better implementation.
8389 // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8390 // instructions.
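  // Illustrative sketch (not taken from a test): for a v4i32 vector this turns
  //   insert_vector_elt %vec, %ins, %idx
  // into a build_vector whose lane i is
  //   select (%idx == i), %ins, (extract_vector_elt %vec, i), for i = 0..3.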
8391 if (isa<ConstantSDNode>(Idx) ||
8392 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8393 return SDValue();
8394
8395 SelectionDAG &DAG = DCI.DAG;
8396 SDLoc SL(N);
8397 SDValue Ins = N->getOperand(1);
8398 EVT IdxVT = Idx.getValueType();
8399
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008400 SmallVector<SDValue, 16> Ops;
8401 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8402 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8403 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8404 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8405 Ops.push_back(V);
8406 }
8407
8408 return DAG.getBuildVector(VecVT, SL, Ops);
8409}
8410
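// Choose the opcode used to fuse a multiply with an add/sub: FMAD when the type
// does not need denormal support (v_mad never handles denormals), FMA when
// contraction is allowed and FMA is fast for the type, or 0 to not fuse.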
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008411unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8412 const SDNode *N0,
8413 const SDNode *N1) const {
8414 EVT VT = N0->getValueType(0);
8415
Matt Arsenault770ec862016-12-22 03:55:35 +00008416 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8417 // support denormals ever.
8418 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8419 (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8420 return ISD::FMAD;
8421
8422 const TargetOptions &Options = DAG.getTarget().Options;
Amara Emersond28f0cd42017-05-01 15:17:51 +00008423 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
Michael Berg7acc81b2018-05-04 18:48:20 +00008424 (N0->getFlags().hasAllowContract() &&
8425 N1->getFlags().hasAllowContract())) &&
Matt Arsenault770ec862016-12-22 03:55:35 +00008426 isFMAFasterThanFMulAndFAdd(VT)) {
8427 return ISD::FMA;
8428 }
8429
8430 return 0;
8431}
8432
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008433// For a reassociable opcode, perform:
8434// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
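// Keeping the two uniform values together lets that half be computed by a
// single scalar instruction, leaving only one VALU instruction that consumes
// the divergent value.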
8435SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8436 SelectionDAG &DAG) const {
8437 EVT VT = N->getValueType(0);
8438 if (VT != MVT::i32 && VT != MVT::i64)
8439 return SDValue();
8440
8441 unsigned Opc = N->getOpcode();
8442 SDValue Op0 = N->getOperand(0);
8443 SDValue Op1 = N->getOperand(1);
8444
8445 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8446 return SDValue();
8447
8448 if (Op0->isDivergent())
8449 std::swap(Op0, Op1);
8450
8451 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8452 return SDValue();
8453
8454 SDValue Op2 = Op1.getOperand(1);
8455 Op1 = Op1.getOperand(0);
8456 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
8457 return SDValue();
8458
8459 if (Op1->isDivergent())
8460 std::swap(Op1, Op2);
8461
8462 // If either operand is constant this will conflict with
8463 // DAGCombiner::ReassociateOps().
Stanislav Mekhanoshinda1628e2019-02-26 20:56:25 +00008464 if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
8465 DAG.isConstantIntBuildVectorOrConstantInt(Op1))
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008466 return SDValue();
8467
8468 SDLoc SL(N);
8469 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
8470 return DAG.getNode(Opc, SL, VT, Add1, Op2);
8471}
8472
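// Build a MAD_I64_I32 / MAD_U64_U32 node, which multiplies two 32-bit values
// and adds a 64-bit addend, producing an {i64, i1} pair; the 64-bit result is
// then truncated back to VT.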
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008473static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8474 EVT VT,
8475 SDValue N0, SDValue N1, SDValue N2,
8476 bool Signed) {
8477 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8478 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8479 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8480 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8481}
8482
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008483SDValue SITargetLowering::performAddCombine(SDNode *N,
8484 DAGCombinerInfo &DCI) const {
8485 SelectionDAG &DAG = DCI.DAG;
8486 EVT VT = N->getValueType(0);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008487 SDLoc SL(N);
8488 SDValue LHS = N->getOperand(0);
8489 SDValue RHS = N->getOperand(1);
8490
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008491 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8492 && Subtarget->hasMad64_32() &&
8493 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8494 VT.getScalarSizeInBits() <= 64) {
8495 if (LHS.getOpcode() != ISD::MUL)
8496 std::swap(LHS, RHS);
8497
8498 SDValue MulLHS = LHS.getOperand(0);
8499 SDValue MulRHS = LHS.getOperand(1);
8500 SDValue AddRHS = RHS;
8501
8502 // TODO: Maybe restrict if SGPR inputs.
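    // The unsigned form applies when both factors are known to fit in 32
    // unsigned bits; the signed form below requires strictly fewer than 32
    // significant bits so the 32-bit multiply keeps the sign.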
8503 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8504 numBitsUnsigned(MulRHS, DAG) <= 32) {
8505 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8506 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8507 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8508 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8509 }
8510
8511 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8512 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8513 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8514 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8515 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8516 }
8517
8518 return SDValue();
8519 }
8520
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008521 if (SDValue V = reassociateScalarOps(N, DAG)) {
8522 return V;
8523 }
8524
Farhana Aleen07e61232018-05-02 18:16:39 +00008525 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008526 return SDValue();
8527
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008528 // add x, zext (setcc) => addcarry x, 0, setcc
8529 // add x, sext (setcc) => subcarry x, 0, setcc
8530 unsigned Opc = LHS.getOpcode();
8531 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008532 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008533 std::swap(RHS, LHS);
8534
8535 Opc = RHS.getOpcode();
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008536 switch (Opc) {
8537 default: break;
8538 case ISD::ZERO_EXTEND:
8539 case ISD::SIGN_EXTEND:
8540 case ISD::ANY_EXTEND: {
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008541 auto Cond = RHS.getOperand(0);
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00008542 if (!isBoolSGPR(Cond))
Stanislav Mekhanoshin3ed38c62017-06-21 23:46:22 +00008543 break;
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008544 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8545 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8546 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8547 return DAG.getNode(Opc, SL, VTList, Args);
8548 }
8549 case ISD::ADDCARRY: {
8550 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8551 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8552 if (!C || C->getZExtValue() != 0) break;
8553 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8554 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8555 }
8556 }
8557 return SDValue();
8558}
8559
8560SDValue SITargetLowering::performSubCombine(SDNode *N,
8561 DAGCombinerInfo &DCI) const {
8562 SelectionDAG &DAG = DCI.DAG;
8563 EVT VT = N->getValueType(0);
8564
8565 if (VT != MVT::i32)
8566 return SDValue();
8567
8568 SDLoc SL(N);
8569 SDValue LHS = N->getOperand(0);
8570 SDValue RHS = N->getOperand(1);
8571
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008572 if (LHS.getOpcode() == ISD::SUBCARRY) {
8573 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8574 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
Stanislav Mekhanoshin42e229e2019-02-21 02:58:00 +00008575 if (!C || !C->isNullValue())
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008576 return SDValue();
8577 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8578 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8579 }
8580 return SDValue();
8581}
8582
8583SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8584 DAGCombinerInfo &DCI) const {
8585
8586 if (N->getValueType(0) != MVT::i32)
8587 return SDValue();
8588
8589 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8590 if (!C || C->getZExtValue() != 0)
8591 return SDValue();
8592
8593 SelectionDAG &DAG = DCI.DAG;
8594 SDValue LHS = N->getOperand(0);
8595
8596 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8597 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8598 unsigned LHSOpc = LHS.getOpcode();
8599 unsigned Opc = N->getOpcode();
8600 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8601 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8602 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8603 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008604 }
8605 return SDValue();
8606}
8607
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008608SDValue SITargetLowering::performFAddCombine(SDNode *N,
8609 DAGCombinerInfo &DCI) const {
8610 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8611 return SDValue();
8612
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008613 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault770ec862016-12-22 03:55:35 +00008614 EVT VT = N->getValueType(0);
Matt Arsenault770ec862016-12-22 03:55:35 +00008615
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008616 SDLoc SL(N);
8617 SDValue LHS = N->getOperand(0);
8618 SDValue RHS = N->getOperand(1);
8619
8620 // These should really be instruction patterns, but writing patterns with
8621 // source modifiers is a pain.
8622
8623 // fadd (fadd (a, a), b) -> mad 2.0, a, b
8624 if (LHS.getOpcode() == ISD::FADD) {
8625 SDValue A = LHS.getOperand(0);
8626 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008627 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008628 if (FusedOp != 0) {
8629 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008630 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008631 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008632 }
8633 }
8634
8635 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8636 if (RHS.getOpcode() == ISD::FADD) {
8637 SDValue A = RHS.getOperand(0);
8638 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008639 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008640 if (FusedOp != 0) {
8641 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008642 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008643 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008644 }
8645 }
8646
8647 return SDValue();
8648}
8649
8650SDValue SITargetLowering::performFSubCombine(SDNode *N,
8651 DAGCombinerInfo &DCI) const {
8652 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8653 return SDValue();
8654
8655 SelectionDAG &DAG = DCI.DAG;
8656 SDLoc SL(N);
8657 EVT VT = N->getValueType(0);
8658 assert(!VT.isVector());
8659
8660 // Try to get the fneg to fold into the source modifier. This undoes generic
8661 // DAG combines and folds them into the mad.
8662 //
8663 // Only do this if we are not trying to support denormals. v_mad_f32 does
8664 // not support denormals ever.
Matt Arsenault770ec862016-12-22 03:55:35 +00008665 SDValue LHS = N->getOperand(0);
8666 SDValue RHS = N->getOperand(1);
8667 if (LHS.getOpcode() == ISD::FADD) {
8668 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8669 SDValue A = LHS.getOperand(0);
8670 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008671 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008672 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008673 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8674 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8675
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008676 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008677 }
8678 }
Matt Arsenault770ec862016-12-22 03:55:35 +00008679 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008680
Matt Arsenault770ec862016-12-22 03:55:35 +00008681 if (RHS.getOpcode() == ISD::FADD) {
8682 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008683
Matt Arsenault770ec862016-12-22 03:55:35 +00008684 SDValue A = RHS.getOperand(0);
8685 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008686 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008687 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008688 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008689 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008690 }
8691 }
8692 }
8693
8694 return SDValue();
8695}
8696
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008697SDValue SITargetLowering::performFMACombine(SDNode *N,
8698 DAGCombinerInfo &DCI) const {
8699 SelectionDAG &DAG = DCI.DAG;
8700 EVT VT = N->getValueType(0);
8701 SDLoc SL(N);
8702
Stanislav Mekhanoshin0e858b02019-02-09 00:34:21 +00008703 if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008704 return SDValue();
8705
8706 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8707 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
8708 SDValue Op1 = N->getOperand(0);
8709 SDValue Op2 = N->getOperand(1);
8710 SDValue FMA = N->getOperand(2);
8711
8712 if (FMA.getOpcode() != ISD::FMA ||
8713 Op1.getOpcode() != ISD::FP_EXTEND ||
8714 Op2.getOpcode() != ISD::FP_EXTEND)
8715 return SDValue();
8716
8717 // fdot2_f32_f16 always flushes fp32 denormal operands and the output to zero,
8718 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8719 // is sufficient to allow generating fdot2.
8720 const TargetOptions &Options = DAG.getTarget().Options;
8721 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8722 (N->getFlags().hasAllowContract() &&
8723 FMA->getFlags().hasAllowContract())) {
8724 Op1 = Op1.getOperand(0);
8725 Op2 = Op2.getOperand(0);
8726 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8727 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8728 return SDValue();
8729
8730 SDValue Vec1 = Op1.getOperand(0);
8731 SDValue Idx1 = Op1.getOperand(1);
8732 SDValue Vec2 = Op2.getOperand(0);
8733
8734 SDValue FMAOp1 = FMA.getOperand(0);
8735 SDValue FMAOp2 = FMA.getOperand(1);
8736 SDValue FMAAcc = FMA.getOperand(2);
8737
8738 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8739 FMAOp2.getOpcode() != ISD::FP_EXTEND)
8740 return SDValue();
8741
8742 FMAOp1 = FMAOp1.getOperand(0);
8743 FMAOp2 = FMAOp2.getOperand(0);
8744 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8745 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8746 return SDValue();
8747
8748 SDValue Vec3 = FMAOp1.getOperand(0);
8749 SDValue Vec4 = FMAOp2.getOperand(0);
8750 SDValue Idx2 = FMAOp1.getOperand(1);
8751
8752 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8753 // Idx1 and Idx2 cannot be the same.
8754 Idx1 == Idx2)
8755 return SDValue();
8756
8757 if (Vec1 == Vec2 || Vec3 == Vec4)
8758 return SDValue();
8759
8760 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8761 return SDValue();
8762
8763 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00008764 (Vec1 == Vec4 && Vec2 == Vec3)) {
8765 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8766 DAG.getTargetConstant(0, SL, MVT::i1));
8767 }
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008768 }
8769 return SDValue();
8770}
8771
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008772SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8773 DAGCombinerInfo &DCI) const {
8774 SelectionDAG &DAG = DCI.DAG;
8775 SDLoc SL(N);
8776
8777 SDValue LHS = N->getOperand(0);
8778 SDValue RHS = N->getOperand(1);
8779 EVT VT = LHS.getValueType();
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00008780 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8781
8782 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8783 if (!CRHS) {
8784 CRHS = dyn_cast<ConstantSDNode>(LHS);
8785 if (CRHS) {
8786 std::swap(LHS, RHS);
8787 CC = getSetCCSwappedOperands(CC);
8788 }
8789 }
8790
Stanislav Mekhanoshin3b117942018-06-16 03:46:59 +00008791 if (CRHS) {
8792 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8793 isBoolSGPR(LHS.getOperand(0))) {
8794 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8795 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8796 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8797 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
8798 if ((CRHS->isAllOnesValue() &&
8799 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8800 (CRHS->isNullValue() &&
8801 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8802 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8803 DAG.getConstant(-1, SL, MVT::i1));
8804 if ((CRHS->isAllOnesValue() &&
8805 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8806 (CRHS->isNullValue() &&
8807 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8808 return LHS.getOperand(0);
8809 }
8810
8811 uint64_t CRHSVal = CRHS->getZExtValue();
8812 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8813 LHS.getOpcode() == ISD::SELECT &&
8814 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8815 isa<ConstantSDNode>(LHS.getOperand(2)) &&
8816 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8817 isBoolSGPR(LHS.getOperand(0))) {
8818 // Given CT != FT:
8819 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8820 // setcc (select cc, CT, CF), CF, ne => cc
8821 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8822 // setcc (select cc, CT, CF), CT, eq => cc
8823 uint64_t CT = LHS.getConstantOperandVal(1);
8824 uint64_t CF = LHS.getConstantOperandVal(2);
8825
8826 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8827 (CT == CRHSVal && CC == ISD::SETNE))
8828 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8829 DAG.getConstant(-1, SL, MVT::i1));
8830 if ((CF == CRHSVal && CC == ISD::SETNE) ||
8831 (CT == CRHSVal && CC == ISD::SETEQ))
8832 return LHS.getOperand(0);
8833 }
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00008834 }
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008835
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00008836 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8837 VT != MVT::f16))
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008838 return SDValue();
8839
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008840 // Match isinf/isfinite pattern
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008841 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008842 // (fcmp one (fabs x), inf) -> (fp_class x,
8843 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8844 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008845 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8846 if (!CRHS)
8847 return SDValue();
8848
8849 const APFloat &APF = CRHS->getValueAPF();
8850 if (APF.isInfinity() && !APF.isNegative()) {
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008851 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8852 SIInstrFlags::N_INFINITY;
8853 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8854 SIInstrFlags::P_ZERO |
8855 SIInstrFlags::N_NORMAL |
8856 SIInstrFlags::P_NORMAL |
8857 SIInstrFlags::N_SUBNORMAL |
8858 SIInstrFlags::P_SUBNORMAL;
8859 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00008860 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8861 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008862 }
8863 }
8864
8865 return SDValue();
8866}
8867
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008868SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8869 DAGCombinerInfo &DCI) const {
8870 SelectionDAG &DAG = DCI.DAG;
8871 SDLoc SL(N);
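  // Offset is which byte of the source (0-3) the original CVT_F32_UBYTEn node
  // converts.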
8872 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8873
8874 SDValue Src = N->getOperand(0);
8875 SDValue Srl = N->getOperand(0);
8876 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8877 Srl = Srl.getOperand(0);
8878
8879 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8880 if (Srl.getOpcode() == ISD::SRL) {
8881 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8882 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8883 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8884
8885 if (const ConstantSDNode *C =
8886 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8887 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8888 EVT(MVT::i32));
8889
8890 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8891 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8892 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8893 MVT::f32, Srl);
8894 }
8895 }
8896 }
8897
8898 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8899
Craig Topperd0af7e82017-04-28 05:31:46 +00008900 KnownBits Known;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008901 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8902 !DCI.isBeforeLegalizeOps());
8903 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Stanislav Mekhanoshined0d6c62019-01-09 02:24:22 +00008904 if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008905 DCI.CommitTargetLoweringOpt(TLO);
8906 }
8907
8908 return SDValue();
8909}
8910
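// Constant-fold AMDGPUISD::CLAMP of an FP constant: values below 0.0 (and NaN
// when DX10 clamp is enabled) fold to 0.0, values above 1.0 fold to 1.0, and
// anything else is returned unchanged.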
Tom Stellard1b95fed2018-05-24 05:28:34 +00008911SDValue SITargetLowering::performClampCombine(SDNode *N,
8912 DAGCombinerInfo &DCI) const {
8913 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8914 if (!CSrc)
8915 return SDValue();
8916
8917 const APFloat &F = CSrc->getValueAPF();
8918 APFloat Zero = APFloat::getZero(F.getSemantics());
8919 APFloat::cmpResult Cmp0 = F.compare(Zero);
8920 if (Cmp0 == APFloat::cmpLessThan ||
8921 (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8922 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8923 }
8924
8925 APFloat One(F.getSemantics(), "1.0");
8926 APFloat::cmpResult Cmp1 = F.compare(One);
8927 if (Cmp1 == APFloat::cmpGreaterThan)
8928 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8929
8930 return SDValue(CSrc, 0);
8931}
8932
8933
Tom Stellard75aadc22012-12-11 21:25:42 +00008934SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8935 DAGCombinerInfo &DCI) const {
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00008936 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
8937 return SDValue();
8938
Tom Stellard75aadc22012-12-11 21:25:42 +00008939 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00008940 default:
8941 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008942 case ISD::ADD:
8943 return performAddCombine(N, DCI);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008944 case ISD::SUB:
8945 return performSubCombine(N, DCI);
8946 case ISD::ADDCARRY:
8947 case ISD::SUBCARRY:
8948 return performAddCarrySubCarryCombine(N, DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008949 case ISD::FADD:
8950 return performFAddCombine(N, DCI);
8951 case ISD::FSUB:
8952 return performFSubCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008953 case ISD::SETCC:
8954 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00008955 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008956 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008957 case ISD::FMAXNUM_IEEE:
8958 case ISD::FMINNUM_IEEE:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008959 case ISD::SMAX:
8960 case ISD::SMIN:
8961 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00008962 case ISD::UMIN:
8963 case AMDGPUISD::FMIN_LEGACY:
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00008964 case AMDGPUISD::FMAX_LEGACY:
8965 return performMinMaxCombine(N, DCI);
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008966 case ISD::FMA:
8967 return performFMACombine(N, DCI);
Matt Arsenault90083d32018-06-07 09:54:49 +00008968 case ISD::LOAD: {
8969 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
8970 return Widened;
8971 LLVM_FALLTHROUGH;
8972 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00008973 case ISD::STORE:
8974 case ISD::ATOMIC_LOAD:
8975 case ISD::ATOMIC_STORE:
8976 case ISD::ATOMIC_CMP_SWAP:
8977 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8978 case ISD::ATOMIC_SWAP:
8979 case ISD::ATOMIC_LOAD_ADD:
8980 case ISD::ATOMIC_LOAD_SUB:
8981 case ISD::ATOMIC_LOAD_AND:
8982 case ISD::ATOMIC_LOAD_OR:
8983 case ISD::ATOMIC_LOAD_XOR:
8984 case ISD::ATOMIC_LOAD_NAND:
8985 case ISD::ATOMIC_LOAD_MIN:
8986 case ISD::ATOMIC_LOAD_MAX:
8987 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00008988 case ISD::ATOMIC_LOAD_UMAX:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00008989 case ISD::ATOMIC_LOAD_FADD:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00008990 case AMDGPUISD::ATOMIC_INC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00008991 case AMDGPUISD::ATOMIC_DEC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00008992 case AMDGPUISD::ATOMIC_LOAD_FMIN:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00008993 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00008994 if (DCI.isBeforeLegalize())
8995 break;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008996 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00008997 case ISD::AND:
8998 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00008999 case ISD::OR:
9000 return performOrCombine(N, DCI);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00009001 case ISD::XOR:
9002 return performXorCombine(N, DCI);
Matt Arsenault8edfaee2017-03-31 19:53:03 +00009003 case ISD::ZERO_EXTEND:
9004 return performZeroExtendCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009005 case AMDGPUISD::FP_CLASS:
9006 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00009007 case ISD::FCANONICALIZE:
9008 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009009 case AMDGPUISD::RCP:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009010 return performRcpCombine(N, DCI);
9011 case AMDGPUISD::FRACT:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009012 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00009013 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009014 case AMDGPUISD::RSQ_LEGACY:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009015 case AMDGPUISD::RCP_IFLAG:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009016 case AMDGPUISD::RSQ_CLAMP:
9017 case AMDGPUISD::LDEXP: {
9018 SDValue Src = N->getOperand(0);
9019 if (Src.isUndef())
9020 return Src;
9021 break;
9022 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009023 case ISD::SINT_TO_FP:
9024 case ISD::UINT_TO_FP:
9025 return performUCharToFloatCombine(N, DCI);
9026 case AMDGPUISD::CVT_F32_UBYTE0:
9027 case AMDGPUISD::CVT_F32_UBYTE1:
9028 case AMDGPUISD::CVT_F32_UBYTE2:
9029 case AMDGPUISD::CVT_F32_UBYTE3:
9030 return performCvtF32UByteNCombine(N, DCI);
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00009031 case AMDGPUISD::FMED3:
9032 return performFMed3Combine(N, DCI);
Matt Arsenault1f17c662017-02-22 00:27:34 +00009033 case AMDGPUISD::CVT_PKRTZ_F16_F32:
9034 return performCvtPkRTZCombine(N, DCI);
Tom Stellard1b95fed2018-05-24 05:28:34 +00009035 case AMDGPUISD::CLAMP:
9036 return performClampCombine(N, DCI);
Matt Arsenaulteb522e62017-02-27 22:15:25 +00009037 case ISD::SCALAR_TO_VECTOR: {
9038 SelectionDAG &DAG = DCI.DAG;
9039 EVT VT = N->getValueType(0);
9040
9041 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9042 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9043 SDLoc SL(N);
9044 SDValue Src = N->getOperand(0);
9045 EVT EltVT = Src.getValueType();
9046 if (EltVT == MVT::f16)
9047 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9048
9049 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9050 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9051 }
9052
9053 break;
9054 }
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00009055 case ISD::EXTRACT_VECTOR_ELT:
9056 return performExtractVectorEltCombine(N, DCI);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00009057 case ISD::INSERT_VECTOR_ELT:
9058 return performInsertVectorEltCombine(N, DCI);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009059 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00009060 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00009061}
Christian Konigd910b7d2013-02-26 17:52:16 +00009062
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009063/// Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00009064static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009065 switch (Idx) {
9066 default: return 0;
9067 case AMDGPU::sub0: return 0;
9068 case AMDGPU::sub1: return 1;
9069 case AMDGPU::sub2: return 2;
9070 case AMDGPU::sub3: return 3;
David Stuttardf77079f2019-01-14 11:55:24 +00009071 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
Christian Konig8e06e2a2013-04-10 08:39:08 +00009072 }
9073}
9074
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009075/// Adjust the writemask of MIMG instructions
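/// For example (an illustrative sketch, not a specific test): a load whose
/// dmask requests four channels but whose users only extract sub0 and sub2 can
/// be switched to the equivalent two-channel opcode with dmask 0x5.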
Matt Arsenault68f05052017-12-04 22:18:27 +00009076SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9077 SelectionDAG &DAG) const {
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009078 unsigned Opcode = Node->getMachineOpcode();
9079
9080 // Subtract 1 because the vdata output is not a MachineSDNode operand.
9081 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9082 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9083 return Node; // not implemented for D16
9084
David Stuttardf77079f2019-01-14 11:55:24 +00009085 SDNode *Users[5] = { nullptr };
Tom Stellard54774e52013-10-23 02:53:47 +00009086 unsigned Lane = 0;
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009087 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009088 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00009089 unsigned NewDmask = 0;
David Stuttardf77079f2019-01-14 11:55:24 +00009090 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9091 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9092 bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9093 Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
9094 unsigned TFCLane = 0;
Matt Arsenault856777d2017-12-08 20:00:57 +00009095 bool HasChain = Node->getNumValues() > 1;
9096
9097 if (OldDmask == 0) {
9098 // These are folded out, but on the chance it happens don't assert.
9099 return Node;
9100 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009101
David Stuttardf77079f2019-01-14 11:55:24 +00009102 unsigned OldBitsSet = countPopulation(OldDmask);
9103 // Work out which is the TFE/LWE lane if that is enabled.
9104 if (UsesTFC) {
9105 TFCLane = OldBitsSet;
9106 }
9107
Christian Konig8e06e2a2013-04-10 08:39:08 +00009108 // Try to figure out the used register components
9109 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9110 I != E; ++I) {
9111
Matt Arsenault93e65ea2017-02-22 21:16:41 +00009112 // Don't look at users of the chain.
9113 if (I.getUse().getResNo() != 0)
9114 continue;
9115
Christian Konig8e06e2a2013-04-10 08:39:08 +00009116 // Abort if we can't understand the usage
9117 if (!I->isMachineOpcode() ||
9118 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
Matt Arsenault68f05052017-12-04 22:18:27 +00009119 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009120
Francis Visoiu Mistrih9d7bb0c2017-11-28 17:15:09 +00009121 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
Tom Stellard54774e52013-10-23 02:53:47 +00009122 // Note that subregs are packed, i.e. Lane==0 is the first bit set
9123 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9124 // set, etc.
Christian Konig8b1ed282013-04-10 08:39:16 +00009125 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00009126
David Stuttardf77079f2019-01-14 11:55:24 +00009127 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9128 if (UsesTFC && Lane == TFCLane) {
9129 Users[Lane] = *I;
9130 } else {
9131 // Set which texture component corresponds to the lane.
9132 unsigned Comp;
9133 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9134 Comp = countTrailingZeros(Dmask);
9135 Dmask &= ~(1 << Comp);
9136 }
9137
9138 // Abort if we have more than one user per component.
9139 if (Users[Lane])
9140 return Node;
9141
9142 Users[Lane] = *I;
9143 NewDmask |= 1 << Comp;
Tom Stellard54774e52013-10-23 02:53:47 +00009144 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009145 }
9146
David Stuttardf77079f2019-01-14 11:55:24 +00009147 // Don't allow 0 dmask, as hardware assumes one channel enabled.
9148 bool NoChannels = !NewDmask;
9149 if (NoChannels) {
9150 // If the original dmask has one channel - then nothing to do
9151 if (OldBitsSet == 1)
9152 return Node;
9153 // Use an arbitrary dmask - required for the instruction to work
9154 NewDmask = 1;
9155 }
Tom Stellard54774e52013-10-23 02:53:47 +00009156 // Abort if there's no change
9157 if (NewDmask == OldDmask)
Matt Arsenault68f05052017-12-04 22:18:27 +00009158 return Node;
9159
9160 unsigned BitsSet = countPopulation(NewDmask);
9161
David Stuttardf77079f2019-01-14 11:55:24 +00009162 // Check for TFE or LWE - increase the number of channels by one to account
9163 // for the extra return value.
9164 // This will need adjustment for D16 if it is also included in
9165 // adjustWriteMask (this function), but at present D16 is excluded.
9166 unsigned NewChannels = BitsSet + UsesTFC;
9167
9168 int NewOpcode =
9169 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
Matt Arsenault68f05052017-12-04 22:18:27 +00009170 assert(NewOpcode != -1 &&
9171 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9172 "failed to find equivalent MIMG op");
Christian Konig8e06e2a2013-04-10 08:39:08 +00009173
9174 // Adjust the writemask in the node
Matt Arsenault68f05052017-12-04 22:18:27 +00009175 SmallVector<SDValue, 12> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009176 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009177 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009178 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Christian Konig8e06e2a2013-04-10 08:39:08 +00009179
Matt Arsenault68f05052017-12-04 22:18:27 +00009180 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9181
David Stuttardf77079f2019-01-14 11:55:24 +00009182 MVT ResultVT = NewChannels == 1 ?
9183 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9184 NewChannels == 5 ? 8 : NewChannels);
Matt Arsenault856777d2017-12-08 20:00:57 +00009185 SDVTList NewVTList = HasChain ?
9186 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9187
Matt Arsenault68f05052017-12-04 22:18:27 +00009188
9189 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9190 NewVTList, Ops);
Matt Arsenaultecad0d532017-12-08 20:00:45 +00009191
Matt Arsenault856777d2017-12-08 20:00:57 +00009192 if (HasChain) {
9193 // Update chain.
Chandler Carruth66654b72018-08-14 23:30:32 +00009194 DAG.setNodeMemRefs(NewNode, Node->memoperands());
Matt Arsenault856777d2017-12-08 20:00:57 +00009195 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9196 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009197
David Stuttardf77079f2019-01-14 11:55:24 +00009198 if (NewChannels == 1) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009199 assert(Node->hasNUsesOfValue(1, 0));
9200 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9201 SDLoc(Node), Users[Lane]->getValueType(0),
9202 SDValue(NewNode, 0));
Christian Konig8b1ed282013-04-10 08:39:16 +00009203 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
Matt Arsenault68f05052017-12-04 22:18:27 +00009204 return nullptr;
Christian Konig8b1ed282013-04-10 08:39:16 +00009205 }
9206
Christian Konig8e06e2a2013-04-10 08:39:08 +00009207 // Update the users of the node with the new indices
David Stuttardf77079f2019-01-14 11:55:24 +00009208 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009209 SDNode *User = Users[i];
David Stuttardf77079f2019-01-14 11:55:24 +00009210 if (!User) {
9211 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9212 // Users[0] is still nullptr because channel 0 doesn't really have a use.
9213 if (i || !NoChannels)
9214 continue;
9215 } else {
9216 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9217 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9218 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009219
9220 switch (Idx) {
9221 default: break;
9222 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9223 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9224 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
David Stuttardf77079f2019-01-14 11:55:24 +00009225 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009226 }
9227 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009228
9229 DAG.RemoveDeadNode(Node);
9230 return nullptr;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009231}
9232
Tom Stellardc98ee202015-07-16 19:40:07 +00009233static bool isFrameIndexOp(SDValue Op) {
9234 if (Op.getOpcode() == ISD::AssertZext)
9235 Op = Op.getOperand(0);
9236
9237 return isa<FrameIndexSDNode>(Op);
9238}
9239
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009240/// Legalize target independent instructions (e.g. INSERT_SUBREG)
Tom Stellard3457a842014-10-09 19:06:00 +00009241/// with frame index operands.
9242/// LLVM assumes that inputs to these instructions are registers.
Matt Arsenault0d0d6c22017-04-12 21:58:23 +00009243SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9244 SelectionDAG &DAG) const {
9245 if (Node->getOpcode() == ISD::CopyToReg) {
9246 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9247 SDValue SrcVal = Node->getOperand(2);
9248
9249 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9250 // to try understanding copies to physical registers.
9251 if (SrcVal.getValueType() == MVT::i1 &&
9252 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9253 SDLoc SL(Node);
9254 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9255 SDValue VReg = DAG.getRegister(
9256 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9257
9258 SDNode *Glued = Node->getGluedNode();
9259 SDValue ToVReg
9260 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9261 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9262 SDValue ToResultReg
9263 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9264 VReg, ToVReg.getValue(1));
9265 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9266 DAG.RemoveDeadNode(Node);
9267 return ToResultReg.getNode();
9268 }
9269 }
Tom Stellard8dd392e2014-10-09 18:09:15 +00009270
9271 SmallVector<SDValue, 8> Ops;
Tom Stellard3457a842014-10-09 19:06:00 +00009272 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00009273 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00009274 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009275 continue;
9276 }
9277
Tom Stellard3457a842014-10-09 19:06:00 +00009278 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009279 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00009280 Node->getOperand(i).getValueType(),
9281 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009282 }
9283
Mark Searles4e3d6162017-10-16 23:38:53 +00009284 return DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009285}
9286
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009287/// Fold the instructions after selecting them.
Matt Arsenault68f05052017-12-04 22:18:27 +00009288/// Returns null if users were already updated.
Christian Konig8e06e2a2013-04-10 08:39:08 +00009289SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9290 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009291 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009292 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00009293
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00009294 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009295 !TII->isGather4(Opcode)) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009296 return adjustWritemask(Node, DAG);
9297 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009298
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009299 if (Opcode == AMDGPU::INSERT_SUBREG ||
9300 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00009301 legalizeTargetIndependentNode(Node, DAG);
9302 return Node;
9303 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009304
9305 switch (Opcode) {
9306 case AMDGPU::V_DIV_SCALE_F32:
9307 case AMDGPU::V_DIV_SCALE_F64: {
9308 // Satisfy the operand register constraint when one of the inputs is
9309 // undefined. Ordinarily each undef value will have its own implicit_def of
9310 // a vreg, so force these to use a single register.
9311 SDValue Src0 = Node->getOperand(0);
9312 SDValue Src1 = Node->getOperand(1);
9313 SDValue Src2 = Node->getOperand(2);
9314
9315 if ((Src0.isMachineOpcode() &&
9316 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9317 (Src0 == Src1 || Src0 == Src2))
9318 break;
9319
9320 MVT VT = Src0.getValueType().getSimpleVT();
9321 const TargetRegisterClass *RC = getRegClassFor(VT);
9322
9323 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9324 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9325
9326 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9327 UndefReg, Src0, SDValue());
9328
9329 // src0 must be the same register as src1 or src2, even if the value is
9330 // undefined, so make sure we don't violate this constraint.
9331 if (Src0.isMachineOpcode() &&
9332 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9333 if (Src1.isMachineOpcode() &&
9334 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9335 Src0 = Src1;
9336 else if (Src2.isMachineOpcode() &&
9337 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9338 Src0 = Src2;
9339 else {
9340 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9341 Src0 = UndefReg;
9342 Src1 = UndefReg;
9343 }
9344 } else
9345 break;
9346
9347 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9348 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9349 Ops.push_back(Node->getOperand(I));
9350
9351 Ops.push_back(ImpDef.getValue(1));
9352 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9353 }
9354 default:
9355 break;
9356 }
9357
Tom Stellard654d6692015-01-08 15:08:17 +00009358 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009359}
Christian Konig8b1ed282013-04-10 08:39:16 +00009360
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009361/// Assign the register class depending on the number of
Christian Konig8b1ed282013-04-10 08:39:16 +00009362/// bits set in the writemask
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009363void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00009364 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009365 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009366
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009367 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009368
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009369 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009370 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009371 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009372 return;
9373 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00009374
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009375 // Replace unused atomics with the no return version.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009376 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009377 if (NoRetAtomicOp != -1) {
9378 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009379 MI.setDesc(TII->get(NoRetAtomicOp));
9380 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009381 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009382 }
9383
Tom Stellard354a43c2016-04-01 18:27:37 +00009384 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9385 // instruction, because the return type of these instructions is a vec2 of
9386 // the memory type, so it can be tied to the input operand.
9387 // This means these instructions always have a use, so we need to add a
9388 // special case to check if the atomic has only one extract_subreg use,
9389 // which itself has no uses.
9390 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00009391 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00009392 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9393 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009394 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00009395
9396 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009397 MI.setDesc(TII->get(NoRetAtomicOp));
9398 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009399
9400 // If we only remove the def operand from the atomic instruction, the
9401 // extract_subreg will be left with a use of a vreg without a def.
9402 // So we need to insert an implicit_def to avoid machine verifier
9403 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009404 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00009405 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9406 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009407 return;
9408 }
Christian Konig8b1ed282013-04-10 08:39:16 +00009409}
Tom Stellard0518ff82013-06-03 17:39:58 +00009410
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009411static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9412 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009413 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00009414 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9415}
9416
9417MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009418 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00009419 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009420 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00009421
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009422 // Build the half of the subregister with the constants before building the
9423 // full 128-bit register. If we are building multiple resource descriptors,
9424 // this will allow CSEing of the 2-component register.
9425 const SDValue Ops0[] = {
9426 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9427 buildSMovImm32(DAG, DL, 0),
9428 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9429 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9430 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9431 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009432
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009433 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9434 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +00009435
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009436 // Combine the constants and the pointer.
9437 const SDValue Ops1[] = {
9438 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9439 Ptr,
9440 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9441 SubRegHi,
9442 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9443 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009444
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009445 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +00009446}
9447
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009448/// Return a resource descriptor with the 'Add TID' bit enabled
Benjamin Kramerdf005cb2015-08-08 18:27:36 +00009449/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9450/// of the resource descriptor) to create an offset, which is added to
9451/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009452MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9453 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009454 uint64_t RsrcDword2And3) const {
9455 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9456 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9457 if (RsrcDword1) {
9458 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009459 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9460 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009461 }
9462
9463 SDValue DataLo = buildSMovImm32(DAG, DL,
9464 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9465 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9466
9467 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009468 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009469 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009470 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009471 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009472 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009473 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009474 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009475 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009476 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009477 };
9478
9479 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9480}
9481
Tom Stellardd7e6f132015-04-08 01:09:26 +00009482//===----------------------------------------------------------------------===//
9483// SI Inline Assembly Support
9484//===----------------------------------------------------------------------===//
9485
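// Map an inline-asm register constraint letter to a register class: 's' (and
// 'r') select scalar registers and 'v' selects vector registers, sized by the
// value type. An illustrative use from C (a sketch, not from this source):
//   asm("v_mov_b32 %0, %1" : "=v"(dst) : "s"(src));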
9486std::pair<unsigned, const TargetRegisterClass *>
9487SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Benjamin Kramer9bfb6272015-07-05 19:29:18 +00009488 StringRef Constraint,
Tom Stellardd7e6f132015-04-08 01:09:26 +00009489 MVT VT) const {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009490 const TargetRegisterClass *RC = nullptr;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009491 if (Constraint.size() == 1) {
9492 switch (Constraint[0]) {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009493 default:
9494 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009495 case 's':
9496 case 'r':
9497 switch (VT.getSizeInBits()) {
9498 default:
9499 return std::make_pair(0U, nullptr);
9500 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009501 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009502 RC = &AMDGPU::SReg_32_XM0RegClass;
9503 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009504 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009505 RC = &AMDGPU::SGPR_64RegClass;
9506 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009507 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009508 RC = &AMDGPU::SReg_128RegClass;
9509 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009510 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009511 RC = &AMDGPU::SReg_256RegClass;
9512 break;
Matt Arsenaulte0bf7d02017-02-21 19:12:08 +00009513 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009514 RC = &AMDGPU::SReg_512RegClass;
9515 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009516 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009517 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009518 case 'v':
9519 switch (VT.getSizeInBits()) {
9520 default:
9521 return std::make_pair(0U, nullptr);
9522 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009523 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009524 RC = &AMDGPU::VGPR_32RegClass;
9525 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009526 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009527 RC = &AMDGPU::VReg_64RegClass;
9528 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009529 case 96:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009530 RC = &AMDGPU::VReg_96RegClass;
9531 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009532 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009533 RC = &AMDGPU::VReg_128RegClass;
9534 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009535 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009536 RC = &AMDGPU::VReg_256RegClass;
9537 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009538 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009539 RC = &AMDGPU::VReg_512RegClass;
9540 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009541 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009542 break;
Tom Stellardd7e6f132015-04-08 01:09:26 +00009543 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009544 // i128, i16 and f16 are supported as inline asm parameters even though
 9545 // they are not reported as legal types.
9546 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9547 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9548 return std::make_pair(0U, RC);
Tom Stellardd7e6f132015-04-08 01:09:26 +00009549 }
9550
9551 if (Constraint.size() > 1) {
Tom Stellardd7e6f132015-04-08 01:09:26 +00009552 if (Constraint[1] == 'v') {
9553 RC = &AMDGPU::VGPR_32RegClass;
9554 } else if (Constraint[1] == 's') {
9555 RC = &AMDGPU::SGPR_32RegClass;
9556 }
9557
9558 if (RC) {
Matt Arsenault0b554ed2015-06-23 02:05:55 +00009559 uint32_t Idx;
9560 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9561 if (!Failed && Idx < RC->getNumRegs())
Tom Stellardd7e6f132015-04-08 01:09:26 +00009562 return std::make_pair(RC->getRegister(Idx), RC);
9563 }
9564 }
9565 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9566}
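
// Illustrative use of the constraints handled above (a sketch; the IR below is
// an example, not taken from this file):
//   %v = call i32 asm "v_mov_b32 $0, $1", "=v,s"(i32 %x)
// selects a VGPR for the result and an SGPR for the input, while a longer
// constraint that names a particular register (the Constraint.size() > 1 path)
// resolves directly to that register when the trailing index parses and is in
// range for the register class.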
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009567
9568SITargetLowering::ConstraintType
9569SITargetLowering::getConstraintType(StringRef Constraint) const {
9570 if (Constraint.size() == 1) {
9571 switch (Constraint[0]) {
9572 default: break;
9573 case 's':
9574 case 'v':
9575 return C_RegisterClass;
9576 }
9577 }
9578 return TargetLowering::getConstraintType(Constraint);
9579}
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009580
9581// Figure out which registers should be reserved for stack access. Only after
9582// the function is legalized do we know all of the non-spill stack objects or if
9583// calls are present.
9584void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9585 MachineRegisterInfo &MRI = MF.getRegInfo();
9586 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9587 const MachineFrameInfo &MFI = MF.getFrameInfo();
Tom Stellardc5a154d2018-06-28 23:47:12 +00009588 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009589
9590 if (Info->isEntryFunction()) {
9591 // Callable functions have fixed registers used for stack access.
9592 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9593 }
9594
9595 // We have to assume the SP is needed in case there are calls in the
9596 // function, since calls are only detected after the function is lowered.
9597 // We're about to reserve registers, so don't reserve the SP unless we are
9598 // really going to use it.
9599 bool NeedSP = !Info->isEntryFunction() ||
9600 MFI.hasVarSizedObjects() ||
9601 MFI.hasCalls();
9602
9603 if (NeedSP) {
9604 unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9605 Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9606
9607 assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9608 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9609 Info->getStackPtrOffsetReg()));
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +00009610 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
9611 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009612 }
9613
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +00009614 // Guard against replacing the default register with itself, which can
 9615 // happen for MIR testcases that are missing the MFI.
9616 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
9617 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9618
9619 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
9620 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9621
9622 if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
9623 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9624 Info->getScratchWaveOffsetReg());
9625 }
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009626
Stanislav Mekhanoshind4b500c2018-05-31 05:36:04 +00009627 Info->limitOccupancy(MF);
9628
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009629 TargetLoweringBase::finalizeLowering(MF);
9630}
Matt Arsenault45b98182017-11-15 00:45:43 +00009631
9632void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9633 KnownBits &Known,
9634 const APInt &DemandedElts,
9635 const SelectionDAG &DAG,
9636 unsigned Depth) const {
9637 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9638 DAG, Depth);
9639
9640 if (getSubtarget()->enableHugePrivateBuffer())
9641 return;
9642
9643 // Technically it may be possible to have a dispatch with a single workitem
9644 // that uses the full private memory size, but that's not really useful. We
9645 // can't use vaddr in MUBUF instructions if we don't know the address
9646 // calculation won't overflow, so assume the sign bit is never set.
9647 Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9648}
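
// For illustration of the assumption above: marking the top
// AssumeFrameIndexHighZeroBits bits as known zero treats a frame-index address
// as a small non-negative offset, so later combines can fold it into a MUBUF
// vaddr without having to prove that the address computation cannot overflow
// or set the sign bit.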
Tom Stellard264c1712018-06-13 15:06:37 +00009649
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +00009650LLVM_ATTRIBUTE_UNUSED
9651static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
9652 assert(N->getOpcode() == ISD::CopyFromReg);
9653 do {
9654 // Follow the chain until we find an INLINEASM node.
9655 N = N->getOperand(0).getNode();
Craig Topper784929d2019-02-08 20:48:56 +00009656 if (N->getOpcode() == ISD::INLINEASM ||
9657 N->getOpcode() == ISD::INLINEASM_BR)
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +00009658 return true;
9659 } while (N->getOpcode() == ISD::CopyFromReg);
9660 return false;
9661}
9662
Tom Stellard264c1712018-06-13 15:06:37 +00009663bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
Nicolai Haehnle35617ed2018-08-30 14:21:36 +00009664 FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
Tom Stellard264c1712018-06-13 15:06:37 +00009665{
9666 switch (N->getOpcode()) {
Tom Stellard264c1712018-06-13 15:06:37 +00009667 case ISD::CopyFromReg:
9668 {
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +00009669 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
9670 const MachineFunction * MF = FLI->MF;
9671 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
9672 const MachineRegisterInfo &MRI = MF->getRegInfo();
9673 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
9674 unsigned Reg = R->getReg();
9675 if (TRI.isPhysicalRegister(Reg))
9676 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +00009677
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +00009678 if (MRI.isLiveIn(Reg)) {
9679 // VGPR live-ins such as workitem.id.x/y/z, and any other VGPR formal
9680 // argument, are considered divergent.
9681 if (!TRI.isSGPRReg(MRI, Reg))
9682 return true;
9683 // Formal arguments of non-entry functions
9684 // are conservatively considered divergent
9685 else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
9686 return true;
9687 return false;
Tom Stellard264c1712018-06-13 15:06:37 +00009688 }
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +00009689 const Value *V = FLI->getValueFromVirtualReg(Reg);
9690 if (V)
9691 return KDA->isDivergent(V);
9692 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
9693 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +00009694 }
9696 case ISD::LOAD: {
Matt Arsenault813613c2018-09-04 18:58:19 +00009697 const LoadSDNode *L = cast<LoadSDNode>(N);
9698 unsigned AS = L->getAddressSpace();
9699 // A flat load may access private memory.
9700 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
Tom Stellard264c1712018-06-13 15:06:37 +00009701 }
9702 case ISD::CALLSEQ_END:
9703 return true;
9705 case ISD::INTRINSIC_WO_CHAIN:
9709 return AMDGPU::isIntrinsicSourceOfDivergence(
9710 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
9711 case ISD::INTRINSIC_W_CHAIN:
9712 return AMDGPU::isIntrinsicSourceOfDivergence(
9713 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
9714 // Intrinsics that are a source of divergence may already have been lowered
9715 // to AMDGPUISD nodes, so check those as well.
9716 case AMDGPUISD::INTERP_MOV:
9717 case AMDGPUISD::INTERP_P1:
9718 case AMDGPUISD::INTERP_P2:
9719 return true;
9720 }
9721 return false;
9722}
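
// Rough summary of the cases handled above (illustrative, not exhaustive):
//   - CopyFromReg of a VGPR (e.g. the workitem id live-ins)  -> divergent
//   - CopyFromReg of an SGPR                                  -> uniform
//   - loads from the flat or private address spaces           -> divergent
//   - CALLSEQ_END and the INTERP_* nodes                      -> divergent
//   - intrinsics defer to AMDGPU::isIntrinsicSourceOfDivergence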
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00009723
9724bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
9725 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
9726 case MVT::f32:
9727 return Subtarget->hasFP32Denormals();
9728 case MVT::f64:
9729 return Subtarget->hasFP64Denormals();
9730 case MVT::f16:
9731 return Subtarget->hasFP16Denormals();
9732 default:
9733 return false;
9734 }
9735}
Matt Arsenault687ec752018-10-22 16:27:27 +00009736
9737bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
9738 const SelectionDAG &DAG,
9739 bool SNaN,
9740 unsigned Depth) const {
9741 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
9742 if (Subtarget->enableDX10Clamp())
9743 return true; // Clamped to 0.
9744 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
9745 }
9746
9747 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
9748 SNaN, Depth);
9749}
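
// A small worked example of the clamp reasoning above: with DX10 clamp mode
// enabled the hardware clamps a NaN input to 0.0, so AMDGPUISD::CLAMP can
// never produce a NaN regardless of its operand; otherwise the result is only
// known NaN-free when the clamped operand itself is known not to be a NaN.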
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009750
9751TargetLowering::AtomicExpansionKind
9752SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
9753 switch (RMW->getOperation()) {
9754 case AtomicRMWInst::FAdd: {
9755 Type *Ty = RMW->getType();
9756
9757 // We don't have a way to support 16-bit atomics now, so just leave them
9758 // as-is.
9759 if (Ty->isHalfTy())
9760 return AtomicExpansionKind::None;
9761
9762 if (!Ty->isFloatTy())
9763 return AtomicExpansionKind::CmpXChg;
9764
9765 // TODO: FP atomics also exist for flat; older targets had them for buffers too.
9766 unsigned AS = RMW->getPointerAddressSpace();
9767 return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
9768 AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
9769 }
9770 default:
9771 break;
9772 }
9773
9774 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
9775}
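
// Illustrative IR for the decision above (typed-pointer syntax of this period;
// the value names are examples only): with LDS FP atomics available,
//   %old = atomicrmw fadd float addrspace(3)* %ptr, float %val monotonic
// is kept as a native atomic (AtomicExpansionKind::None), while the same
// operation on a flat or global pointer is expanded to a compare-and-swap loop
// (AtomicExpansionKind::CmpXChg) by the generic AtomicExpand pass.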