//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
  "amdgpu-frame-index-zero-bits",
  cl::desc("High bits of frame index assumed to be zero"),
  cl::init(5),
  cl::ReallyHidden);

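// Return the first SGPR in the SGPR_32 register class that the calling
// convention state has not yet allocated.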
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, not all operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

#if 0
  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);


  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);


  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3)
  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this is OK to use when denormals are
// enabled, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  // TODO: Consider splitting all arguments into 32-bit pieces.
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size == 64)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size == 64)
      return 2 * NumElts;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (VT.getVectorNumElements() + 1) / 2;
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size == 64) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = 2 * NumElts;
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
802 // support, but unless we can properly handle 3-vectors, it will be still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");


  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
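  // Round the element count up to a power of two, leaving room for the
  // trailing i32 member of the aggregate (which needs two extra slots when
  // the elements are 16-bit).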
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
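  // Buffer and image resource intrinsics are described by the RsrcIntrinsic
  // table; derive the memory operand info from the intrinsic's attributes and
  // its rsrc argument.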
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
      Info.align = 0;
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.

  // Just r + i
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
988 // Assume the we will use FLAT for all global memory accesses
    // on VI.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffer < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM);
  }

  return isLegalMUBUFAddressingMode(AM);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}

Mehdi Amini0cdec1e2015-07-09 02:09:40 +00001038bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1039 const AddrMode &AM, Type *Ty,
Jonas Paulsson024e3192017-07-21 11:59:37 +00001040 unsigned AS, Instruction *I) const {
Matt Arsenault5015a892014-08-15 17:17:07 +00001041 // No global is ever allowed as a base.
1042 if (AM.BaseGV)
1043 return false;
1044
Matt Arsenault0da63502018-08-31 05:49:54 +00001045 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001046 return isLegalGlobalAddressingMode(AM);
Matt Arsenault5015a892014-08-15 17:17:07 +00001047
Matt Arsenault0da63502018-08-31 05:49:54 +00001048 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
Neil Henning523dab02019-03-18 14:44:28 +00001049 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1050 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001051 // If the offset isn't a multiple of 4, it probably isn't going to be
1052 // correctly aligned.
Matt Arsenault3cc1e002016-08-13 01:43:51 +00001053 // FIXME: Can we get the real alignment here?
Matt Arsenault711b3902015-08-07 20:18:34 +00001054 if (AM.BaseOffs % 4 != 0)
1055 return isLegalMUBUFAddressingMode(AM);
1056
1057 // There are no SMRD extloads, so if we have to do a small type access we
1058 // will use a MUBUF load.
1059 // FIXME?: We also need to do this if unaligned, but we don't know the
1060 // alignment here.
Stanislav Mekhanoshin57d341c2018-05-15 22:07:51 +00001061 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001062 return isLegalGlobalAddressingMode(AM);
Matt Arsenault711b3902015-08-07 20:18:34 +00001063
Tom Stellard5bfbae52018-07-11 20:59:01 +00001064 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001065 // SMRD instructions have an 8-bit, dword offset on SI.
1066 if (!isUInt<8>(AM.BaseOffs / 4))
1067 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001068 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001069 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1070 // in 8-bits, it can use a smaller encoding.
1071 if (!isUInt<32>(AM.BaseOffs / 4))
1072 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001073 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001074 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1075 if (!isUInt<20>(AM.BaseOffs))
1076 return false;
1077 } else
1078 llvm_unreachable("unhandled generation");
1079
1080 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1081 return true;
1082
1083 if (AM.Scale == 1 && AM.HasBaseReg)
1084 return true;
1085
1086 return false;
Matt Arsenault711b3902015-08-07 20:18:34 +00001087
Matt Arsenault0da63502018-08-31 05:49:54 +00001088 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001089 return isLegalMUBUFAddressingMode(AM);
Matt Arsenault0da63502018-08-31 05:49:54 +00001090 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1091 AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001092 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1093 // field.
1094 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1095 // an 8-bit dword offset but we don't know the alignment here.
1096 if (!isUInt<16>(AM.BaseOffs))
Matt Arsenault5015a892014-08-15 17:17:07 +00001097 return false;
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001098
1099 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1100 return true;
1101
1102 if (AM.Scale == 1 && AM.HasBaseReg)
1103 return true;
1104
Matt Arsenault5015a892014-08-15 17:17:07 +00001105 return false;
Matt Arsenault0da63502018-08-31 05:49:54 +00001106 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1107 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
Matt Arsenault7d1b6c82016-04-29 06:25:10 +00001108 // For an unknown address space, this usually means that this is for some
1109 // reason being used for pure arithmetic, and not based on some addressing
1110 // computation. We don't have instructions that compute pointers with any
1111 // addressing modes, so treat them as having no offset like flat
1112 // instructions.
Tom Stellard70580f82015-07-20 14:28:41 +00001113 return isLegalFlatAddressingMode(AM);
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00001114 } else {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001115 llvm_unreachable("unhandled address space");
1116 }
Matt Arsenault5015a892014-08-15 17:17:07 +00001117}
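
// Illustrative summary of the SMRD immediate-offset limits checked above for
// constant-address queries; not part of the upstream sources. An offset that
// is not dword-aligned falls back to the MUBUF rules, and a sub-dword access
// falls back to the global addressing-mode rules; otherwise:
//
//   SI  (SOUTHERN_ISLANDS): 8-bit dword offset  -> BaseOffs 0..1020, step 4
//   CI  (SEA_ISLANDS):      32-bit dword offset -> BaseOffs up to ~16 GiB
//   VI+ (VOLCANIC_ISLANDS): 20-bit byte offset  -> BaseOffs 0..1048575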
1118
Nirav Dave4dcad5d2017-07-10 20:25:54 +00001119bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1120 const SelectionDAG &DAG) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001121 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001122 return (MemVT.getSizeInBits() <= 4 * 32);
Matt Arsenault0da63502018-08-31 05:49:54 +00001123 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001124 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1125 return (MemVT.getSizeInBits() <= MaxPrivateBits);
Matt Arsenault0da63502018-08-31 05:49:54 +00001126 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001127 return (MemVT.getSizeInBits() <= 2 * 32);
1128 }
1129 return true;
1130}
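
// Illustrative sketch for canMergeStoresTo above; not part of the upstream
// sources. The DAG store merger would be allowed to form, for example:
//
//   global / flat: v4i32 (128 bits) -> ok;  v8i32 (256 bits) -> rejected
//   private:       bounded by 8 * MaxPrivateElementSize, e.g. 128 bits when
//                  the maximum private element size is 16 bytes
//   local (LDS):   v2i32 (64 bits)  -> ok;  v4i32 (128 bits) -> rejected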
1131
Matt Arsenaulte6986632015-01-14 01:35:22 +00001132bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001133 unsigned AddrSpace,
1134 unsigned Align,
1135 bool *IsFast) const {
Matt Arsenault1018c892014-04-24 17:08:26 +00001136 if (IsFast)
1137 *IsFast = false;
1138
Matt Arsenault1018c892014-04-24 17:08:26 +00001139 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1140 // which isn't a simple VT.
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001141 // Until MVT is extended to handle this, simply check for the size and
1142 // rely on the condition below: allow accesses if the size is a multiple of 4.
1143  if (VT == MVT::Other ||
1144      (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)) {
Tom Stellard81d871d2013-11-13 23:36:50 +00001145 return false;
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001146 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001147
Matt Arsenault0da63502018-08-31 05:49:54 +00001148 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1149 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001150 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1151 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1152 // with adjacent offsets.
Sanjay Patelce74db92015-09-03 15:03:19 +00001153 bool AlignedBy4 = (Align % 4 == 0);
1154 if (IsFast)
1155 *IsFast = AlignedBy4;
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001156
Sanjay Patelce74db92015-09-03 15:03:19 +00001157 return AlignedBy4;
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001158 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001159
Tom Stellard64a9d082016-10-14 18:10:39 +00001160 // FIXME: We have to be conservative here and assume that flat operations
1161 // will access scratch. If we had access to the IR function, then we
1162 // could determine if any private memory was used in the function.
1163 if (!Subtarget->hasUnalignedScratchAccess() &&
Matt Arsenault0da63502018-08-31 05:49:54 +00001164 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1165 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
Matt Arsenaultf4320112018-09-24 13:18:15 +00001166 bool AlignedBy4 = Align >= 4;
1167 if (IsFast)
1168 *IsFast = AlignedBy4;
1169
1170 return AlignedBy4;
Tom Stellard64a9d082016-10-14 18:10:39 +00001171 }
1172
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001173 if (Subtarget->hasUnalignedBufferAccess()) {
1174    // If we have a uniform constant load, it still requires using a slow
1175 // buffer instruction if unaligned.
1176 if (IsFast) {
Matt Arsenault0da63502018-08-31 05:49:54 +00001177 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1178 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001179 (Align % 4 == 0) : true;
1180 }
1181
1182 return true;
1183 }
1184
Tom Stellard33e64c62015-02-04 20:49:52 +00001185  // Accesses smaller than a dword must be aligned.
Tom Stellard33e64c62015-02-04 20:49:52 +00001186 if (VT.bitsLT(MVT::i32))
1187 return false;
1188
Matt Arsenault1018c892014-04-24 17:08:26 +00001189 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1190 // byte-address are ignored, thus forcing Dword alignment.
Tom Stellarde812f2f2014-07-21 15:45:06 +00001191 // This applies to private, global, and constant memory.
Matt Arsenault1018c892014-04-24 17:08:26 +00001192 if (IsFast)
1193 *IsFast = true;
Tom Stellardc6b299c2015-02-02 18:02:28 +00001194
1195 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
Tom Stellard0125f2a2013-06-25 02:39:35 +00001196}
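
// Illustrative queries against allowsMisalignedMemoryAccesses above; not part
// of the upstream sources. IsFast is the optional out-parameter:
//
//   (MVT::i64, LOCAL_ADDRESS,   Align = 4) -> true, IsFast  (ds_read2/write2_b32)
//   (MVT::i64, LOCAL_ADDRESS,   Align = 2) -> false         (needs 4-byte alignment)
//   (MVT::i32, PRIVATE_ADDRESS, Align = 2) -> false on subtargets without
//                                             unaligned scratch access
//   (MVT::i16, GLOBAL_ADDRESS,  Align = 1) -> false unless the subtarget has
//                                             unaligned buffer access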
1197
Matt Arsenault46645fa2014-07-28 17:49:26 +00001198EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1199 unsigned SrcAlign, bool IsMemset,
1200 bool ZeroMemset,
1201 bool MemcpyStrSrc,
1202 MachineFunction &MF) const {
1203 // FIXME: Should account for address space here.
1204
1205 // The default fallback uses the private pointer size as a guess for a type to
1206 // use. Make sure we switch these to 64-bit accesses.
1207
1208 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1209 return MVT::v4i32;
1210
1211 if (Size >= 8 && DstAlign >= 4)
1212 return MVT::v2i32;
1213
1214 // Use the default.
1215 return MVT::Other;
1216}
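
// Illustrative sketch for getOptimalMemOpType above; not part of the upstream
// sources. For a memcpy/memset expansion the chosen element type would be:
//
//   Size = 32, DstAlign = 4 -> MVT::v4i32 (16-byte chunks)
//   Size = 8,  DstAlign = 4 -> MVT::v2i32 (8-byte chunks)
//   Size = 8,  DstAlign = 2 -> MVT::Other (defer to the generic heuristic)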
1217
Matt Arsenault0da63502018-08-31 05:49:54 +00001218static bool isFlatGlobalAddrSpace(unsigned AS) {
1219 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1220 AS == AMDGPUAS::FLAT_ADDRESS ||
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001221 AS == AMDGPUAS::CONSTANT_ADDRESS ||
1222 AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001223}
1224
1225bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1226 unsigned DestAS) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001227 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001228}
1229
Alexander Timofeev18009562016-12-08 17:28:47 +00001230bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1231 const MemSDNode *MemNode = cast<MemSDNode>(N);
1232 const Value *Ptr = MemNode->getMemOperand()->getValue();
Matt Arsenault0a0c8712018-03-27 18:39:45 +00001233 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
Alexander Timofeev18009562016-12-08 17:28:47 +00001234 return I && I->getMetadata("amdgpu.noclobber");
1235}
1236
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001237bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1238 unsigned DestAS) const {
1239 // Flat -> private/local is a simple truncate.
1240  // Flat -> global is a no-op.
Matt Arsenault0da63502018-08-31 05:49:54 +00001241 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001242 return true;
1243
1244 return isNoopAddrSpaceCast(SrcAS, DestAS);
1245}
1246
Tom Stellarda6f24c62015-12-15 20:55:55 +00001247bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1248 const MemSDNode *MemNode = cast<MemSDNode>(N);
Tom Stellarda6f24c62015-12-15 20:55:55 +00001249
Matt Arsenaultbcf7bec2018-02-09 16:57:48 +00001250 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
Tom Stellarda6f24c62015-12-15 20:55:55 +00001251}
1252
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001253TargetLoweringBase::LegalizeTypeAction
Craig Topper0b5f8162018-11-05 23:26:13 +00001254SITargetLowering::getPreferredVectorAction(MVT VT) const {
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001255 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1256 return TypeSplitVector;
1257
1258 return TargetLoweringBase::getPreferredVectorAction(VT);
Tom Stellardd86003e2013-08-14 23:25:00 +00001259}
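
// Illustrative sketch for getPreferredVectorAction above; not part of the
// upstream sources. Any multi-element vector with scalars of 16 bits or less
// is split rather than widened or promoted:
//
//   MVT::v2i16, MVT::v4i16, MVT::v4i8 -> TypeSplitVector
//   MVT::v1i16                        -> default action (single element)
//   MVT::v4i32                        -> default action (32-bit scalars)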
Tom Stellard0125f2a2013-06-25 02:39:35 +00001260
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001261bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1262 Type *Ty) const {
Matt Arsenault749035b2016-07-30 01:40:36 +00001263 // FIXME: Could be smarter if called for vector constants.
1264 return true;
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001265}
1266
Tom Stellard2e045bb2016-01-20 00:13:22 +00001267bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001268 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1269 switch (Op) {
1270 case ISD::LOAD:
1271 case ISD::STORE:
Tom Stellard2e045bb2016-01-20 00:13:22 +00001272
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001273 // These operations are done with 32-bit instructions anyway.
1274 case ISD::AND:
1275 case ISD::OR:
1276 case ISD::XOR:
1277 case ISD::SELECT:
1278 // TODO: Extensions?
1279 return true;
1280 default:
1281 return false;
1282 }
1283 }
Konstantin Zhuravlyove14df4b2016-09-28 20:05:39 +00001284
Tom Stellard2e045bb2016-01-20 00:13:22 +00001285 // SimplifySetCC uses this function to determine whether or not it should
1286 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1287 if (VT == MVT::i1 && Op == ISD::SETCC)
1288 return false;
1289
1290 return TargetLowering::isTypeDesirableForOp(Op, VT);
1291}
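
// Illustrative sketch for isTypeDesirableForOp above; not part of the
// upstream sources. Assuming a subtarget with 16-bit instructions:
//
//   (ISD::AND,   MVT::i16) -> true   (done with 32-bit instructions anyway)
//   (ISD::LOAD,  MVT::i16) -> true
//   (ISD::SHL,   MVT::i16) -> false  (not listed above, falls to default)
//   (ISD::SETCC, MVT::i1)  -> false  (no i1 setcc instructions)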
1292
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001293SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1294 const SDLoc &SL,
1295 SDValue Chain,
1296 uint64_t Offset) const {
Mehdi Aminia749f2a2015-07-09 02:09:52 +00001297 const DataLayout &DL = DAG.getDataLayout();
Tom Stellardec2e43c2014-09-22 15:35:29 +00001298 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001299 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1300
1301 const ArgDescriptor *InputPtrReg;
1302 const TargetRegisterClass *RC;
1303
1304 std::tie(InputPtrReg, RC)
1305 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Tom Stellard94593ee2013-06-03 17:40:18 +00001306
Matt Arsenault86033ca2014-07-28 17:31:39 +00001307 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
Matt Arsenault0da63502018-08-31 05:49:54 +00001308 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulta0269b62015-06-01 21:58:24 +00001309 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001310 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1311
Matt Arsenault2fb9ccf2018-05-29 17:42:38 +00001312 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
Jan Veselyfea814d2016-06-21 20:46:20 +00001313}
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00001314
Matt Arsenault9166ce82017-07-28 15:52:08 +00001315SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1316 const SDLoc &SL) const {
Matt Arsenault75e71922018-06-28 10:18:55 +00001317 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1318 FIRST_IMPLICIT);
Matt Arsenault9166ce82017-07-28 15:52:08 +00001319 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1320}
1321
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001322SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1323 const SDLoc &SL, SDValue Val,
1324 bool Signed,
Matt Arsenault6dca5422017-01-09 18:52:39 +00001325 const ISD::InputArg *Arg) const {
Matt Arsenault6dca5422017-01-09 18:52:39 +00001326 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1327 VT.bitsLT(MemVT)) {
1328 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1329 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1330 }
1331
Tom Stellardbc6c5232016-10-17 16:21:45 +00001332 if (MemVT.isFloatingPoint())
Matt Arsenault6dca5422017-01-09 18:52:39 +00001333 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001334 else if (Signed)
Matt Arsenault6dca5422017-01-09 18:52:39 +00001335 Val = DAG.getSExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001336 else
Matt Arsenault6dca5422017-01-09 18:52:39 +00001337 Val = DAG.getZExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001338
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001339 return Val;
1340}
1341
1342SDValue SITargetLowering::lowerKernargMemParameter(
1343 SelectionDAG &DAG, EVT VT, EVT MemVT,
1344 const SDLoc &SL, SDValue Chain,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001345 uint64_t Offset, unsigned Align, bool Signed,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001346 const ISD::InputArg *Arg) const {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001347 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00001348 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001349 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1350
Matt Arsenault90083d32018-06-07 09:54:49 +00001351 // Try to avoid using an extload by loading earlier than the argument address,
1352 // and extracting the relevant bits. The load should hopefully be merged with
1353 // the previous argument.
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001354 if (MemVT.getStoreSize() < 4 && Align < 4) {
1355 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
Matt Arsenault90083d32018-06-07 09:54:49 +00001356 int64_t AlignDownOffset = alignDown(Offset, 4);
1357 int64_t OffsetDiff = Offset - AlignDownOffset;
1358
1359 EVT IntVT = MemVT.changeTypeToInteger();
1360
1361 // TODO: If we passed in the base kernel offset we could have a better
1362 // alignment than 4, but we don't really need it.
1363 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1364 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1365 MachineMemOperand::MODereferenceable |
1366 MachineMemOperand::MOInvariant);
1367
1368 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1369 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1370
1371 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1372 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1373 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1374
1375
1376 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1377 }
1378
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001379 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1380 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001381 MachineMemOperand::MODereferenceable |
1382 MachineMemOperand::MOInvariant);
1383
1384 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
Matt Arsenault6dca5422017-01-09 18:52:39 +00001385 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
Tom Stellard94593ee2013-06-03 17:40:18 +00001386}
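
// Worked example for the extload-avoidance path in lowerKernargMemParameter
// above; illustrative only, not from the upstream sources. An i16 kernel
// argument at byte offset 6 has a 2-byte store size and MinAlign(16, 6) == 2,
// so the dword-aligned load plus shift is used:
//
//   AlignDownOffset = alignDown(6, 4) = 4,  OffsetDiff = 2
//   Load    = (load i32, kernarg.segment.ptr + 4, align 4)
//   Extract = (srl Load, 16)            ; OffsetDiff * 8 bits
//   ArgVal  = (trunc Extract to i16)    ; then converted by convertArgType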
1387
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001388SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1389 const SDLoc &SL, SDValue Chain,
1390 const ISD::InputArg &Arg) const {
1391 MachineFunction &MF = DAG.getMachineFunction();
1392 MachineFrameInfo &MFI = MF.getFrameInfo();
1393
1394 if (Arg.Flags.isByVal()) {
1395 unsigned Size = Arg.Flags.getByValSize();
1396 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1397 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1398 }
1399
1400 unsigned ArgOffset = VA.getLocMemOffset();
1401 unsigned ArgSize = VA.getValVT().getStoreSize();
1402
1403 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1404
1405 // Create load nodes to retrieve arguments from the stack.
1406 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1407 SDValue ArgValue;
1408
1409  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1410 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1411 MVT MemVT = VA.getValVT();
1412
1413 switch (VA.getLocInfo()) {
1414 default:
1415 break;
1416 case CCValAssign::BCvt:
1417 MemVT = VA.getLocVT();
1418 break;
1419 case CCValAssign::SExt:
1420 ExtType = ISD::SEXTLOAD;
1421 break;
1422 case CCValAssign::ZExt:
1423 ExtType = ISD::ZEXTLOAD;
1424 break;
1425 case CCValAssign::AExt:
1426 ExtType = ISD::EXTLOAD;
1427 break;
1428 }
1429
1430 ArgValue = DAG.getExtLoad(
1431 ExtType, SL, VA.getLocVT(), Chain, FIN,
1432 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1433 MemVT);
1434 return ArgValue;
1435}
1436
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001437SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1438 const SIMachineFunctionInfo &MFI,
1439 EVT VT,
1440 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1441 const ArgDescriptor *Reg;
1442 const TargetRegisterClass *RC;
1443
1444 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1445 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1446}
1447
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001448static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1449 CallingConv::ID CallConv,
1450 ArrayRef<ISD::InputArg> Ins,
1451 BitVector &Skipped,
1452 FunctionType *FType,
1453 SIMachineFunctionInfo *Info) {
1454 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001455 const ISD::InputArg *Arg = &Ins[I];
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001456
Matt Arsenault55ab9212018-08-01 19:57:34 +00001457 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1458 "vector type argument should have been split");
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001459
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001460 // First check if it's a PS input addr.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001461 if (CallConv == CallingConv::AMDGPU_PS &&
1462 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001463
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001464 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1465
1466 // Inconveniently only the first part of the split is marked as isSplit,
1467 // so skip to the end. We only want to increment PSInputNum once for the
1468 // entire split argument.
1469 if (Arg->Flags.isSplit()) {
1470 while (!Arg->Flags.isSplitEnd()) {
1471 assert(!Arg->VT.isVector() &&
1472 "unexpected vector split in ps argument type");
1473 if (!SkipArg)
1474 Splits.push_back(*Arg);
1475 Arg = &Ins[++I];
1476 }
1477 }
1478
1479 if (SkipArg) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001480 // We can safely skip PS inputs.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001481 Skipped.set(Arg->getOrigArgIndex());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001482 ++PSInputNum;
1483 continue;
1484 }
1485
1486 Info->markPSInputAllocated(PSInputNum);
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001487 if (Arg->Used)
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001488 Info->markPSInputEnabled(PSInputNum);
1489
1490 ++PSInputNum;
1491 }
1492
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001493 Splits.push_back(*Arg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001494 }
1495}
1496
1497// Allocate special inputs passed in VGPRs.
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001498static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1499 MachineFunction &MF,
1500 const SIRegisterInfo &TRI,
1501 SIMachineFunctionInfo &Info) {
1502 if (Info.hasWorkItemIDX()) {
1503 unsigned Reg = AMDGPU::VGPR0;
1504 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001505
1506 CCInfo.AllocateReg(Reg);
1507 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1508 }
1509
1510 if (Info.hasWorkItemIDY()) {
1511 unsigned Reg = AMDGPU::VGPR1;
1512 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1513
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001514 CCInfo.AllocateReg(Reg);
1515 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1516 }
1517
1518 if (Info.hasWorkItemIDZ()) {
1519 unsigned Reg = AMDGPU::VGPR2;
1520 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1521
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001522 CCInfo.AllocateReg(Reg);
1523 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1524 }
1525}
1526
1527// Try to allocate a VGPR at the end of the argument list, or if no argument
1528// VGPRs are left, allocate a stack slot instead.
1529static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1530 ArrayRef<MCPhysReg> ArgVGPRs
1531 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1532 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1533 if (RegIdx == ArgVGPRs.size()) {
1534 // Spill to stack required.
1535 int64_t Offset = CCInfo.AllocateStack(4, 4);
1536
1537 return ArgDescriptor::createStack(Offset);
1538 }
1539
1540 unsigned Reg = ArgVGPRs[RegIdx];
1541 Reg = CCInfo.AllocateReg(Reg);
1542 assert(Reg != AMDGPU::NoRegister);
1543
1544 MachineFunction &MF = CCInfo.getMachineFunction();
1545 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1546 return ArgDescriptor::createRegister(Reg);
1547}
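
// Illustrative sketch for allocateVGPR32Input above; not part of the upstream
// sources. The first 32 VGPRs form the argument pool, so after earlier
// arguments have consumed VGPR0..VGPR2 the next special input would get
//
//   ArgDescriptor::createRegister(AMDGPU::VGPR3)
//
// and once all 32 argument VGPRs are allocated it would instead get
//
//   ArgDescriptor::createStack(CCInfo.AllocateStack(4, 4))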
1548
1549static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1550 const TargetRegisterClass *RC,
1551 unsigned NumArgRegs) {
1552  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), NumArgRegs);
1553 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1554 if (RegIdx == ArgSGPRs.size())
1555 report_fatal_error("ran out of SGPRs for arguments");
1556
1557 unsigned Reg = ArgSGPRs[RegIdx];
1558 Reg = CCInfo.AllocateReg(Reg);
1559 assert(Reg != AMDGPU::NoRegister);
1560
1561 MachineFunction &MF = CCInfo.getMachineFunction();
1562 MF.addLiveIn(Reg, RC);
1563 return ArgDescriptor::createRegister(Reg);
1564}
1565
1566static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1567 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1568}
1569
1570static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1571 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1572}
1573
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001574static void allocateSpecialInputVGPRs(CCState &CCInfo,
1575 MachineFunction &MF,
1576 const SIRegisterInfo &TRI,
1577 SIMachineFunctionInfo &Info) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001578 if (Info.hasWorkItemIDX())
1579 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001580
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001581 if (Info.hasWorkItemIDY())
1582 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001583
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001584 if (Info.hasWorkItemIDZ())
1585 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1586}
1587
1588static void allocateSpecialInputSGPRs(CCState &CCInfo,
1589 MachineFunction &MF,
1590 const SIRegisterInfo &TRI,
1591 SIMachineFunctionInfo &Info) {
1592 auto &ArgInfo = Info.getArgInfo();
1593
1594 // TODO: Unify handling with private memory pointers.
1595
1596 if (Info.hasDispatchPtr())
1597 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1598
1599 if (Info.hasQueuePtr())
1600 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1601
1602 if (Info.hasKernargSegmentPtr())
1603 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1604
1605 if (Info.hasDispatchID())
1606 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1607
1608 // flat_scratch_init is not applicable for non-kernel functions.
1609
1610 if (Info.hasWorkGroupIDX())
1611 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1612
1613 if (Info.hasWorkGroupIDY())
1614 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1615
1616 if (Info.hasWorkGroupIDZ())
1617 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
Matt Arsenault817c2532017-08-03 23:12:44 +00001618
1619 if (Info.hasImplicitArgPtr())
1620 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001621}
1622
1623// Allocate special inputs passed in user SGPRs.
1624static void allocateHSAUserSGPRs(CCState &CCInfo,
1625 MachineFunction &MF,
1626 const SIRegisterInfo &TRI,
1627 SIMachineFunctionInfo &Info) {
Matt Arsenault10fc0622017-06-26 03:01:31 +00001628 if (Info.hasImplicitBufferPtr()) {
1629 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1630 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1631 CCInfo.AllocateReg(ImplicitBufferPtrReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001632 }
1633
1634 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1635 if (Info.hasPrivateSegmentBuffer()) {
1636 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1637 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1638 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1639 }
1640
1641 if (Info.hasDispatchPtr()) {
1642 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1643 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1644 CCInfo.AllocateReg(DispatchPtrReg);
1645 }
1646
1647 if (Info.hasQueuePtr()) {
1648 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1649 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1650 CCInfo.AllocateReg(QueuePtrReg);
1651 }
1652
1653 if (Info.hasKernargSegmentPtr()) {
1654 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1655 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1656 CCInfo.AllocateReg(InputPtrReg);
1657 }
1658
1659 if (Info.hasDispatchID()) {
1660 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1661 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1662 CCInfo.AllocateReg(DispatchIDReg);
1663 }
1664
1665 if (Info.hasFlatScratchInit()) {
1666 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1667 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1668 CCInfo.AllocateReg(FlatScratchInitReg);
1669 }
1670
1671 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1672 // these from the dispatch pointer.
1673}
1674
1675// Allocate special input registers that are initialized per-wave.
1676static void allocateSystemSGPRs(CCState &CCInfo,
1677 MachineFunction &MF,
1678 SIMachineFunctionInfo &Info,
Marek Olsak584d2c02017-05-04 22:25:20 +00001679 CallingConv::ID CallConv,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001680 bool IsShader) {
1681 if (Info.hasWorkGroupIDX()) {
1682 unsigned Reg = Info.addWorkGroupIDX();
1683 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1684 CCInfo.AllocateReg(Reg);
1685 }
1686
1687 if (Info.hasWorkGroupIDY()) {
1688 unsigned Reg = Info.addWorkGroupIDY();
1689 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1690 CCInfo.AllocateReg(Reg);
1691 }
1692
1693 if (Info.hasWorkGroupIDZ()) {
1694 unsigned Reg = Info.addWorkGroupIDZ();
1695 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1696 CCInfo.AllocateReg(Reg);
1697 }
1698
1699 if (Info.hasWorkGroupInfo()) {
1700 unsigned Reg = Info.addWorkGroupInfo();
1701 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1702 CCInfo.AllocateReg(Reg);
1703 }
1704
1705 if (Info.hasPrivateSegmentWaveByteOffset()) {
1706 // Scratch wave offset passed in system SGPR.
1707 unsigned PrivateSegmentWaveByteOffsetReg;
1708
1709 if (IsShader) {
Marek Olsak584d2c02017-05-04 22:25:20 +00001710 PrivateSegmentWaveByteOffsetReg =
1711 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1712
1713 // This is true if the scratch wave byte offset doesn't have a fixed
1714 // location.
1715 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1716 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1717 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1718 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001719 } else
1720 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1721
1722 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1723 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1724 }
1725}
1726
1727static void reservePrivateMemoryRegs(const TargetMachine &TM,
1728 MachineFunction &MF,
1729 const SIRegisterInfo &TRI,
Matt Arsenault1cc47f82017-07-18 16:44:56 +00001730 SIMachineFunctionInfo &Info) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001731 // Now that we've figured out where the scratch register inputs are, see if
1732  // we should reserve the arguments and use them directly.
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001733 MachineFrameInfo &MFI = MF.getFrameInfo();
1734 bool HasStackObjects = MFI.hasStackObjects();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001735
1736 // Record that we know we have non-spill stack objects so we don't need to
1737 // check all stack objects later.
1738 if (HasStackObjects)
1739 Info.setHasNonSpillStackObjects(true);
1740
1741 // Everything live out of a block is spilled with fast regalloc, so it's
1742 // almost certain that spilling will be required.
1743 if (TM.getOptLevel() == CodeGenOpt::None)
1744 HasStackObjects = true;
1745
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001746 // For now assume stack access is needed in any callee functions, so we need
1747 // the scratch registers to pass in.
1748 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1749
Tom Stellard5bfbae52018-07-11 20:59:01 +00001750 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00001751 if (ST.isAmdHsaOrMesa(MF.getFunction())) {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001752 if (RequiresStackAccess) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001753 // If we have stack objects, we unquestionably need the private buffer
1754 // resource. For the Code Object V2 ABI, this will be the first 4 user
1755 // SGPR inputs. We can reserve those and use them directly.
1756
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001757 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1758 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001759 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1760
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001761 if (MFI.hasCalls()) {
1762 // If we have calls, we need to keep the frame register in a register
1763 // that won't be clobbered by a call, so ensure it is copied somewhere.
1764
1765 // This is not a problem for the scratch wave offset, because the same
1766 // registers are reserved in all functions.
1767
1768 // FIXME: Nothing is really ensuring this is a call preserved register,
1769 // it's just selected from the end so it happens to be.
1770 unsigned ReservedOffsetReg
1771 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1772 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1773 } else {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001774 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1775 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001776 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1777 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001778 } else {
1779 unsigned ReservedBufferReg
1780 = TRI.reservedPrivateSegmentBufferReg(MF);
1781 unsigned ReservedOffsetReg
1782 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1783
1784 // We tentatively reserve the last registers (skipping the last two
1785 // which may contain VCC). After register allocation, we'll replace
1786 // these with the ones immediately after those which were really
1787      // allocated. In the prologue, copies will be inserted from the argument
1788 // to these reserved registers.
1789 Info.setScratchRSrcReg(ReservedBufferReg);
1790 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1791 }
1792 } else {
1793 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1794
1795 // Without HSA, relocations are used for the scratch pointer and the
1796 // buffer resource setup is always inserted in the prologue. Scratch wave
1797 // offset is still in an input SGPR.
1798 Info.setScratchRSrcReg(ReservedBufferReg);
1799
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001800 if (HasStackObjects && !MFI.hasCalls()) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001801 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1802 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001803 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1804 } else {
1805 unsigned ReservedOffsetReg
1806 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1807 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1808 }
1809 }
1810}
1811
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001812bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1813 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1814 return !Info->isEntryFunction();
1815}
1816
1817void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1818
1819}
1820
1821void SITargetLowering::insertCopiesSplitCSR(
1822 MachineBasicBlock *Entry,
1823 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1824 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1825
1826 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1827 if (!IStart)
1828 return;
1829
1830 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1831 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1832 MachineBasicBlock::iterator MBBI = Entry->begin();
1833 for (const MCPhysReg *I = IStart; *I; ++I) {
1834 const TargetRegisterClass *RC = nullptr;
1835 if (AMDGPU::SReg_64RegClass.contains(*I))
1836 RC = &AMDGPU::SGPR_64RegClass;
1837 else if (AMDGPU::SReg_32RegClass.contains(*I))
1838 RC = &AMDGPU::SGPR_32RegClass;
1839 else
1840 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1841
1842 unsigned NewVR = MRI->createVirtualRegister(RC);
1843 // Create copy from CSR to a virtual register.
1844 Entry->addLiveIn(*I);
1845 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1846 .addReg(*I);
1847
1848 // Insert the copy-back instructions right before the terminator.
1849 for (auto *Exit : Exits)
1850 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1851 TII->get(TargetOpcode::COPY), *I)
1852 .addReg(NewVR);
1853 }
1854}
1855
Christian Konig2c8f6d52013-03-07 09:03:52 +00001856SDValue SITargetLowering::LowerFormalArguments(
Eric Christopher7792e322015-01-30 23:24:40 +00001857 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001858 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1859 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001860 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001861
1862 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaultceafc552018-05-29 17:42:50 +00001863 const Function &Fn = MF.getFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00001864 FunctionType *FType = MF.getFunction().getFunctionType();
Christian Konig99ee0f42013-03-07 09:04:14 +00001865 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001866
Nicolai Haehnledf3a20c2016-04-06 19:40:20 +00001867 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001868 DiagnosticInfoUnsupported NoGraphicsHSA(
Matthias Braunf1caa282017-12-15 22:22:58 +00001869 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
Matt Arsenaultd48da142015-11-02 23:23:02 +00001870 DAG.getContext()->diagnose(NoGraphicsHSA);
Diana Picus81bc3172016-05-26 15:24:55 +00001871 return DAG.getEntryNode();
Matt Arsenaultd48da142015-11-02 23:23:02 +00001872 }
1873
Christian Konig2c8f6d52013-03-07 09:03:52 +00001874 SmallVector<ISD::InputArg, 16> Splits;
Christian Konig2c8f6d52013-03-07 09:03:52 +00001875 SmallVector<CCValAssign, 16> ArgLocs;
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001876 BitVector Skipped(Ins.size());
Eric Christopherb5217502014-08-06 18:45:26 +00001877 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1878 *DAG.getContext());
Christian Konig2c8f6d52013-03-07 09:03:52 +00001879
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001880 bool IsShader = AMDGPU::isShader(CallConv);
Matt Arsenaultefa9f4b2017-04-11 22:29:28 +00001881 bool IsKernel = AMDGPU::isKernel(CallConv);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001882 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
Christian Konig99ee0f42013-03-07 09:04:14 +00001883
Matt Arsenaultd1867c02017-08-02 00:59:51 +00001884 if (!IsEntryFunc) {
1885 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1886 // this when allocating argument fixed offsets.
1887 CCInfo.AllocateStack(4, 4);
1888 }
1889
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001890 if (IsShader) {
1891 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1892
1893 // At least one interpolation mode must be enabled or else the GPU will
1894 // hang.
1895 //
1896 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1897 // set PSInputAddr, the user wants to enable some bits after the compilation
1898 // based on run-time states. Since we can't know what the final PSInputEna
1899    // will look like, we shouldn't do anything here and the user should take
1900 // responsibility for the correct programming.
1901 //
1902 // Otherwise, the following restrictions apply:
1903 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1904 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1905 // enabled too.
Tim Renoufc8ffffe2017-10-12 16:16:41 +00001906 if (CallConv == CallingConv::AMDGPU_PS) {
1907 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1908 ((Info->getPSInputAddr() & 0xF) == 0 &&
1909 Info->isPSInputAllocated(11))) {
1910 CCInfo.AllocateReg(AMDGPU::VGPR0);
1911 CCInfo.AllocateReg(AMDGPU::VGPR1);
1912 Info->markPSInputAllocated(0);
1913 Info->markPSInputEnabled(0);
1914 }
1915 if (Subtarget->isAmdPalOS()) {
1916 // For isAmdPalOS, the user does not enable some bits after compilation
1917 // based on run-time states; the register values being generated here are
1918 // the final ones set in hardware. Therefore we need to apply the
1919 // workaround to PSInputAddr and PSInputEnable together. (The case where
1920 // a bit is set in PSInputAddr but not PSInputEnable is where the
1921 // frontend set up an input arg for a particular interpolation mode, but
1922 // nothing uses that input arg. Really we should have an earlier pass
1923 // that removes such an arg.)
1924 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1925 if ((PsInputBits & 0x7F) == 0 ||
1926 ((PsInputBits & 0xF) == 0 &&
1927 (PsInputBits >> 11 & 1)))
1928 Info->markPSInputEnabled(
1929 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1930 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001931 }
1932
Tom Stellard2f3f9852017-01-25 01:25:13 +00001933 assert(!Info->hasDispatchPtr() &&
Tom Stellardf110f8f2016-04-14 16:27:03 +00001934 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1935 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1936 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1937 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1938 !Info->hasWorkItemIDZ());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001939 } else if (IsKernel) {
1940 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001941 } else {
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001942 Splits.append(Ins.begin(), Ins.end());
Tom Stellardaf775432013-10-23 00:44:32 +00001943 }
1944
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001945 if (IsEntryFunc) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001946 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001947 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
Tom Stellard2f3f9852017-01-25 01:25:13 +00001948 }
1949
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001950 if (IsKernel) {
Tom Stellardbbeb45a2016-09-16 21:53:00 +00001951 analyzeFormalArgumentsCompute(CCInfo, Ins);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001952 } else {
1953 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1954 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1955 }
Christian Konig2c8f6d52013-03-07 09:03:52 +00001956
Matt Arsenaultcf13d182015-07-10 22:51:36 +00001957 SmallVector<SDValue, 16> Chains;
1958
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001959 // FIXME: This is the minimum kernel argument alignment. We should improve
1960 // this to the maximum alignment of the arguments.
1961 //
1962 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1963 // kern arg offset.
1964 const unsigned KernelArgBaseAlign = 16;
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001965
1966 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
Christian Konigb7be72d2013-05-17 09:46:48 +00001967 const ISD::InputArg &Arg = Ins[i];
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001968 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
Christian Konigb7be72d2013-05-17 09:46:48 +00001969 InVals.push_back(DAG.getUNDEF(Arg.VT));
Christian Konig99ee0f42013-03-07 09:04:14 +00001970 continue;
1971 }
1972
Christian Konig2c8f6d52013-03-07 09:03:52 +00001973 CCValAssign &VA = ArgLocs[ArgIdx++];
Craig Topper7f416c82014-11-16 21:17:18 +00001974 MVT VT = VA.getLocVT();
Tom Stellarded882c22013-06-03 17:40:11 +00001975
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001976 if (IsEntryFunc && VA.isMemLoc()) {
Tom Stellardaf775432013-10-23 00:44:32 +00001977 VT = Ins[i].VT;
Tom Stellardbbeb45a2016-09-16 21:53:00 +00001978 EVT MemVT = VA.getLocVT();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001979
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001980 const uint64_t Offset = VA.getLocMemOffset();
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001981 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001982
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001983 SDValue Arg = lowerKernargMemParameter(
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001984 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Matt Arsenaultcf13d182015-07-10 22:51:36 +00001985 Chains.push_back(Arg.getValue(1));
Tom Stellardca7ecf32014-08-22 18:49:31 +00001986
Craig Toppere3dcce92015-08-01 22:20:21 +00001987 auto *ParamTy =
Andrew Trick05938a52015-02-16 18:10:47 +00001988 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
Tom Stellard5bfbae52018-07-11 20:59:01 +00001989 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenaultcdd191d2019-01-28 20:14:49 +00001990 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1991 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
Tom Stellardca7ecf32014-08-22 18:49:31 +00001992 // On SI local pointers are just offsets into LDS, so they are always
1993 // less than 16-bits. On CI and newer they could potentially be
1994 // real pointers, so we can't guarantee their size.
1995 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1996 DAG.getValueType(MVT::i16));
1997 }
1998
Tom Stellarded882c22013-06-03 17:40:11 +00001999 InVals.push_back(Arg);
2000 continue;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002001 } else if (!IsEntryFunc && VA.isMemLoc()) {
2002 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2003 InVals.push_back(Val);
2004 if (!Arg.Flags.isByVal())
2005 Chains.push_back(Val.getValue(1));
2006 continue;
Tom Stellarded882c22013-06-03 17:40:11 +00002007 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002008
Christian Konig2c8f6d52013-03-07 09:03:52 +00002009 assert(VA.isRegLoc() && "Parameter must be in a register!");
2010
2011 unsigned Reg = VA.getLocReg();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002012 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
Matt Arsenaultb3463552017-07-15 05:52:59 +00002013 EVT ValVT = VA.getValVT();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002014
2015 Reg = MF.addLiveIn(Reg, RC);
2016 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2017
Matt Arsenault45b98182017-11-15 00:45:43 +00002018 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2019 // The return object should be reasonably addressable.
2020
2021      // FIXME: This helps when the return is a real sret. If it is an
2022 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2023 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2024 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2025 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2026 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2027 }
2028
Matt Arsenaultb3463552017-07-15 05:52:59 +00002029 // If this is an 8 or 16-bit value, it is really passed promoted
2030 // to 32 bits. Insert an assert[sz]ext to capture this, then
2031 // truncate to the right size.
2032 switch (VA.getLocInfo()) {
2033 case CCValAssign::Full:
2034 break;
2035 case CCValAssign::BCvt:
2036 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2037 break;
2038 case CCValAssign::SExt:
2039 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2040 DAG.getValueType(ValVT));
2041 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2042 break;
2043 case CCValAssign::ZExt:
2044 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2045 DAG.getValueType(ValVT));
2046 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2047 break;
2048 case CCValAssign::AExt:
2049 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2050 break;
2051 default:
2052 llvm_unreachable("Unknown loc info!");
2053 }
2054
Christian Konig2c8f6d52013-03-07 09:03:52 +00002055 InVals.push_back(Val);
2056 }
Tom Stellarde99fb652015-01-20 19:33:04 +00002057
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002058 if (!IsEntryFunc) {
2059 // Special inputs come after user arguments.
2060 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2061 }
2062
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002063 // Start adding system SGPRs.
2064 if (IsEntryFunc) {
2065 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002066 } else {
2067 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2068 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2069 CCInfo.AllocateReg(Info->getFrameOffsetReg());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002070 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002071 }
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002072
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002073 auto &ArgUsageInfo =
2074 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
Matt Arsenaultceafc552018-05-29 17:42:50 +00002075 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002076
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002077 unsigned StackArgSize = CCInfo.getNextStackOffset();
2078 Info->setBytesInStackArgArea(StackArgSize);
2079
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002080 return Chains.empty() ? Chain :
2081 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Christian Konig2c8f6d52013-03-07 09:03:52 +00002082}
2083
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002084// TODO: If return values can't fit in registers, we should return as many as
2085// possible in registers before passing on stack.
2086bool SITargetLowering::CanLowerReturn(
2087 CallingConv::ID CallConv,
2088 MachineFunction &MF, bool IsVarArg,
2089 const SmallVectorImpl<ISD::OutputArg> &Outs,
2090 LLVMContext &Context) const {
2091 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2092 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2093 // for shaders. Vector types should be explicitly handled by CC.
2094 if (AMDGPU::isEntryFunctionCC(CallConv))
2095 return true;
2096
2097 SmallVector<CCValAssign, 16> RVLocs;
2098 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2099 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2100}
2101
Benjamin Kramerbdc49562016-06-12 15:39:02 +00002102SDValue
2103SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2104 bool isVarArg,
2105 const SmallVectorImpl<ISD::OutputArg> &Outs,
2106 const SmallVectorImpl<SDValue> &OutVals,
2107 const SDLoc &DL, SelectionDAG &DAG) const {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002108 MachineFunction &MF = DAG.getMachineFunction();
2109 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2110
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002111 if (AMDGPU::isKernel(CallConv)) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002112 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2113 OutVals, DL, DAG);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002114 }
2115
2116 bool IsShader = AMDGPU::isShader(CallConv);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002117
Matt Arsenault55ab9212018-08-01 19:57:34 +00002118 Info->setIfReturnsVoid(Outs.empty());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002119 bool IsWaveEnd = Info->returnsVoid() && IsShader;
Marek Olsak8e9cc632016-01-13 17:23:09 +00002120
Marek Olsak8a0f3352016-01-13 17:23:04 +00002121 // CCValAssign - represent the assignment of the return value to a location.
2122 SmallVector<CCValAssign, 48> RVLocs;
Matt Arsenault55ab9212018-08-01 19:57:34 +00002123 SmallVector<ISD::OutputArg, 48> Splits;
Marek Olsak8a0f3352016-01-13 17:23:04 +00002124
2125 // CCState - Info about the registers and stack slots.
2126 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2127 *DAG.getContext());
2128
2129 // Analyze outgoing return values.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002130 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
Marek Olsak8a0f3352016-01-13 17:23:04 +00002131
2132 SDValue Flag;
2133 SmallVector<SDValue, 48> RetOps;
2134 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2135
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002136 // Add return address for callable functions.
2137 if (!Info->isEntryFunction()) {
2138 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2139 SDValue ReturnAddrReg = CreateLiveInRegister(
2140 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2141
2142 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2143    // from being allocated to a CSR.
2144
2145 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2146 MVT::i64);
2147
2148 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2149 Flag = Chain.getValue(1);
2150
2151 RetOps.push_back(PhysReturnAddrReg);
2152 }
2153
Marek Olsak8a0f3352016-01-13 17:23:04 +00002154 // Copy the result values into the output registers.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002155 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2156 ++I, ++RealRVLocIdx) {
2157 CCValAssign &VA = RVLocs[I];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002158 assert(VA.isRegLoc() && "Can only return in registers!");
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002159 // TODO: Partially return in registers if return values don't fit.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002160 SDValue Arg = OutVals[RealRVLocIdx];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002161
2162 // Copied from other backends.
2163 switch (VA.getLocInfo()) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002164 case CCValAssign::Full:
2165 break;
2166 case CCValAssign::BCvt:
2167 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2168 break;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002169 case CCValAssign::SExt:
2170 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2171 break;
2172 case CCValAssign::ZExt:
2173 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2174 break;
2175 case CCValAssign::AExt:
2176 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2177 break;
2178 default:
2179 llvm_unreachable("Unknown loc info!");
Marek Olsak8a0f3352016-01-13 17:23:04 +00002180 }
2181
2182 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2183 Flag = Chain.getValue(1);
2184 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2185 }
2186
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002187 // FIXME: Does sret work properly?
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002188 if (!Info->isEntryFunction()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00002189 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002190 const MCPhysReg *I =
2191 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2192 if (I) {
2193 for (; *I; ++I) {
2194 if (AMDGPU::SReg_64RegClass.contains(*I))
2195 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2196 else if (AMDGPU::SReg_32RegClass.contains(*I))
2197 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2198 else
2199 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2200 }
2201 }
2202 }
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002203
Marek Olsak8a0f3352016-01-13 17:23:04 +00002204 // Update chain and glue.
2205 RetOps[0] = Chain;
2206 if (Flag.getNode())
2207 RetOps.push_back(Flag);
2208
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002209 unsigned Opc = AMDGPUISD::ENDPGM;
2210 if (!IsWaveEnd)
2211 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
Matt Arsenault9babdf42016-06-22 20:15:28 +00002212 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002213}
2214
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002215SDValue SITargetLowering::LowerCallResult(
2216 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2217 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2218 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2219 SDValue ThisVal) const {
2220 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2221
2222 // Assign locations to each value returned by this call.
2223 SmallVector<CCValAssign, 16> RVLocs;
2224 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2225 *DAG.getContext());
2226 CCInfo.AnalyzeCallResult(Ins, RetCC);
2227
2228 // Copy all of the result registers out of their specified physreg.
2229 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2230 CCValAssign VA = RVLocs[i];
2231 SDValue Val;
2232
2233 if (VA.isRegLoc()) {
2234 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2235 Chain = Val.getValue(1);
2236 InFlag = Val.getValue(2);
2237 } else if (VA.isMemLoc()) {
2238 report_fatal_error("TODO: return values in memory");
2239 } else
2240 llvm_unreachable("unknown argument location type");
2241
2242 switch (VA.getLocInfo()) {
2243 case CCValAssign::Full:
2244 break;
2245 case CCValAssign::BCvt:
2246 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2247 break;
2248 case CCValAssign::ZExt:
2249 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2250 DAG.getValueType(VA.getValVT()));
2251 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2252 break;
2253 case CCValAssign::SExt:
2254 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2255 DAG.getValueType(VA.getValVT()));
2256 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2257 break;
2258 case CCValAssign::AExt:
2259 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2260 break;
2261 default:
2262 llvm_unreachable("Unknown loc info!");
2263 }
2264
2265 InVals.push_back(Val);
2266 }
2267
2268 return Chain;
2269}
2270
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002271// Add code to pass the special inputs required by the features in use, separate
2272// from the explicit user arguments present in the IR.
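// For example (illustrative only): if the callee reads the workgroup ID, e.g.
//   %id = call i32 @llvm.amdgcn.workgroup.id.x()
// then the caller must forward its own WORKGROUP_ID_X input to the register
// (or stack slot) where the callee expects to find it.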
2273void SITargetLowering::passSpecialInputs(
2274 CallLoweringInfo &CLI,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002275 CCState &CCInfo,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002276 const SIMachineFunctionInfo &Info,
2277 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2278 SmallVectorImpl<SDValue> &MemOpChains,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002279 SDValue Chain) const {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002280 // If we don't have a call site, this was a call inserted by
2281 // legalization. These can never use special inputs.
2282 if (!CLI.CS)
2283 return;
2284
2285 const Function *CalleeFunc = CLI.CS.getCalledFunction();
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002286 assert(CalleeFunc);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002287
2288 SelectionDAG &DAG = CLI.DAG;
2289 const SDLoc &DL = CLI.DL;
2290
Tom Stellardc5a154d2018-06-28 23:47:12 +00002291 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002292
2293 auto &ArgUsageInfo =
2294 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2295 const AMDGPUFunctionArgInfo &CalleeArgInfo
2296 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2297
2298 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2299
2300 // TODO: Unify with private memory register handling. This is complicated by
2301 // the fact that at least in kernels, the input argument is not necessarily
2302 // in the same location as the input.
2303 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2304 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2305 AMDGPUFunctionArgInfo::QUEUE_PTR,
2306 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2307 AMDGPUFunctionArgInfo::DISPATCH_ID,
2308 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2309 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2310 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2311 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2312 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
Matt Arsenault817c2532017-08-03 23:12:44 +00002313 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2314 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002315 };
2316
2317 for (auto InputID : InputRegs) {
2318 const ArgDescriptor *OutgoingArg;
2319 const TargetRegisterClass *ArgRC;
2320
2321 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2322 if (!OutgoingArg)
2323 continue;
2324
2325 const ArgDescriptor *IncomingArg;
2326 const TargetRegisterClass *IncomingArgRC;
2327 std::tie(IncomingArg, IncomingArgRC)
2328 = CallerArgInfo.getPreloadedValue(InputID);
2329 assert(IncomingArgRC == ArgRC);
2330
2331 // All special arguments are ints for now.
2332 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
Matt Arsenault817c2532017-08-03 23:12:44 +00002333 SDValue InputReg;
2334
2335 if (IncomingArg) {
2336 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2337 } else {
2338 // The implicit arg ptr is special because it doesn't have a corresponding
2339 // input for kernels, and is computed from the kernarg segment pointer.
2340 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2341 InputReg = getImplicitArgPtr(DAG, DL);
2342 }
2343
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002344 if (OutgoingArg->isRegister()) {
2345 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2346 } else {
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002347 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2348 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2349 SpecialArgOffset);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002350 MemOpChains.push_back(ArgStore);
2351 }
2352 }
2353}
2354
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002355static bool canGuaranteeTCO(CallingConv::ID CC) {
2356 return CC == CallingConv::Fast;
2357}
2358
2359/// Return true if we might ever do TCO for calls with this calling convention.
2360static bool mayTailCallThisCC(CallingConv::ID CC) {
2361 switch (CC) {
2362 case CallingConv::C:
2363 return true;
2364 default:
2365 return canGuaranteeTCO(CC);
2366 }
2367}
2368
2369bool SITargetLowering::isEligibleForTailCallOptimization(
2370 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2371 const SmallVectorImpl<ISD::OutputArg> &Outs,
2372 const SmallVectorImpl<SDValue> &OutVals,
2373 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2374 if (!mayTailCallThisCC(CalleeCC))
2375 return false;
2376
2377 MachineFunction &MF = DAG.getMachineFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00002378 const Function &CallerF = MF.getFunction();
2379 CallingConv::ID CallerCC = CallerF.getCallingConv();
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002380 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2381 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2382
2383  // Kernels aren't callable, and don't have a live-in return address, so it
2384 // doesn't make sense to do a tail call with entry functions.
2385 if (!CallerPreserved)
2386 return false;
2387
2388 bool CCMatch = CallerCC == CalleeCC;
2389
2390 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2391 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2392 return true;
2393 return false;
2394 }
2395
2396 // TODO: Can we handle var args?
2397 if (IsVarArg)
2398 return false;
2399
Matthias Braunf1caa282017-12-15 22:22:58 +00002400 for (const Argument &Arg : CallerF.args()) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002401 if (Arg.hasByValAttr())
2402 return false;
2403 }
2404
2405 LLVMContext &Ctx = *DAG.getContext();
2406
2407 // Check that the call results are passed in the same way.
2408 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2409 CCAssignFnForCall(CalleeCC, IsVarArg),
2410 CCAssignFnForCall(CallerCC, IsVarArg)))
2411 return false;
2412
2413 // The callee has to preserve all registers the caller needs to preserve.
2414 if (!CCMatch) {
2415 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2416 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2417 return false;
2418 }
2419
2420 // Nothing more to check if the callee is taking no arguments.
2421 if (Outs.empty())
2422 return true;
2423
2424 SmallVector<CCValAssign, 16> ArgLocs;
2425 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2426
2427 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2428
2429 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2430 // If the stack arguments for this call do not fit into our own save area then
2431 // the call cannot be made tail.
2432 // TODO: Is this really necessary?
2433 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2434 return false;
2435
2436 const MachineRegisterInfo &MRI = MF.getRegInfo();
2437 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2438}
2439
2440bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2441 if (!CI->isTailCall())
2442 return false;
2443
2444 const Function *ParentFn = CI->getParent()->getParent();
2445 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2446 return false;
2447
2448 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2449 return (Attr.getValueAsString() != "true");
2450}
2451
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002452// The wave scratch offset register is used as the global base pointer.
2453SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2454 SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002455 SelectionDAG &DAG = CLI.DAG;
2456 const SDLoc &DL = CLI.DL;
2457 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2458 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2459 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2460 SDValue Chain = CLI.Chain;
2461 SDValue Callee = CLI.Callee;
2462 bool &IsTailCall = CLI.IsTailCall;
2463 CallingConv::ID CallConv = CLI.CallConv;
2464 bool IsVarArg = CLI.IsVarArg;
2465 bool IsSibCall = false;
2466 bool IsThisReturn = false;
2467 MachineFunction &MF = DAG.getMachineFunction();
2468
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002469 if (IsVarArg) {
2470 return lowerUnhandledCall(CLI, InVals,
2471 "unsupported call to variadic function ");
2472 }
2473
Matt Arsenault935f3b72018-08-08 16:58:39 +00002474 if (!CLI.CS.getInstruction())
2475 report_fatal_error("unsupported libcall legalization");
2476
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002477 if (!CLI.CS.getCalledFunction()) {
2478 return lowerUnhandledCall(CLI, InVals,
2479 "unsupported indirect call to function ");
2480 }
2481
2482 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2483 return lowerUnhandledCall(CLI, InVals,
2484 "unsupported required tail call to function ");
2485 }
2486
Matt Arsenault1fb90132018-06-28 10:18:36 +00002487 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2488 // Note the issue is with the CC of the calling function, not of the call
2489 // itself.
2490 return lowerUnhandledCall(CLI, InVals,
2491 "unsupported call from graphics shader of function ");
2492 }
2493
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002495 if (IsTailCall) {
2496 IsTailCall = isEligibleForTailCallOptimization(
2497 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2498 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2499 report_fatal_error("failed to perform tail call elimination on a call "
2500 "site marked musttail");
2501 }
2502
2503 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2504
2505 // A sibling call is one where we're under the usual C ABI and not planning
2506 // to change that but can still do a tail call:
2507 if (!TailCallOpt && IsTailCall)
2508 IsSibCall = true;
2509
2510 if (IsTailCall)
2511 ++NumTailCalls;
2512 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002513
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002514 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2515
2516 // Analyze operands of the call, assigning locations to each operand.
2517 SmallVector<CCValAssign, 16> ArgLocs;
2518 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2519 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002520
2521 // The first 4 bytes are reserved for the callee's emergency stack slot.
2522 CCInfo.AllocateStack(4, 4);
2523
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002524 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2525
2526 // Get a count of how many bytes are to be pushed on the stack.
2527 unsigned NumBytes = CCInfo.getNextStackOffset();
2528
2529 if (IsSibCall) {
2530 // Since we're not changing the ABI to make this a tail call, the memory
2531 // operands are already available in the caller's incoming argument space.
2532 NumBytes = 0;
2533 }
2534
2535 // FPDiff is the byte offset of the call's argument area from the callee's.
2536 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2537 // by this amount for a tail call. In a sibling call it must be 0 because the
2538 // caller will deallocate the entire stack and the callee still expects its
2539 // arguments to begin at SP+0. Completely unused for non-tail calls.
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002540 int32_t FPDiff = 0;
2541 MachineFrameInfo &MFI = MF.getFrameInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002542 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2543
Matt Arsenault6efd0822017-09-14 17:14:57 +00002544 SDValue CallerSavedFP;
2545
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002546 // Adjust the stack pointer for the new arguments...
2547 // These operations are automatically eliminated by the prolog/epilog pass
2548 if (!IsSibCall) {
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002549 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002550
2551 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2552
2553 // In the HSA case, this should be an identity copy.
2554 SDValue ScratchRSrcReg
2555 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2556 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2557
2558    // TODO: Don't hardcode these registers; get them from the callee function.
2559 SDValue ScratchWaveOffsetReg
2560 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2561 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
Matt Arsenault6efd0822017-09-14 17:14:57 +00002562
2563 if (!Info->isEntryFunction()) {
2564 // Avoid clobbering this function's FP value. In the current convention
2565 // callee will overwrite this, so do save/restore around the call site.
2566 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2567 Info->getFrameOffsetReg(), MVT::i32);
2568 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002569 }
2570
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002571 SmallVector<SDValue, 8> MemOpChains;
2572 MVT PtrVT = MVT::i32;
2573
2574 // Walk the register/memloc assignments, inserting copies/loads.
2575 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2576 ++i, ++realArgIdx) {
2577 CCValAssign &VA = ArgLocs[i];
2578 SDValue Arg = OutVals[realArgIdx];
2579
2580 // Promote the value if needed.
2581 switch (VA.getLocInfo()) {
2582 case CCValAssign::Full:
2583 break;
2584 case CCValAssign::BCvt:
2585 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2586 break;
2587 case CCValAssign::ZExt:
2588 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2589 break;
2590 case CCValAssign::SExt:
2591 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2592 break;
2593 case CCValAssign::AExt:
2594 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2595 break;
2596 case CCValAssign::FPExt:
2597 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2598 break;
2599 default:
2600 llvm_unreachable("Unknown loc info!");
2601 }
2602
2603 if (VA.isRegLoc()) {
2604 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2605 } else {
2606 assert(VA.isMemLoc());
2607
2608 SDValue DstAddr;
2609 MachinePointerInfo DstInfo;
2610
2611 unsigned LocMemOffset = VA.getLocMemOffset();
2612 int32_t Offset = LocMemOffset;
Matt Arsenaultb655fa92017-11-29 01:25:12 +00002613
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002614 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002615 unsigned Align = 0;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002616
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002617 if (IsTailCall) {
2618 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2619 unsigned OpSize = Flags.isByVal() ?
2620 Flags.getByValSize() : VA.getValVT().getStoreSize();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002621
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002622        // FIXME: We can do better than the minimum required byval alignment.
2623 Align = Flags.isByVal() ? Flags.getByValAlign() :
2624 MinAlign(Subtarget->getStackAlignment(), Offset);
2625
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002626 Offset = Offset + FPDiff;
2627 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2628
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002629 DstAddr = DAG.getFrameIndex(FI, PtrVT);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002630 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2631
2632 // Make sure any stack arguments overlapping with where we're storing
2633 // are loaded before this eventual operation. Otherwise they'll be
2634 // clobbered.
2635
2636 // FIXME: Why is this really necessary? This seems to just result in a
2637 // lot of code to copy the stack and write them back to the same
2638 // locations, which are supposed to be immutable?
2639 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2640 } else {
2641 DstAddr = PtrOff;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002642 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002643 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002644 }
2645
2646 if (Outs[i].Flags.isByVal()) {
2647 SDValue SizeNode =
2648 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2649 SDValue Cpy = DAG.getMemcpy(
2650 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2651 /*isVol = */ false, /*AlwaysInline = */ true,
Yaxun Liuc5962262017-11-22 16:13:35 +00002652 /*isTailCall = */ false, DstInfo,
2653 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
Matt Arsenault0da63502018-08-31 05:49:54 +00002654 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002655
2656 MemOpChains.push_back(Cpy);
2657 } else {
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002658 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002659 MemOpChains.push_back(Store);
2660 }
2661 }
2662 }
2663
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002664 // Copy special input registers after user input arguments.
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002665 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002666
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002667 if (!MemOpChains.empty())
2668 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2669
2670 // Build a sequence of copy-to-reg nodes chained together with token chain
2671 // and flag operands which copy the outgoing args into the appropriate regs.
2672 SDValue InFlag;
2673 for (auto &RegToPass : RegsToPass) {
2674 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2675 RegToPass.second, InFlag);
2676 InFlag = Chain.getValue(1);
2677 }
2678
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002679
2680 SDValue PhysReturnAddrReg;
2681 if (IsTailCall) {
2682 // Since the return is being combined with the call, we need to pass on the
2683 // return address.
2684
2685 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2686 SDValue ReturnAddrReg = CreateLiveInRegister(
2687 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2688
2689 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2690 MVT::i64);
2691 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2692 InFlag = Chain.getValue(1);
2693 }
2694
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002695 // We don't usually want to end the call-sequence here because we would tidy
2696  // the frame up *after* the call; however, in the ABI-changing tail-call case
2697 // we've carefully laid out the parameters so that when sp is reset they'll be
2698 // in the correct location.
2699 if (IsTailCall && !IsSibCall) {
2700 Chain = DAG.getCALLSEQ_END(Chain,
2701 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2702 DAG.getTargetConstant(0, DL, MVT::i32),
2703 InFlag, DL);
2704 InFlag = Chain.getValue(1);
2705 }
2706
2707 std::vector<SDValue> Ops;
2708 Ops.push_back(Chain);
2709 Ops.push_back(Callee);
Scott Linderd19d1972019-02-04 20:00:07 +00002710 // Add a redundant copy of the callee global which will not be legalized, as
2711 // we need direct access to the callee later.
2712 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2713 const GlobalValue *GV = GSD->getGlobal();
2714 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002715
2716 if (IsTailCall) {
2717 // Each tail call may have to adjust the stack by a different amount, so
2718 // this information must travel along with the operation for eventual
2719 // consumption by emitEpilogue.
2720 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002721
2722 Ops.push_back(PhysReturnAddrReg);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002723 }
2724
2725 // Add argument registers to the end of the list so that they are known live
2726 // into the call.
2727 for (auto &RegToPass : RegsToPass) {
2728 Ops.push_back(DAG.getRegister(RegToPass.first,
2729 RegToPass.second.getValueType()));
2730 }
2731
2732 // Add a register mask operand representing the call-preserved registers.
2733
Tom Stellardc5a154d2018-06-28 23:47:12 +00002734 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002735 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2736 assert(Mask && "Missing call preserved mask for calling convention");
2737 Ops.push_back(DAG.getRegisterMask(Mask));
2738
2739 if (InFlag.getNode())
2740 Ops.push_back(InFlag);
2741
2742 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2743
2744  // If we're doing a tail call, use a TC_RETURN here rather than an
2745 // actual call instruction.
2746 if (IsTailCall) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002747 MFI.setHasTailCall();
2748 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002749 }
2750
2751 // Returns a chain and a flag for retval copy to use.
2752 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2753 Chain = Call.getValue(0);
2754 InFlag = Call.getValue(1);
2755
Matt Arsenault6efd0822017-09-14 17:14:57 +00002756 if (CallerSavedFP) {
2757 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2758 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2759 InFlag = Chain.getValue(1);
2760 }
2761
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002762 uint64_t CalleePopBytes = NumBytes;
2763 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002764 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2765 InFlag, DL);
2766 if (!Ins.empty())
2767 InFlag = Chain.getValue(1);
2768
2769 // Handle result values, copying them out of physregs into vregs that we
2770 // return.
2771 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2772 InVals, IsThisReturn,
2773 IsThisReturn ? OutVals[0] : SDValue());
2774}
2775
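// This hook implements the named-register intrinsics. A minimal IR-level use
// (illustrative only) is:
//   %e = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
// which maps the name "exec" to AMDGPU::EXEC below.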
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002776unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2777 SelectionDAG &DAG) const {
2778 unsigned Reg = StringSwitch<unsigned>(RegName)
2779 .Case("m0", AMDGPU::M0)
2780 .Case("exec", AMDGPU::EXEC)
2781 .Case("exec_lo", AMDGPU::EXEC_LO)
2782 .Case("exec_hi", AMDGPU::EXEC_HI)
2783 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2784 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2785 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2786 .Default(AMDGPU::NoRegister);
2787
2788 if (Reg == AMDGPU::NoRegister) {
2789 report_fatal_error(Twine("invalid register name \""
2790 + StringRef(RegName) + "\"."));
2791
2792 }
2793
Tom Stellard5bfbae52018-07-11 20:59:01 +00002794 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002795 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2796 report_fatal_error(Twine("invalid register \""
2797 + StringRef(RegName) + "\" for subtarget."));
2798 }
2799
2800 switch (Reg) {
2801 case AMDGPU::M0:
2802 case AMDGPU::EXEC_LO:
2803 case AMDGPU::EXEC_HI:
2804 case AMDGPU::FLAT_SCR_LO:
2805 case AMDGPU::FLAT_SCR_HI:
2806 if (VT.getSizeInBits() == 32)
2807 return Reg;
2808 break;
2809 case AMDGPU::EXEC:
2810 case AMDGPU::FLAT_SCR:
2811 if (VT.getSizeInBits() == 64)
2812 return Reg;
2813 break;
2814 default:
2815 llvm_unreachable("missing register type checking");
2816 }
2817
2818 report_fatal_error(Twine("invalid type for register \""
2819 + StringRef(RegName) + "\"."));
2820}
2821
Matt Arsenault786724a2016-07-12 21:41:32 +00002822// If kill is not the last instruction, split the block so kill is always a
2823// proper terminator.
2824MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2825 MachineBasicBlock *BB) const {
2826 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2827
2828 MachineBasicBlock::iterator SplitPoint(&MI);
2829 ++SplitPoint;
2830
2831 if (SplitPoint == BB->end()) {
2832 // Don't bother with a new block.
Marek Olsakce76ea02017-10-24 10:27:13 +00002833 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002834 return BB;
2835 }
2836
2837 MachineFunction *MF = BB->getParent();
2838 MachineBasicBlock *SplitBB
2839 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2840
Matt Arsenault786724a2016-07-12 21:41:32 +00002841 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2842 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2843
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002844 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
Matt Arsenault786724a2016-07-12 21:41:32 +00002845 BB->addSuccessor(SplitBB);
2846
Marek Olsakce76ea02017-10-24 10:27:13 +00002847 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002848 return SplitBB;
2849}
2850
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002851// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2852// wavefront. If the value is uniform and just happens to be in a VGPR, this
2853// will only do one iteration. In the worst case, this will loop 64 times.
2854//
2855// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
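//
// A rough sketch of the loop this emits (illustrative only; the real code
// below also handles GPR-index mode and a nonzero constant offset):
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx         ; read one lane's index
//     v_cmp_eq_u32        vcc, s_idx, v_idx    ; lanes holding that same index
//     s_and_saveexec_b64  s_save, vcc          ; restrict exec to those lanes
//     s_mov_b32           m0, s_idx            ; (or s_set_gpr_idx_on)
//     ...                                      ; indexed access happens here
//     s_xor_b64           exec, exec, s_save   ; turn the handled lanes off
//     s_cbranch_execnz    loop                 ; loop while any lanes remain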
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002856static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2857 const SIInstrInfo *TII,
2858 MachineRegisterInfo &MRI,
2859 MachineBasicBlock &OrigBB,
2860 MachineBasicBlock &LoopBB,
2861 const DebugLoc &DL,
2862 const MachineOperand &IdxReg,
2863 unsigned InitReg,
2864 unsigned ResultReg,
2865 unsigned PhiReg,
2866 unsigned InitSaveExecReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002867 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002868 bool UseGPRIdxMode,
2869 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002870 MachineBasicBlock::iterator I = LoopBB.begin();
2871
2872 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2873 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2874 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2875 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2876
2877 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2878 .addReg(InitReg)
2879 .addMBB(&OrigBB)
2880 .addReg(ResultReg)
2881 .addMBB(&LoopBB);
2882
2883 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2884 .addReg(InitSaveExecReg)
2885 .addMBB(&OrigBB)
2886 .addReg(NewExec)
2887 .addMBB(&LoopBB);
2888
2889 // Read the next variant <- also loop target.
2890 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2891 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2892
2893 // Compare the just read M0 value to all possible Idx values.
2894 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2895 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00002896 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002897
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002898  // Update EXEC, saving the original EXEC value into NewExec.
2899 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2900 .addReg(CondReg, RegState::Kill);
2901
2902 MRI.setSimpleHint(NewExec, CondReg);
2903
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002904 if (UseGPRIdxMode) {
2905 unsigned IdxReg;
2906 if (Offset == 0) {
2907 IdxReg = CurrentIdxReg;
2908 } else {
2909 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2910 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2911 .addReg(CurrentIdxReg, RegState::Kill)
2912 .addImm(Offset);
2913 }
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002914 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00002915 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002916 MachineInstr *SetOn =
2917 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2918 .addReg(IdxReg, RegState::Kill)
2919 .addImm(IdxMode);
2920 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002921 } else {
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002922 // Move index from VCC into M0
2923 if (Offset == 0) {
2924 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2925 .addReg(CurrentIdxReg, RegState::Kill);
2926 } else {
2927 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2928 .addReg(CurrentIdxReg, RegState::Kill)
2929 .addImm(Offset);
2930 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002931 }
2932
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002933 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002934 MachineInstr *InsertPt =
Scott Lindere2c58472019-02-05 19:50:32 +00002935 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002936 .addReg(AMDGPU::EXEC)
2937 .addReg(NewExec);
2938
2939 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2940 // s_cbranch_scc0?
2941
2942 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2943 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2944 .addMBB(&LoopBB);
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002945
2946 return InsertPt->getIterator();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002947}
2948
2949// This has slightly sub-optimal regalloc when the source vector is killed by
2950// the read. The register allocator does not understand that the kill is
2951// per-workitem, so the vector is kept alive for the whole loop and we end up not
2952// re-using a subregister from it, using 1 more VGPR than necessary. That extra
2953// VGPR was avoided back when this was expanded after register allocation.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002954static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2955 MachineBasicBlock &MBB,
2956 MachineInstr &MI,
2957 unsigned InitResultReg,
2958 unsigned PhiReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002959 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002960 bool UseGPRIdxMode,
2961 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002962 MachineFunction *MF = MBB.getParent();
2963 MachineRegisterInfo &MRI = MF->getRegInfo();
2964 const DebugLoc &DL = MI.getDebugLoc();
2965 MachineBasicBlock::iterator I(&MI);
2966
2967 unsigned DstReg = MI.getOperand(0).getReg();
Matt Arsenault301162c2017-11-15 21:51:43 +00002968 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2969 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002970
2971 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2972
2973 // Save the EXEC mask
2974 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2975 .addReg(AMDGPU::EXEC);
2976
2977 // To insert the loop we need to split the block. Move everything after this
2978 // point to a new block, and insert a new empty block between the two.
2979 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2980 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2981 MachineFunction::iterator MBBI(MBB);
2982 ++MBBI;
2983
2984 MF->insert(MBBI, LoopBB);
2985 MF->insert(MBBI, RemainderBB);
2986
2987 LoopBB->addSuccessor(LoopBB);
2988 LoopBB->addSuccessor(RemainderBB);
2989
2990 // Move the rest of the block into a new block.
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002991 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002992 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2993
2994 MBB.addSuccessor(LoopBB);
2995
2996 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2997
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002998 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2999 InitResultReg, DstReg, PhiReg, TmpExec,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003000 Offset, UseGPRIdxMode, IsIndirectSrc);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003001
3002 MachineBasicBlock::iterator First = RemainderBB->begin();
3003 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3004 .addReg(SaveExec);
3005
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003006 return InsPt;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003007}
3008
3009// Returns subreg index, offset
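// For example (illustrative): indexing a 128-bit register class (4 x 32-bit
// elements) with a constant offset of 2 yields (sub2, 0), while an
// out-of-bounds or negative offset is returned unchanged as (sub0, Offset).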
3010static std::pair<unsigned, int>
3011computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3012 const TargetRegisterClass *SuperRC,
3013 unsigned VecReg,
3014 int Offset) {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003015 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003016
3017 // Skip out of bounds offsets, or else we would end up using an undefined
3018 // register.
3019 if (Offset >= NumElts || Offset < 0)
3020 return std::make_pair(AMDGPU::sub0, Offset);
3021
3022 return std::make_pair(AMDGPU::sub0 + Offset, 0);
3023}
3024
3025// Return true if the index is an SGPR and was set.
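// On subtargets with GPR-index mode this emits s_set_gpr_idx_on instead of
// writing M0; with a nonzero constant offset the index is first adjusted,
// roughly (illustrative):
//   s_add_i32        s_tmp, s_idx, offset
//   s_set_gpr_idx_on s_tmp, SRC0    ; or DST for the indirect-store case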
3026static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3027 MachineRegisterInfo &MRI,
3028 MachineInstr &MI,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003029 int Offset,
3030 bool UseGPRIdxMode,
3031 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003032 MachineBasicBlock *MBB = MI.getParent();
3033 const DebugLoc &DL = MI.getDebugLoc();
3034 MachineBasicBlock::iterator I(&MI);
3035
3036 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3037 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3038
3039 assert(Idx->getReg() != AMDGPU::NoRegister);
3040
3041 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3042 return false;
3043
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003044 if (UseGPRIdxMode) {
3045 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003046 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003047 if (Offset == 0) {
3048 MachineInstr *SetOn =
Diana Picus116bbab2017-01-13 09:58:52 +00003049 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3050 .add(*Idx)
3051 .addImm(IdxMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003052
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003053 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003054 } else {
3055 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3056 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
Diana Picus116bbab2017-01-13 09:58:52 +00003057 .add(*Idx)
3058 .addImm(Offset);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003059 MachineInstr *SetOn =
3060 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3061 .addReg(Tmp, RegState::Kill)
3062 .addImm(IdxMode);
3063
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003064 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003065 }
3066
3067 return true;
3068 }
3069
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003070 if (Offset == 0) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003071 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3072 .add(*Idx);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003073 } else {
3074 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003075 .add(*Idx)
3076 .addImm(Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003077 }
3078
3079 return true;
3080}
3081
3082// Control flow needs to be inserted if indexing with a VGPR.
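// For example (illustrative), a dynamic extractelement such as
//   %elt = extractelement <16 x float> %vec, i32 %idx
// where %idx lives in a VGPR cannot be indexed through M0 directly (M0 is a
// scalar register), so a waterfall loop over the distinct index values is
// emitted instead.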
3083static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3084 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003085 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003086 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003087 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3088 MachineFunction *MF = MBB.getParent();
3089 MachineRegisterInfo &MRI = MF->getRegInfo();
3090
3091 unsigned Dst = MI.getOperand(0).getReg();
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003092 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003093 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3094
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003095 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003096
3097 unsigned SubReg;
3098 std::tie(SubReg, Offset)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003099 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003100
Marek Olsake22fdb92017-03-21 17:00:32 +00003101 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003102
3103 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003104 MachineBasicBlock::iterator I(&MI);
3105 const DebugLoc &DL = MI.getDebugLoc();
3106
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003107 if (UseGPRIdxMode) {
3108 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3109 // to avoid interfering with other uses, so probably requires a new
3110 // optimization pass.
3111 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003112 .addReg(SrcReg, RegState::Undef, SubReg)
3113 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003114 .addReg(AMDGPU::M0, RegState::Implicit);
3115 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3116 } else {
3117 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003118 .addReg(SrcReg, RegState::Undef, SubReg)
3119 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003120 }
3121
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003122 MI.eraseFromParent();
3123
3124 return &MBB;
3125 }
3126
3127 const DebugLoc &DL = MI.getDebugLoc();
3128 MachineBasicBlock::iterator I(&MI);
3129
3130 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3131 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3132
3133 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3134
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003135 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3136 Offset, UseGPRIdxMode, true);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003137 MachineBasicBlock *LoopBB = InsPt->getParent();
3138
3139 if (UseGPRIdxMode) {
3140 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003141 .addReg(SrcReg, RegState::Undef, SubReg)
3142 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003143 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003144 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003145 } else {
3146 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003147 .addReg(SrcReg, RegState::Undef, SubReg)
3148 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003149 }
3150
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003151 MI.eraseFromParent();
3152
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003153 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003154}
3155
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003156static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3157 const TargetRegisterClass *VecRC) {
3158 switch (TRI.getRegSizeInBits(*VecRC)) {
3159 case 32: // 4 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003160 return AMDGPU::V_MOVRELD_B32_V1;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003161 case 64: // 8 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003162 return AMDGPU::V_MOVRELD_B32_V2;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003163 case 128: // 16 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003164 return AMDGPU::V_MOVRELD_B32_V4;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003165 case 256: // 32 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003166 return AMDGPU::V_MOVRELD_B32_V8;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003167 case 512: // 64 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003168 return AMDGPU::V_MOVRELD_B32_V16;
3169 default:
3170 llvm_unreachable("unsupported size for MOVRELD pseudos");
3171 }
3172}
3173
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003174static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3175 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003176 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003177 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003178 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3179 MachineFunction *MF = MBB.getParent();
3180 MachineRegisterInfo &MRI = MF->getRegInfo();
3181
3182 unsigned Dst = MI.getOperand(0).getReg();
3183 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3184 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3185 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3186 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3187 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3188
3189 // This can be an immediate, but will be folded later.
3190 assert(Val->getReg());
3191
3192 unsigned SubReg;
3193 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3194 SrcVec->getReg(),
3195 Offset);
Marek Olsake22fdb92017-03-21 17:00:32 +00003196 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003197
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003198 if (Idx->getReg() == AMDGPU::NoRegister) {
3199 MachineBasicBlock::iterator I(&MI);
3200 const DebugLoc &DL = MI.getDebugLoc();
3201
3202 assert(Offset == 0);
3203
3204 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
Diana Picus116bbab2017-01-13 09:58:52 +00003205 .add(*SrcVec)
3206 .add(*Val)
3207 .addImm(SubReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003208
3209 MI.eraseFromParent();
3210 return &MBB;
3211 }
3212
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003213 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003214 MachineBasicBlock::iterator I(&MI);
3215 const DebugLoc &DL = MI.getDebugLoc();
3216
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003217 if (UseGPRIdxMode) {
3218 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003219 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3220 .add(*Val)
3221 .addReg(Dst, RegState::ImplicitDefine)
3222 .addReg(SrcVec->getReg(), RegState::Implicit)
3223 .addReg(AMDGPU::M0, RegState::Implicit);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003224
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003225 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3226 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003227 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003228
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003229 BuildMI(MBB, I, DL, MovRelDesc)
3230 .addReg(Dst, RegState::Define)
3231 .addReg(SrcVec->getReg())
Diana Picus116bbab2017-01-13 09:58:52 +00003232 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003233 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003234 }
3235
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003236 MI.eraseFromParent();
3237 return &MBB;
3238 }
3239
3240 if (Val->isReg())
3241 MRI.clearKillFlags(Val->getReg());
3242
3243 const DebugLoc &DL = MI.getDebugLoc();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003244
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003245 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3246
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003247 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003248 Offset, UseGPRIdxMode, false);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003249 MachineBasicBlock *LoopBB = InsPt->getParent();
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003250
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003251 if (UseGPRIdxMode) {
3252 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003253 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3254 .add(*Val) // src0
3255 .addReg(Dst, RegState::ImplicitDefine)
3256 .addReg(PhiReg, RegState::Implicit)
3257 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003258 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003259 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003260 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003261
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003262 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3263 .addReg(Dst, RegState::Define)
3264 .addReg(PhiReg)
Diana Picus116bbab2017-01-13 09:58:52 +00003265 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003266 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003267 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003268
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003269 MI.eraseFromParent();
3270
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003271 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003272}
3273
Matt Arsenault786724a2016-07-12 21:41:32 +00003274MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3275 MachineInstr &MI, MachineBasicBlock *BB) const {
Tom Stellard244891d2016-12-20 15:52:17 +00003276
3277 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3278 MachineFunction *MF = BB->getParent();
3279 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3280
3281 if (TII->isMIMG(MI)) {
Matt Arsenault905f3512017-12-29 17:18:14 +00003282 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3283 report_fatal_error("missing mem operand from MIMG instruction");
3284 }
Tom Stellard244891d2016-12-20 15:52:17 +00003285 // Add a memoperand for mimg instructions so that they aren't assumed to
3286    // be ordered memory instructions.
3287
Tom Stellard244891d2016-12-20 15:52:17 +00003288 return BB;
3289 }
3290
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003291 switch (MI.getOpcode()) {
Matt Arsenault301162c2017-11-15 21:51:43 +00003292 case AMDGPU::S_ADD_U64_PSEUDO:
3293 case AMDGPU::S_SUB_U64_PSEUDO: {
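    // The 64-bit scalar add/sub is split into a low half and a carry-chained
    // high half, roughly (illustrative):
    //   s_add_u32  dst.sub0, src0.sub0, src1.sub0
    //   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
    // (s_sub_u32 / s_subb_u32 for the subtract case).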
3294 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3295 const DebugLoc &DL = MI.getDebugLoc();
3296
3297 MachineOperand &Dest = MI.getOperand(0);
3298 MachineOperand &Src0 = MI.getOperand(1);
3299 MachineOperand &Src1 = MI.getOperand(2);
3300
3301 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3302 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3303
3304 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3305 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3306 &AMDGPU::SReg_32_XM0RegClass);
3307 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3308 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3309 &AMDGPU::SReg_32_XM0RegClass);
3310
3311 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3312 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3313 &AMDGPU::SReg_32_XM0RegClass);
3314 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3315 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3316 &AMDGPU::SReg_32_XM0RegClass);
3317
3318 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3319
3320 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3321 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3322 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3323 .add(Src0Sub0)
3324 .add(Src1Sub0);
3325 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3326 .add(Src0Sub1)
3327 .add(Src1Sub1);
3328 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3329 .addReg(DestSub0)
3330 .addImm(AMDGPU::sub0)
3331 .addReg(DestSub1)
3332 .addImm(AMDGPU::sub1);
3333 MI.eraseFromParent();
3334 return BB;
3335 }
3336 case AMDGPU::SI_INIT_M0: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003337 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00003338 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Diana Picus116bbab2017-01-13 09:58:52 +00003339 .add(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003340 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00003341 return BB;
Matt Arsenault301162c2017-11-15 21:51:43 +00003342 }
Marek Olsak2d825902017-04-28 20:21:58 +00003343 case AMDGPU::SI_INIT_EXEC:
3344 // This should be before all vector instructions.
3345 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3346 AMDGPU::EXEC)
3347 .addImm(MI.getOperand(0).getImm());
3348 MI.eraseFromParent();
3349 return BB;
3350
3351 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3352 // Extract the thread count from an SGPR input and set EXEC accordingly.
3353 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3354 //
3355 // S_BFE_U32 count, input, {shift, 7}
3356 // S_BFM_B64 exec, count, 0
3357 // S_CMP_EQ_U32 count, 64
3358 // S_CMOV_B64 exec, -1
3359 MachineInstr *FirstMI = &*BB->begin();
3360 MachineRegisterInfo &MRI = MF->getRegInfo();
3361 unsigned InputReg = MI.getOperand(0).getReg();
3362 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3363 bool Found = false;
3364
3365 // Move the COPY of the input reg to the beginning, so that we can use it.
3366 for (auto I = BB->begin(); I != &MI; I++) {
3367 if (I->getOpcode() != TargetOpcode::COPY ||
3368 I->getOperand(0).getReg() != InputReg)
3369 continue;
3370
3371 if (I == FirstMI) {
3372 FirstMI = &*++BB->begin();
3373 } else {
3374 I->removeFromParent();
3375 BB->insert(FirstMI, &*I);
3376 }
3377 Found = true;
3378 break;
3379 }
3380 assert(Found);
Davide Italiano0dcc0152017-05-11 19:58:52 +00003381 (void)Found;
Marek Olsak2d825902017-04-28 20:21:58 +00003382
3383 // This should be before all vector instructions.
3384 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3385 .addReg(InputReg)
3386 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3387 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3388 AMDGPU::EXEC)
3389 .addReg(CountReg)
3390 .addImm(0);
3391 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3392 .addReg(CountReg, RegState::Kill)
3393 .addImm(64);
3394 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3395 AMDGPU::EXEC)
3396 .addImm(-1);
3397 MI.eraseFromParent();
3398 return BB;
3399 }
3400
Changpeng Fang01f60622016-03-15 17:28:44 +00003401 case AMDGPU::GET_GROUPSTATICSIZE: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003402 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00003403 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
Diana Picus116bbab2017-01-13 09:58:52 +00003404 .add(MI.getOperand(0))
3405 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003406 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00003407 return BB;
3408 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003409 case AMDGPU::SI_INDIRECT_SRC_V1:
3410 case AMDGPU::SI_INDIRECT_SRC_V2:
3411 case AMDGPU::SI_INDIRECT_SRC_V4:
3412 case AMDGPU::SI_INDIRECT_SRC_V8:
3413 case AMDGPU::SI_INDIRECT_SRC_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003414 return emitIndirectSrc(MI, *BB, *getSubtarget());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003415 case AMDGPU::SI_INDIRECT_DST_V1:
3416 case AMDGPU::SI_INDIRECT_DST_V2:
3417 case AMDGPU::SI_INDIRECT_DST_V4:
3418 case AMDGPU::SI_INDIRECT_DST_V8:
3419 case AMDGPU::SI_INDIRECT_DST_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003420 return emitIndirectDst(MI, *BB, *getSubtarget());
Marek Olsakce76ea02017-10-24 10:27:13 +00003421 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3422 case AMDGPU::SI_KILL_I1_PSEUDO:
Matt Arsenault786724a2016-07-12 21:41:32 +00003423 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00003424 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
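    // The 64-bit conditional move is split into two 32-bit selects sharing the
    // same condition, roughly (illustrative):
    //   v_cndmask_b32 dst.sub0, src0.sub0, src1.sub0, cond
    //   v_cndmask_b32 dst.sub1, src0.sub1, src1.sub1, cond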
3425 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Matt Arsenault22e41792016-08-27 01:00:37 +00003426
3427 unsigned Dst = MI.getOperand(0).getReg();
3428 unsigned Src0 = MI.getOperand(1).getReg();
3429 unsigned Src1 = MI.getOperand(2).getReg();
3430 const DebugLoc &DL = MI.getDebugLoc();
3431 unsigned SrcCond = MI.getOperand(3).getReg();
3432
3433 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3434 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003435 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenault22e41792016-08-27 01:00:37 +00003436
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003437 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3438 .addReg(SrcCond);
Matt Arsenault22e41792016-08-27 01:00:37 +00003439 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3440 .addReg(Src0, 0, AMDGPU::sub0)
3441 .addReg(Src1, 0, AMDGPU::sub0)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003442 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003443 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3444 .addReg(Src0, 0, AMDGPU::sub1)
3445 .addReg(Src1, 0, AMDGPU::sub1)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003446 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003447
3448 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3449 .addReg(DstLo)
3450 .addImm(AMDGPU::sub0)
3451 .addReg(DstHi)
3452 .addImm(AMDGPU::sub1);
3453 MI.eraseFromParent();
3454 return BB;
3455 }
Matt Arsenault327188a2016-12-15 21:57:11 +00003456 case AMDGPU::SI_BR_UNDEF: {
3457 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3458 const DebugLoc &DL = MI.getDebugLoc();
3459 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
Diana Picus116bbab2017-01-13 09:58:52 +00003460 .add(MI.getOperand(0));
Matt Arsenault327188a2016-12-15 21:57:11 +00003461 Br->getOperand(1).setIsUndef(true); // read undef SCC
3462 MI.eraseFromParent();
3463 return BB;
3464 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003465 case AMDGPU::ADJCALLSTACKUP:
3466 case AMDGPU::ADJCALLSTACKDOWN: {
3467 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3468 MachineInstrBuilder MIB(*MF, &MI);
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003469
3470 // Add an implicit use of the frame offset reg to prevent the restore copy
3471 // inserted after the call from being reorderd after stack operations in the
3472 // the caller's frame.
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003473 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003474 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3475 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003476 return BB;
3477 }
Scott Linderd19d1972019-02-04 20:00:07 +00003478 case AMDGPU::SI_CALL_ISEL: {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003479 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3480 const DebugLoc &DL = MI.getDebugLoc();
Scott Linderd19d1972019-02-04 20:00:07 +00003481
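    // Rewrite the isel pseudo into a real SI_CALL with the return address
    // register as the explicit result, then copy over the remaining operands
    // and memory operands.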
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003482 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003483
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003484 MachineInstrBuilder MIB;
Scott Linderd19d1972019-02-04 20:00:07 +00003485 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003486
Scott Linderd19d1972019-02-04 20:00:07 +00003487 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003488 MIB.add(MI.getOperand(I));
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003489
Chandler Carruthc73c0302018-08-16 21:30:05 +00003490 MIB.cloneMemRefs(MI);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003491 MI.eraseFromParent();
3492 return BB;
3493 }
Changpeng Fang01f60622016-03-15 17:28:44 +00003494 default:
3495 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00003496 }
Tom Stellard75aadc22012-12-11 21:25:42 +00003497}
3498
Matt Arsenaulte11d8ac2017-10-13 21:10:22 +00003499bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3500 return isTypeLegal(VT.getScalarType());
3501}
3502
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003503bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3504 // This currently forces unfolding various combinations of fsub into fma with
3505 // free fneg'd operands. As long as we have fast FMA (controlled by
3506 // isFMAFasterThanFMulAndFAdd), we should perform these.
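  // For example, (fsub x, (fmul y, z)) can then be turned into
  // (fma (fneg y), z, x), since the fneg is free as a source modifier.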
3507
3508 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3509 // most of these combines appear to be cycle neutral but save on instruction
3510 // count / code size.
3511 return true;
3512}
3513
Mehdi Amini44ede332015-07-09 02:09:04 +00003514EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3515 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00003516 if (!VT.isVector()) {
3517 return MVT::i1;
3518 }
Matt Arsenault8596f712014-11-28 22:51:38 +00003519 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00003520}
3521
Matt Arsenault94163282016-12-22 16:36:25 +00003522MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3523 // TODO: Should i16 be used always if legal? For now it would force VALU
3524 // shifts.
3525 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
Christian Konig082a14a2013-03-18 11:34:05 +00003526}
3527
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003528// Answering this is somewhat tricky and depends on the specific device, since
3529// devices have different rates for fma and for f64 operations in general.
3530//
3531// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3532// regardless of which device (although the number of cycles differs between
3533// devices), so it is always profitable for f64.
3534//
3535// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3536// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3537// which we can always do even without fused FP ops since it returns the same
3538// result as the separate operations and since it is always full
3539// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3540// however does not support denormals, so we do report fma as faster if we have
3541// a fast fma device and require denormals.
3542//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003543bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3544 VT = VT.getScalarType();
3545
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003546 switch (VT.getSimpleVT().SimpleTy) {
Matt Arsenault0084adc2018-04-30 19:08:16 +00003547 case MVT::f32: {
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003548 // This is as fast on some subtargets. However, we always have full rate f32
3549 // mad available, which returns the same result as the separate operations and
Matt Arsenault8d630032015-02-20 22:10:41 +00003550 // which we should prefer over fma. We can't use mad if we want to support
3551 // denormals, so only report fma as faster in that case.
Matt Arsenault0084adc2018-04-30 19:08:16 +00003552 if (Subtarget->hasFP32Denormals())
3553 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3554
3555 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3556 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3557 }
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003558 case MVT::f64:
3559 return true;
Matt Arsenault9e22bc22016-12-22 03:21:48 +00003560 case MVT::f16:
3561 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003562 default:
3563 break;
3564 }
3565
3566 return false;
3567}
3568
Tom Stellard75aadc22012-12-11 21:25:42 +00003569//===----------------------------------------------------------------------===//
3570// Custom DAG Lowering Operations
3571//===----------------------------------------------------------------------===//
3572
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003573// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3574// wider vector type is legal.
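// e.g. a v4f16 (fabs x) becomes
//   (concat_vectors (fabs (lo half of x)), (fabs (hi half of x))).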
3575SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3576 SelectionDAG &DAG) const {
3577 unsigned Opc = Op.getOpcode();
3578 EVT VT = Op.getValueType();
3579 assert(VT == MVT::v4f16);
3580
3581 SDValue Lo, Hi;
3582 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3583
3584 SDLoc SL(Op);
3585 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3586 Op->getFlags());
3587 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3588 Op->getFlags());
3589
3590 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3591}
3592
3593// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3594// wider vector type is legal.
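// e.g. a v4i16 (add a, b) becomes
//   (concat_vectors (add a.lo, b.lo), (add a.hi, b.hi)).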
3595SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3596 SelectionDAG &DAG) const {
3597 unsigned Opc = Op.getOpcode();
3598 EVT VT = Op.getValueType();
3599 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3600
3601 SDValue Lo0, Hi0;
3602 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3603 SDValue Lo1, Hi1;
3604 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3605
3606 SDLoc SL(Op);
3607
3608 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3609 Op->getFlags());
3610 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3611 Op->getFlags());
3612
3613 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3614}
3615
Tom Stellard75aadc22012-12-11 21:25:42 +00003616SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3617 switch (Op.getOpcode()) {
3618 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00003619 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00003620 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00003621 SDValue Result = LowerLOAD(Op, DAG);
3622 assert((!Result.getNode() ||
3623 Result.getNode()->getNumValues() == 2) &&
3624 "Load should return a value and a chain");
3625 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00003626 }
Tom Stellardaf775432013-10-23 00:44:32 +00003627
Matt Arsenaultad14ce82014-07-19 18:44:39 +00003628 case ISD::FSIN:
3629 case ISD::FCOS:
3630 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00003631 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00003632 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00003633 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00003634 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003635 case ISD::GlobalAddress: {
3636 MachineFunction &MF = DAG.getMachineFunction();
3637 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3638 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00003639 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003640 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003641 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003642 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00003643 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault3aef8092017-01-23 23:09:58 +00003644 case ISD::INSERT_VECTOR_ELT:
3645 return lowerINSERT_VECTOR_ELT(Op, DAG);
3646 case ISD::EXTRACT_VECTOR_ELT:
3647 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
Matt Arsenault67a98152018-05-16 11:47:30 +00003648 case ISD::BUILD_VECTOR:
3649 return lowerBUILD_VECTOR(Op, DAG);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00003650 case ISD::FP_ROUND:
3651 return lowerFP_ROUND(Op, DAG);
Matt Arsenault3e025382017-04-24 17:49:13 +00003652 case ISD::TRAP:
Matt Arsenault3e025382017-04-24 17:49:13 +00003653 return lowerTRAP(Op, DAG);
Tony Tye43259df2018-05-16 16:19:34 +00003654 case ISD::DEBUGTRAP:
3655 return lowerDEBUGTRAP(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003656 case ISD::FABS:
3657 case ISD::FNEG:
Matt Arsenault36cdcfa2018-08-02 13:43:42 +00003658 case ISD::FCANONICALIZE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003659 return splitUnaryVectorOp(Op, DAG);
Matt Arsenault687ec752018-10-22 16:27:27 +00003660 case ISD::FMINNUM:
3661 case ISD::FMAXNUM:
3662 return lowerFMINNUM_FMAXNUM(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003663 case ISD::SHL:
3664 case ISD::SRA:
3665 case ISD::SRL:
3666 case ISD::ADD:
3667 case ISD::SUB:
3668 case ISD::MUL:
3669 case ISD::SMIN:
3670 case ISD::SMAX:
3671 case ISD::UMIN:
3672 case ISD::UMAX:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003673 case ISD::FADD:
3674 case ISD::FMUL:
Matt Arsenault687ec752018-10-22 16:27:27 +00003675 case ISD::FMINNUM_IEEE:
3676 case ISD::FMAXNUM_IEEE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003677 return splitBinaryVectorOp(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00003678 }
3679 return SDValue();
3680}
3681
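// Repack a d16 result that was produced as an i32 vector on targets with
// unpacked d16 VMEM: truncate each i32 lane to i16, rebuild the vector and
// bitcast back to the original v2f16/v4f16 type. Packed results only need the
// final bitcast.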
Matt Arsenault1349a042018-05-22 06:32:10 +00003682static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3683 const SDLoc &DL,
3684 SelectionDAG &DAG, bool Unpacked) {
3685 if (!LoadVT.isVector())
3686 return Result;
3687
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003688 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3689 // Truncate to v2i16/v4i16.
3690 EVT IntLoadVT = LoadVT.changeTypeToInteger();
Matt Arsenault1349a042018-05-22 06:32:10 +00003691
3692 // Work around the legalizer not scalarizing the truncate after vector op
3693 // legalization by not creating an intermediate vector trunc.
3694 SmallVector<SDValue, 4> Elts;
3695 DAG.ExtractVectorElements(Result, Elts);
3696 for (SDValue &Elt : Elts)
3697 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3698
3699 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3700
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003701 // Bitcast to original type (v2f16/v4f16).
Matt Arsenault1349a042018-05-22 06:32:10 +00003702 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003703 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003704
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003705 // Cast back to the original packed type.
3706 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3707}
3708
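// Emit a d16 memory intrinsic. On targets with unpacked d16 VMEM the node is
// given an equivalent i32 vector result type and the value is repacked to the
// requested half type afterwards; otherwise only the opcode is adjusted.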
Matt Arsenault1349a042018-05-22 06:32:10 +00003709SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3710 MemSDNode *M,
3711 SelectionDAG &DAG,
Tim Renouf366a49d2018-08-02 23:33:01 +00003712 ArrayRef<SDValue> Ops,
Matt Arsenault1349a042018-05-22 06:32:10 +00003713 bool IsIntrinsic) const {
3714 SDLoc DL(M);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003715
3716 bool Unpacked = Subtarget->hasUnpackedD16VMem();
Matt Arsenault1349a042018-05-22 06:32:10 +00003717 EVT LoadVT = M->getValueType(0);
3718
Matt Arsenault1349a042018-05-22 06:32:10 +00003719 EVT EquivLoadVT = LoadVT;
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003720 if (Unpacked && LoadVT.isVector()) {
3721 EquivLoadVT = LoadVT.isVector() ?
3722 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3723 LoadVT.getVectorNumElements()) : LoadVT;
Matt Arsenault1349a042018-05-22 06:32:10 +00003724 }
3725
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003726 // Change from v4f16/v2f16 to EquivLoadVT.
3727 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3728
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003729 SDValue Load
3730 = DAG.getMemIntrinsicNode(
3731 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3732 VTList, Ops, M->getMemoryVT(),
3733 M->getMemOperand());
3734 if (!Unpacked) // Just adjusted the opcode.
3735 return Load;
Changpeng Fang4737e892018-01-18 22:08:53 +00003736
Matt Arsenault1349a042018-05-22 06:32:10 +00003737 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
Changpeng Fang4737e892018-01-18 22:08:53 +00003738
Matt Arsenault1349a042018-05-22 06:32:10 +00003739 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003740}
3741
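// Lower llvm.amdgcn.icmp to AMDGPUISD::SETCC. Out-of-range condition codes
// yield undef; illegal i16 operands are sign- or zero-extended to i32
// according to the signedness of the predicate.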
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003742static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3743 SDNode *N, SelectionDAG &DAG) {
3744 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003745 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003746 int CondCode = CD->getSExtValue();
3747 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3748 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3749 return DAG.getUNDEF(VT);
3750
3751 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3752
3753
3754 SDValue LHS = N->getOperand(1);
3755 SDValue RHS = N->getOperand(2);
3756
3757 SDLoc DL(N);
3758
3759 EVT CmpVT = LHS.getValueType();
3760 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3761 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3762 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3763 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3764 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3765 }
3766
3767 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3768
3769 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3770 DAG.getCondCode(CCOpcode));
3771}
3772
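// Lower llvm.amdgcn.fcmp the same way; illegal f16 operands are extended to
// f32 before the compare.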
3773static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3774 SDNode *N, SelectionDAG &DAG) {
3775 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003776 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003777
3778 int CondCode = CD->getSExtValue();
3779 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3780 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3781 return DAG.getUNDEF(VT);
3782 }
3783
3784 SDValue Src0 = N->getOperand(1);
3785 SDValue Src1 = N->getOperand(2);
3786 EVT CmpVT = Src0.getValueType();
3787 SDLoc SL(N);
3788
3789 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3790 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3791 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3792 }
3793
3794 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3795 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3796 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3797 Src1, DAG.getCondCode(CCOpcode));
3798}
3799
Matt Arsenault3aef8092017-01-23 23:09:58 +00003800void SITargetLowering::ReplaceNodeResults(SDNode *N,
3801 SmallVectorImpl<SDValue> &Results,
3802 SelectionDAG &DAG) const {
3803 switch (N->getOpcode()) {
3804 case ISD::INSERT_VECTOR_ELT: {
3805 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3806 Results.push_back(Res);
3807 return;
3808 }
3809 case ISD::EXTRACT_VECTOR_ELT: {
3810 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3811 Results.push_back(Res);
3812 return;
3813 }
Matt Arsenault1f17c662017-02-22 00:27:34 +00003814 case ISD::INTRINSIC_WO_CHAIN: {
3815 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Marek Olsak13e47412018-01-31 20:18:04 +00003816 switch (IID) {
3817 case Intrinsic::amdgcn_cvt_pkrtz: {
Matt Arsenault1f17c662017-02-22 00:27:34 +00003818 SDValue Src0 = N->getOperand(1);
3819 SDValue Src1 = N->getOperand(2);
3820 SDLoc SL(N);
3821 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3822 Src0, Src1);
Matt Arsenault1f17c662017-02-22 00:27:34 +00003823 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3824 return;
3825 }
Marek Olsak13e47412018-01-31 20:18:04 +00003826 case Intrinsic::amdgcn_cvt_pknorm_i16:
3827 case Intrinsic::amdgcn_cvt_pknorm_u16:
3828 case Intrinsic::amdgcn_cvt_pk_i16:
3829 case Intrinsic::amdgcn_cvt_pk_u16: {
3830 SDValue Src0 = N->getOperand(1);
3831 SDValue Src1 = N->getOperand(2);
3832 SDLoc SL(N);
3833 unsigned Opcode;
3834
3835 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3836 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3837 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3838 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3839 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3840 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3841 else
3842 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3843
Matt Arsenault709374d2018-08-01 20:13:58 +00003844 EVT VT = N->getValueType(0);
3845 if (isTypeLegal(VT))
3846 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3847 else {
3848 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3849 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3850 }
Marek Olsak13e47412018-01-31 20:18:04 +00003851 return;
3852 }
3853 }
Simon Pilgrimd362d272017-07-08 19:50:03 +00003854 break;
Matt Arsenault1f17c662017-02-22 00:27:34 +00003855 }
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003856 case ISD::INTRINSIC_W_CHAIN: {
Matt Arsenault1349a042018-05-22 06:32:10 +00003857 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003858 Results.push_back(Res);
Matt Arsenault1349a042018-05-22 06:32:10 +00003859 Results.push_back(Res.getValue(1));
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003860 return;
3861 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003862
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003863 break;
3864 }
Matt Arsenault4a486232017-04-19 20:53:07 +00003865 case ISD::SELECT: {
3866 SDLoc SL(N);
3867 EVT VT = N->getValueType(0);
3868 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3869 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3870 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3871
3872 EVT SelectVT = NewVT;
3873 if (NewVT.bitsLT(MVT::i32)) {
3874 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3875 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3876 SelectVT = MVT::i32;
3877 }
3878
3879 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3880 N->getOperand(0), LHS, RHS);
3881
3882 if (NewVT != SelectVT)
3883 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3884 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3885 return;
3886 }
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003887 case ISD::FNEG: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003888 if (N->getValueType(0) != MVT::v2f16)
3889 break;
3890
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003891 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003892 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3893
3894 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3895 BC,
3896 DAG.getConstant(0x80008000, SL, MVT::i32));
3897 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3898 return;
3899 }
3900 case ISD::FABS: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003901 if (N->getValueType(0) != MVT::v2f16)
3902 break;
3903
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003904 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003905 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3906
3907 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3908 BC,
3909 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3910 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3911 return;
3912 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00003913 default:
3914 break;
3915 }
3916}
3917
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00003918/// Helper function for LowerBRCOND
Tom Stellardf8794352012-12-19 22:10:31 +00003919static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00003920
Tom Stellardf8794352012-12-19 22:10:31 +00003921 SDNode *Parent = Value.getNode();
3922 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3923 I != E; ++I) {
3924
3925 if (I.getUse().get() != Value)
3926 continue;
3927
3928 if (I->getOpcode() == Opcode)
3929 return *I;
3930 }
Craig Topper062a2ba2014-04-25 05:30:21 +00003931 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00003932}
3933
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003934unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault6408c912016-09-16 22:11:18 +00003935 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3936 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003937 case Intrinsic::amdgcn_if:
3938 return AMDGPUISD::IF;
3939 case Intrinsic::amdgcn_else:
3940 return AMDGPUISD::ELSE;
3941 case Intrinsic::amdgcn_loop:
3942 return AMDGPUISD::LOOP;
3943 case Intrinsic::amdgcn_end_cf:
3944 llvm_unreachable("should not occur");
Matt Arsenault6408c912016-09-16 22:11:18 +00003945 default:
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003946 return 0;
Matt Arsenault6408c912016-09-16 22:11:18 +00003947 }
Tom Stellardbc4497b2016-02-12 23:45:29 +00003948 }
Matt Arsenault6408c912016-09-16 22:11:18 +00003949
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00003950 // break, if_break, else_break are all only used as inputs to loop, not
3951 // directly as branch conditions.
3952 return 0;
Tom Stellardbc4497b2016-02-12 23:45:29 +00003953}
3954
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003955bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3956 const Triple &TT = getTargetMachine().getTargetTriple();
Matt Arsenault0da63502018-08-31 05:49:54 +00003957 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3958 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003959 AMDGPU::shouldEmitConstantsToTextSection(TT);
3960}
3961
3962bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
Scott Linderd19d1972019-02-04 20:00:07 +00003963 // FIXME: Either avoid relying on address space here or change the default
3964 // address space for functions to avoid the explicit check.
3965 return (GV->getValueType()->isFunctionTy() ||
3966 GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
Matt Arsenault0da63502018-08-31 05:49:54 +00003967 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3968 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00003969 !shouldEmitFixup(GV) &&
3970 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3971}
3972
3973bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3974 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3975}
3976
Tom Stellardf8794352012-12-19 22:10:31 +00003977/// This transforms the control flow intrinsics to get the branch destination as
3978/// the last parameter, and also switches the branch target with BR if the need arises.
3979SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3980 SelectionDAG &DAG) const {
Andrew Trickef9de2a2013-05-25 02:42:55 +00003981 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00003982
3983 SDNode *Intr = BRCOND.getOperand(1).getNode();
3984 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00003985 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00003986 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00003987
3988 if (Intr->getOpcode() == ISD::SETCC) {
3989 // As long as we negate the condition everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00003990 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00003991 Intr = SetCC->getOperand(0).getNode();
3992
3993 } else {
3994 // Get the target from BR if we don't negate the condition
3995 BR = findUser(BRCOND, ISD::BR);
3996 Target = BR->getOperand(1);
3997 }
3998
Matt Arsenault6408c912016-09-16 22:11:18 +00003999 // FIXME: This changes the types of the intrinsics instead of introducing new
4000 // nodes with the correct types.
4001 // e.g. llvm.amdgcn.loop
4002
4003 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4004 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4005
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004006 unsigned CFNode = isCFIntrinsic(Intr);
4007 if (CFNode == 0) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00004008 // This is a uniform branch so we don't need to legalize.
4009 return BRCOND;
4010 }
4011
Matt Arsenault6408c912016-09-16 22:11:18 +00004012 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4013 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4014
Tom Stellardbc4497b2016-02-12 23:45:29 +00004015 assert(!SetCC ||
4016 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00004017 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4018 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00004019
Tom Stellardf8794352012-12-19 22:10:31 +00004020 // operands of the new intrinsic call
4021 SmallVector<SDValue, 4> Ops;
Matt Arsenault6408c912016-09-16 22:11:18 +00004022 if (HaveChain)
4023 Ops.push_back(BRCOND.getOperand(0));
4024
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004025 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00004026 Ops.push_back(Target);
4027
Matt Arsenault6408c912016-09-16 22:11:18 +00004028 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4029
Tom Stellardf8794352012-12-19 22:10:31 +00004030 // build the new intrinsic call
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004031 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004032
Matt Arsenault6408c912016-09-16 22:11:18 +00004033 if (!HaveChain) {
4034 SDValue Ops[] = {
4035 SDValue(Result, 0),
4036 BRCOND.getOperand(0)
4037 };
4038
4039 Result = DAG.getMergeValues(Ops, DL).getNode();
4040 }
4041
Tom Stellardf8794352012-12-19 22:10:31 +00004042 if (BR) {
4043 // Give the branch instruction our target
4044 SDValue Ops[] = {
4045 BR->getOperand(0),
4046 BRCOND.getOperand(2)
4047 };
Chandler Carruth356665a2014-08-01 22:09:43 +00004048 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4049 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4050 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004051 }
4052
4053 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4054
4055 // Copy the intrinsic results to registers
4056 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4057 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4058 if (!CopyToReg)
4059 continue;
4060
4061 Chain = DAG.getCopyToReg(
4062 Chain, DL,
4063 CopyToReg->getOperand(1),
4064 SDValue(Result, i - 1),
4065 SDValue());
4066
4067 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4068 }
4069
4070 // Remove the old intrinsic from the chain
4071 DAG.ReplaceAllUsesOfValueWith(
4072 SDValue(Intr, Intr->getNumValues() - 1),
4073 Intr->getOperand(0));
4074
4075 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00004076}
4077
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00004078SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4079 SDValue Op,
4080 const SDLoc &DL,
4081 EVT VT) const {
4082 return Op.getValueType().bitsLE(VT) ?
4083 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4084 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4085}
4086
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004087SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004088 assert(Op.getValueType() == MVT::f16 &&
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004089 "Do not know how to custom lower FP_ROUND for non-f16 type");
4090
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004091 SDValue Src = Op.getOperand(0);
4092 EVT SrcVT = Src.getValueType();
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004093 if (SrcVT != MVT::f64)
4094 return Op;
4095
4096 SDLoc DL(Op);
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004097
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004098 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4099 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
Mandeep Singh Grang5e1697e2017-06-06 05:08:36 +00004100 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004101}
4102
Matt Arsenault687ec752018-10-22 16:27:27 +00004103SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4104 SelectionDAG &DAG) const {
4105 EVT VT = Op.getValueType();
4106 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4107
4108 // FIXME: Assert during selection that this is only selected for
4109 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4110 // mode functions, but this happens to be OK since it's only done in cases
4111 // where there are known to be no sNaNs.
4112 if (IsIEEEMode)
4113 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4114
4115 if (VT == MVT::v4f16)
4116 return splitBinaryVectorOp(Op, DAG);
4117 return Op;
4118}
4119
Matt Arsenault3e025382017-04-24 17:49:13 +00004120SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4121 SDLoc SL(Op);
Matt Arsenault3e025382017-04-24 17:49:13 +00004122 SDValue Chain = Op.getOperand(0);
4123
Tom Stellard5bfbae52018-07-11 20:59:01 +00004124 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004125 !Subtarget->isTrapHandlerEnabled())
Matt Arsenault3e025382017-04-24 17:49:13 +00004126 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
Tony Tye43259df2018-05-16 16:19:34 +00004127
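  // With the HSA trap handler enabled, pass the queue pointer to the handler
  // in SGPR0/1 and emit the trap node with the LLVM trap ID.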
4128 MachineFunction &MF = DAG.getMachineFunction();
4129 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4130 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4131 assert(UserSGPR != AMDGPU::NoRegister);
4132 SDValue QueuePtr = CreateLiveInRegister(
4133 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4134 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4135 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4136 QueuePtr, SDValue());
4137 SDValue Ops[] = {
4138 ToReg,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004139 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
Tony Tye43259df2018-05-16 16:19:34 +00004140 SGPR01,
4141 ToReg.getValue(1)
4142 };
4143 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4144}
4145
4146SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4147 SDLoc SL(Op);
4148 SDValue Chain = Op.getOperand(0);
4149 MachineFunction &MF = DAG.getMachineFunction();
4150
Tom Stellard5bfbae52018-07-11 20:59:01 +00004151 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004152 !Subtarget->isTrapHandlerEnabled()) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004153 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
Matt Arsenault3e025382017-04-24 17:49:13 +00004154 "debugtrap handler not supported",
4155 Op.getDebugLoc(),
4156 DS_Warning);
Matthias Braunf1caa282017-12-15 22:22:58 +00004157 LLVMContext &Ctx = MF.getFunction().getContext();
Matt Arsenault3e025382017-04-24 17:49:13 +00004158 Ctx.diagnose(NoTrap);
4159 return Chain;
4160 }
Matt Arsenault3e025382017-04-24 17:49:13 +00004161
Tony Tye43259df2018-05-16 16:19:34 +00004162 SDValue Ops[] = {
4163 Chain,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004164 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
Tony Tye43259df2018-05-16 16:19:34 +00004165 };
4166 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
Matt Arsenault3e025382017-04-24 17:49:13 +00004167}
4168
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004169SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
Matt Arsenault99c14522016-04-25 19:27:24 +00004170 SelectionDAG &DAG) const {
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004171 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4172 if (Subtarget->hasApertureRegs()) {
Matt Arsenault0da63502018-08-31 05:49:54 +00004173 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004174 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4175 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
Matt Arsenault0da63502018-08-31 05:49:54 +00004176 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004177 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4178 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4179 unsigned Encoding =
4180 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4181 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4182 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Matt Arsenaulte823d922017-02-18 18:29:53 +00004183
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004184 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4185 SDValue ApertureReg = SDValue(
4186 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4187 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4188 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
Matt Arsenaulte823d922017-02-18 18:29:53 +00004189 }
4190
Matt Arsenault99c14522016-04-25 19:27:24 +00004191 MachineFunction &MF = DAG.getMachineFunction();
4192 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004193 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4194 assert(UserSGPR != AMDGPU::NoRegister);
4195
Matt Arsenault99c14522016-04-25 19:27:24 +00004196 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004197 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00004198
4199 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4200 // private_segment_aperture_base_hi.
Matt Arsenault0da63502018-08-31 05:49:54 +00004201 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
Matt Arsenault99c14522016-04-25 19:27:24 +00004202
Matt Arsenaultb655fa92017-11-29 01:25:12 +00004203 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
Matt Arsenault99c14522016-04-25 19:27:24 +00004204
4205 // TODO: Use custom target PseudoSourceValue.
4206 // TODO: We should use the value from the IR intrinsic call, but it might not
4207 // be available and how do we get it?
4208 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
Matt Arsenault0da63502018-08-31 05:49:54 +00004209 AMDGPUAS::CONSTANT_ADDRESS));
Matt Arsenault99c14522016-04-25 19:27:24 +00004210
4211 MachinePointerInfo PtrInfo(V, StructOffset);
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004212 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
Justin Lebar9c375812016-07-15 18:27:10 +00004213 MinAlign(64, StructOffset),
Justin Lebaradbf09e2016-09-11 01:38:58 +00004214 MachineMemOperand::MODereferenceable |
4215 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00004216}
4217
4218SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4219 SelectionDAG &DAG) const {
4220 SDLoc SL(Op);
4221 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4222
4223 SDValue Src = ASC->getOperand(0);
Matt Arsenault99c14522016-04-25 19:27:24 +00004224 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4225
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004226 const AMDGPUTargetMachine &TM =
4227 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4228
Matt Arsenault99c14522016-04-25 19:27:24 +00004229 // flat -> local/private
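  // The flat pointer is truncated to a 32-bit segment offset; the flat null
  // pointer maps to the segment's null value.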
Matt Arsenault0da63502018-08-31 05:49:54 +00004230 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004231 unsigned DestAS = ASC->getDestAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004232
Matt Arsenault0da63502018-08-31 05:49:54 +00004233 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4234 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004235 unsigned NullVal = TM.getNullPointerValue(DestAS);
4236 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault99c14522016-04-25 19:27:24 +00004237 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4238 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4239
4240 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4241 NonNull, Ptr, SegmentNullPtr);
4242 }
4243 }
4244
4245 // local/private -> flat
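  // The 32-bit segment offset is paired with the segment's aperture base in
  // the high dword to form the 64-bit flat address; the segment's null value
  // maps to the flat null pointer.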
Matt Arsenault0da63502018-08-31 05:49:54 +00004246 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004247 unsigned SrcAS = ASC->getSrcAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004248
Matt Arsenault0da63502018-08-31 05:49:54 +00004249 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4250 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004251 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4252 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault971c85e2017-03-13 19:47:31 +00004253
Matt Arsenault99c14522016-04-25 19:27:24 +00004254 SDValue NonNull
4255 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4256
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004257 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00004258 SDValue CvtPtr
4259 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4260
4261 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4262 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4263 FlatNullPtr);
4264 }
4265 }
4266
4267 // global <-> flat are no-ops and never emitted.
4268
4269 const MachineFunction &MF = DAG.getMachineFunction();
4270 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Matthias Braunf1caa282017-12-15 22:22:58 +00004271 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
Matt Arsenault99c14522016-04-25 19:27:24 +00004272 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4273
4274 return DAG.getUNDEF(ASC->getValueType(0));
4275}
4276
Matt Arsenault3aef8092017-01-23 23:09:58 +00004277SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4278 SelectionDAG &DAG) const {
Matt Arsenault67a98152018-05-16 11:47:30 +00004279 SDValue Vec = Op.getOperand(0);
4280 SDValue InsVal = Op.getOperand(1);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004281 SDValue Idx = Op.getOperand(2);
Matt Arsenault67a98152018-05-16 11:47:30 +00004282 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004283 EVT EltVT = VecVT.getVectorElementType();
4284 unsigned VecSize = VecVT.getSizeInBits();
4285 unsigned EltSize = EltVT.getSizeInBits();
Matt Arsenault67a98152018-05-16 11:47:30 +00004286
Matt Arsenault9224c002018-06-05 19:52:46 +00004287
4288 assert(VecSize <= 64);
Matt Arsenault67a98152018-05-16 11:47:30 +00004289
4290 unsigned NumElts = VecVT.getVectorNumElements();
4291 SDLoc SL(Op);
4292 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4293
Matt Arsenault9224c002018-06-05 19:52:46 +00004294 if (NumElts == 4 && EltSize == 16 && KIdx) {
Matt Arsenault67a98152018-05-16 11:47:30 +00004295 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4296
4297 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4298 DAG.getConstant(0, SL, MVT::i32));
4299 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4300 DAG.getConstant(1, SL, MVT::i32));
4301
4302 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4303 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4304
4305 unsigned Idx = KIdx->getZExtValue();
4306 bool InsertLo = Idx < 2;
4307 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4308 InsertLo ? LoVec : HiVec,
4309 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4310 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4311
4312 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4313
4314 SDValue Concat = InsertLo ?
4315 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4316 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4317
4318 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4319 }
4320
Matt Arsenault3aef8092017-01-23 23:09:58 +00004321 if (isa<ConstantSDNode>(Idx))
4322 return SDValue();
4323
Matt Arsenault9224c002018-06-05 19:52:46 +00004324 MVT IntVT = MVT::getIntegerVT(VecSize);
Matt Arsenault67a98152018-05-16 11:47:30 +00004325
Matt Arsenault3aef8092017-01-23 23:09:58 +00004326 // Avoid stack access for dynamic indexing.
Matt Arsenault3aef8092017-01-23 23:09:58 +00004327 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
Tim Corringhamfa3e4e52019-02-01 16:51:09 +00004328
4329 // Create a congruent vector with the target value in each element so that
4330 // the required element can be masked and ORed into the target vector.
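  //
  // For 16-bit elements this computes
  //   Mask   = 0xffff << (Idx * 16)
  //   Result = (Mask & splat(Val)) | (~Mask & Vec)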
4331 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4332 DAG.getSplatBuildVector(VecVT, SL, InsVal));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004333
Matt Arsenault9224c002018-06-05 19:52:46 +00004334 assert(isPowerOf2_32(EltSize));
4335 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4336
Matt Arsenault3aef8092017-01-23 23:09:58 +00004337 // Convert vector index to bit-index.
Matt Arsenault9224c002018-06-05 19:52:46 +00004338 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004339
Matt Arsenault67a98152018-05-16 11:47:30 +00004340 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4341 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4342 DAG.getConstant(0xffff, SL, IntVT),
Matt Arsenault3aef8092017-01-23 23:09:58 +00004343 ScaledIdx);
4344
Matt Arsenault67a98152018-05-16 11:47:30 +00004345 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4346 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4347 DAG.getNOT(SL, BFM, IntVT), BCVec);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004348
Matt Arsenault67a98152018-05-16 11:47:30 +00004349 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4350 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004351}
4352
4353SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4354 SelectionDAG &DAG) const {
4355 SDLoc SL(Op);
4356
4357 EVT ResultVT = Op.getValueType();
4358 SDValue Vec = Op.getOperand(0);
4359 SDValue Idx = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004360 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004361 unsigned VecSize = VecVT.getSizeInBits();
4362 EVT EltVT = VecVT.getVectorElementType();
4363 assert(VecSize <= 64);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004364
Matt Arsenault98f29462017-05-17 20:30:58 +00004365 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4366
Hiroshi Inoue372ffa12018-04-13 11:37:06 +00004367 // Make sure we do any optimizations that will make it easier to fold
Matt Arsenault98f29462017-05-17 20:30:58 +00004368 // source modifiers before obscuring the value with bit operations.
4369
4370 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4371 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4372 return Combined;
4373
Matt Arsenault9224c002018-06-05 19:52:46 +00004374 unsigned EltSize = EltVT.getSizeInBits();
4375 assert(isPowerOf2_32(EltSize));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004376
Matt Arsenault9224c002018-06-05 19:52:46 +00004377 MVT IntVT = MVT::getIntegerVT(VecSize);
4378 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4379
4380 // Convert vector index to bit-index (* EltSize)
4381 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004382
Matt Arsenault67a98152018-05-16 11:47:30 +00004383 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4384 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004385
Matt Arsenault67a98152018-05-16 11:47:30 +00004386 if (ResultVT == MVT::f16) {
4387 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4388 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4389 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004390
Matt Arsenault67a98152018-05-16 11:47:30 +00004391 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4392}
4393
4394SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4395 SelectionDAG &DAG) const {
4396 SDLoc SL(Op);
4397 EVT VT = Op.getValueType();
Matt Arsenault67a98152018-05-16 11:47:30 +00004398
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004399 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4400 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4401
4402 // Turn into pair of packed build_vectors.
4403 // TODO: Special case for constants that can be materialized with s_mov_b64.
4404 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4405 { Op.getOperand(0), Op.getOperand(1) });
4406 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4407 { Op.getOperand(2), Op.getOperand(3) });
4408
4409 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4410 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4411
4412 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4413 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4414 }
4415
Matt Arsenault1349a042018-05-22 06:32:10 +00004416 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004417 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
Matt Arsenault67a98152018-05-16 11:47:30 +00004418
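  // Without packed (VOP3P) instructions, materialize the pair in an i32:
  // zero-extend the high element and shift it into the top half, OR in the
  // low element, and bitcast the result to the vector type.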
Matt Arsenault1349a042018-05-22 06:32:10 +00004419 SDValue Lo = Op.getOperand(0);
4420 SDValue Hi = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004421
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004422 // Avoid adding defined bits with the zero_extend.
4423 if (Hi.isUndef()) {
4424 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4425 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4426 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4427 }
Matt Arsenault67a98152018-05-16 11:47:30 +00004428
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004429 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004430 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4431
4432 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4433 DAG.getConstant(16, SL, MVT::i32));
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004434 if (Lo.isUndef())
4435 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4436
4437 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4438 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Matt Arsenault1349a042018-05-22 06:32:10 +00004439
4440 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004441 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004442}
4443
Tom Stellard418beb72016-07-13 14:23:33 +00004444bool
4445SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4446 // We can fold offsets for anything that doesn't require a GOT relocation.
Matt Arsenault0da63502018-08-31 05:49:54 +00004447 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4448 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4449 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004450 !shouldEmitGOTReloc(GA->getGlobal());
Tom Stellard418beb72016-07-13 14:23:33 +00004451}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004452
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004453static SDValue
4454buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4455 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4456 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004457 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4458 // lowered to the following code sequence:
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004459 //
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004460 // For constant address space:
4461 // s_getpc_b64 s[0:1]
4462 // s_add_u32 s0, s0, $symbol
4463 // s_addc_u32 s1, s1, 0
4464 //
4465 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4466 // a fixup or relocation is emitted to replace $symbol with a literal
4467 // constant, which is a pc-relative offset from the encoding of the $symbol
4468 // operand to the global variable.
4469 //
4470 // For global address space:
4471 // s_getpc_b64 s[0:1]
4472 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4473 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4474 //
4475 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4476 // fixups or relocations are emitted to replace $symbol@*@lo and
4477 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4478 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4479 // operand to the global variable.
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004480 //
4481 // What we want here is an offset from the value returned by s_getpc
4482 // (which is the address of the s_add_u32 instruction) to the global
4483 // variable, but since the encoding of $symbol starts 4 bytes after the start
4484 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4485 // small. This requires us to add 4 to the global variable offset in order to
4486 // compute the correct address.
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004487 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4488 GAFlags);
4489 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4490 GAFlags == SIInstrInfo::MO_NONE ?
4491 GAFlags : GAFlags + 1);
4492 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004493}
4494
Tom Stellard418beb72016-07-13 14:23:33 +00004495SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4496 SDValue Op,
4497 SelectionDAG &DAG) const {
4498 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00004499 const GlobalValue *GV = GSD->getGlobal();
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004500 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4501 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4502 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
Tom Stellard418beb72016-07-13 14:23:33 +00004503 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4504
4505 SDLoc DL(GSD);
Tom Stellard418beb72016-07-13 14:23:33 +00004506 EVT PtrVT = Op.getValueType();
4507
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004508 // FIXME: Should not make address space based decisions here.
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004509 if (shouldEmitFixup(GV))
Tom Stellard418beb72016-07-13 14:23:33 +00004510 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004511 else if (shouldEmitPCReloc(GV))
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004512 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4513 SIInstrInfo::MO_REL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004514
4515 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004516 SIInstrInfo::MO_GOTPCREL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004517
4518 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00004519 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Tom Stellard418beb72016-07-13 14:23:33 +00004520 const DataLayout &DataLayout = DAG.getDataLayout();
4521 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
Matt Arsenaultd77fcc22018-09-10 02:23:39 +00004522 MachinePointerInfo PtrInfo
4523 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
Tom Stellard418beb72016-07-13 14:23:33 +00004524
Justin Lebar9c375812016-07-15 18:27:10 +00004525 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
Justin Lebaradbf09e2016-09-11 01:38:58 +00004526 MachineMemOperand::MODereferenceable |
4527 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00004528}
4529
Benjamin Kramerbdc49562016-06-12 15:39:02 +00004530SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4531 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004532 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4533 // the destination register.
4534 //
Tom Stellardfc92e772015-05-12 14:18:14 +00004535 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4536 // so we will end up with redundant moves to m0.
4537 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004538 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4539
4540 // A Null SDValue creates a glue result.
4541 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4542 V, Chain);
4543 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00004544}
4545
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004546SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4547 SDValue Op,
4548 MVT VT,
4549 unsigned Offset) const {
4550 SDLoc SL(Op);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00004551 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00004552 DAG.getEntryNode(), Offset, 4, false);
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004553 // The local size values will have the hi 16-bits as zero.
4554 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4555 DAG.getValueType(VT));
4556}
4557
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004558static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4559 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004560 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004561 "non-hsa intrinsic with hsa target",
4562 DL.getDebugLoc());
4563 DAG.getContext()->diagnose(BadIntrin);
4564 return DAG.getUNDEF(VT);
4565}
4566
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004567static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4568 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004569 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004570 "intrinsic not supported on subtarget",
4571 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00004572 DAG.getContext()->diagnose(BadIntrin);
4573 return DAG.getUNDEF(VT);
4574}
4575
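// Pack a list of dword-sized values into an f32 vector of 1/2/4/8/16 elements
// (a lone f32 for a single element), bitcasting each element to f32 and
// padding any unused lanes with undef.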
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004576static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4577 ArrayRef<SDValue> Elts) {
4578 assert(!Elts.empty());
4579 MVT Type;
4580 unsigned NumElts;
4581
4582 if (Elts.size() == 1) {
4583 Type = MVT::f32;
4584 NumElts = 1;
4585 } else if (Elts.size() == 2) {
4586 Type = MVT::v2f32;
4587 NumElts = 2;
4588 } else if (Elts.size() <= 4) {
4589 Type = MVT::v4f32;
4590 NumElts = 4;
4591 } else if (Elts.size() <= 8) {
4592 Type = MVT::v8f32;
4593 NumElts = 8;
4594 } else {
4595 assert(Elts.size() <= 16);
4596 Type = MVT::v16f32;
4597 NumElts = 16;
4598 }
4599
4600 SmallVector<SDValue, 16> VecElts(NumElts);
4601 for (unsigned i = 0; i < Elts.size(); ++i) {
4602 SDValue Elt = Elts[i];
4603 if (Elt.getValueType() != MVT::f32)
4604 Elt = DAG.getBitcast(MVT::f32, Elt);
4605 VecElts[i] = Elt;
4606 }
4607 for (unsigned i = Elts.size(); i < NumElts; ++i)
4608 VecElts[i] = DAG.getUNDEF(MVT::f32);
4609
4610 if (NumElts == 1)
4611 return VecElts[0];
4612 return DAG.getBuildVector(Type, DL, VecElts);
4613}
4614
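// Split the cachepolicy immediate into its glc (bit 0) and slc (bit 1) target
// constants. Returns false if bits that were not requested remain set, in
// which case the caller leaves the intrinsic unlowered. For example, a
// cachepolicy of 3 yields glc = 1 and slc = 1.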
4615static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4616 SDValue *GLC, SDValue *SLC) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004617 auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004618
4619 uint64_t Value = CachePolicyConst->getZExtValue();
4620 SDLoc DL(CachePolicy);
4621 if (GLC) {
4622 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4623 Value &= ~(uint64_t)0x1;
4624 }
4625 if (SLC) {
4626 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4627 Value &= ~(uint64_t)0x2;
4628 }
4629
4630 return Value == 0;
4631}
4632
David Stuttardf77079f2019-01-14 11:55:24 +00004633// Re-construct the required return value for an image load intrinsic.
4634// This is more complicated due to the optional use of TexFailCtrl, which means
4635// the required return type is an aggregate.
4636static SDValue constructRetValue(SelectionDAG &DAG,
4637 MachineSDNode *Result,
4638 ArrayRef<EVT> ResultTypes,
4639 bool IsTexFail, bool Unpacked, bool IsD16,
4640 int DMaskPop, int NumVDataDwords,
4641 const SDLoc &DL, LLVMContext &Context) {
4642 // Determine the required return type. This is the same regardless of the IsTexFail flag.
4643 EVT ReqRetVT = ResultTypes[0];
4644 EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4645 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4646 EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4647 EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4648 : AdjEltVT
4649 : ReqRetVT;
4650
4651 // Extract the data part of the result.
4652 // Bitcast the result to the same type as the required return type.
4653 int NumElts;
4654 if (IsD16 && !Unpacked)
4655 NumElts = NumVDataDwords << 1;
4656 else
4657 NumElts = NumVDataDwords;
4658
4659 EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4660 : AdjEltVT;
4661
4662 // Special case for v8f16. Rather than add support for this, use v4i32 to
4663 // extract the data elements
4664 bool V8F16Special = false;
4665 if (CastVT == MVT::v8f16) {
4666 CastVT = MVT::v4i32;
4667 DMaskPop >>= 1;
4668 ReqRetNumElts >>= 1;
4669 V8F16Special = true;
4670 AdjVT = MVT::v2i32;
4671 }
4672
4673 SDValue N = SDValue(Result, 0);
4674 SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4675
4676 // Iterate over the result
4677 SmallVector<SDValue, 4> BVElts;
4678
4679 if (CastVT.isVector()) {
4680 DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4681 } else {
4682 BVElts.push_back(CastRes);
4683 }
4684 int ExtraElts = ReqRetNumElts - DMaskPop;
4685 while(ExtraElts--)
4686 BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4687
4688 SDValue PreTFCRes;
4689 if (ReqRetNumElts > 1) {
4690 SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4691 if (IsD16 && Unpacked)
4692 PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4693 else
4694 PreTFCRes = NewVec;
4695 } else {
4696 PreTFCRes = BVElts[0];
4697 }
4698
4699 if (V8F16Special)
4700 PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4701
4702 if (!IsTexFail) {
4703 if (Result->getNumValues() > 1)
4704 return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4705 else
4706 return PreTFCRes;
4707 }
4708
4709 // Extract the TexFail result and insert it into the aggregate return value.
4710 SmallVector<SDValue, 1> TFCElt;
4711 DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4712 SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4713 return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4714}
4715
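// Decode the texfailctrl immediate: bit 0 becomes the tfe operand, bit 1 the
// lwe operand, and IsTexFail is set whenever the value is non-zero. Returns
// false if unknown bits remain set.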
4716static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4717 SDValue *LWE, bool &IsTexFail) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004718 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
David Stuttardf77079f2019-01-14 11:55:24 +00004719
4720 uint64_t Value = TexFailCtrlConst->getZExtValue();
4721 if (Value) {
4722 IsTexFail = true;
4723 }
4724
4725 SDLoc DL(TexFailCtrlConst);
4726 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4727 Value &= ~(uint64_t)0x1;
4728 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4729 Value &= ~(uint64_t)0x2;
4730
4731 return Value == 0;
4732}
4733
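// Lower an image (MIMG) intrinsic: collect the data, address and control
// operands in instruction order, select the encoded opcode for the subtarget,
// and reconstruct the return value the intrinsic expects (including the
// optional texfail error flag).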
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004734SDValue SITargetLowering::lowerImage(SDValue Op,
4735 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4736 SelectionDAG &DAG) const {
4737 SDLoc DL(Op);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004738 MachineFunction &MF = DAG.getMachineFunction();
4739 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004740 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4741 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4742 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004743 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4744 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4745 unsigned IntrOpcode = Intr->BaseOpcode;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004746
David Stuttardf77079f2019-01-14 11:55:24 +00004747 SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4748 SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004749 bool IsD16 = false;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004750 bool IsA16 = false;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004751 SDValue VData;
4752 int NumVDataDwords;
David Stuttardf77079f2019-01-14 11:55:24 +00004753 bool AdjustRetType = false;
4754
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004755 unsigned AddrIdx; // Index of first address argument
4756 unsigned DMask;
David Stuttardf77079f2019-01-14 11:55:24 +00004757 unsigned DMaskLanes = 0;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004758
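  // Atomics take the value operand(s) first; the X2 variants pack two values
  // into a vector. The dmask and vdata dword count follow directly from the
  // data width rather than from an explicit dmask operand.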
4759 if (BaseOpcode->Atomic) {
4760 VData = Op.getOperand(2);
4761
4762 bool Is64Bit = VData.getValueType() == MVT::i64;
4763 if (BaseOpcode->AtomicX2) {
4764 SDValue VData2 = Op.getOperand(3);
4765 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4766 {VData, VData2});
4767 if (Is64Bit)
4768 VData = DAG.getBitcast(MVT::v4i32, VData);
4769
4770 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4771 DMask = Is64Bit ? 0xf : 0x3;
4772 NumVDataDwords = Is64Bit ? 4 : 2;
4773 AddrIdx = 4;
4774 } else {
4775 DMask = Is64Bit ? 0x3 : 0x1;
4776 NumVDataDwords = Is64Bit ? 2 : 1;
4777 AddrIdx = 3;
4778 }
4779 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004780 unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004781 auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
David Stuttardf77079f2019-01-14 11:55:24 +00004782 DMask = DMaskConst->getZExtValue();
4783 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004784
4785 if (BaseOpcode->Store) {
4786 VData = Op.getOperand(2);
4787
4788 MVT StoreVT = VData.getSimpleValueType();
4789 if (StoreVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004790 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004791 !BaseOpcode->HasD16)
4792 return Op; // D16 is unsupported for this instruction
4793
4794 IsD16 = true;
4795 VData = handleD16VData(VData, DAG);
4796 }
4797
4798 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004799 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004800 // Work out the number of dwords based on the dmask popcount, the underlying
4801 // type, and whether packing is supported.
4802 MVT LoadVT = ResultTypes[0].getSimpleVT();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004803 if (LoadVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004804 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004805 !BaseOpcode->HasD16)
4806 return Op; // D16 is unsupported for this instruction
4807
4808 IsD16 = true;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004809 }
4810
David Stuttardf77079f2019-01-14 11:55:24 +00004811 // Confirm that the return type is large enough for the dmask specified
4812 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4813 (!LoadVT.isVector() && DMaskLanes > 1))
4814 return Op;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004815
David Stuttardf77079f2019-01-14 11:55:24 +00004816 if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4817 NumVDataDwords = (DMaskLanes + 1) / 2;
4818 else
4819 NumVDataDwords = DMaskLanes;
4820
4821 AdjustRetType = true;
4822 }
David Stuttardc6603862018-11-29 20:14:17 +00004823
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004824 AddrIdx = DMaskIdx + 1;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004825 }
4826
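  // Count the address operands described by the base opcode and dimension:
  // extra arguments (e.g. bias or compare), gradients, coordinates and the
  // optional lod/clamp/mip value.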
Ryan Taylor1f334d02018-08-28 15:07:30 +00004827 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4828 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4829 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4830 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4831 NumCoords + NumLCM;
4832 unsigned NumMIVAddrs = NumVAddrs;
4833
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004834 SmallVector<SDValue, 4> VAddrs;
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004835
4836 // Optimize _L to _LZ when _L is zero
4837 if (LZMappingInfo) {
4838 if (auto ConstantLod =
Ryan Taylor1f334d02018-08-28 15:07:30 +00004839 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004840 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4841 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
Ryan Taylor1f334d02018-08-28 15:07:30 +00004842 NumMIVAddrs--; // remove 'lod'
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004843 }
4844 }
4845 }
4846
Ryan Taylor1f334d02018-08-28 15:07:30 +00004847 // Check for 16-bit addresses and pack them into pairs if so.
4848 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4849 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
Neil Henning63718b22018-10-31 10:34:48 +00004850 const MVT VAddrScalarVT = VAddrVT.getScalarType();
4851 if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
Ryan Taylor1f334d02018-08-28 15:07:30 +00004852 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4853 IsA16 = true;
Neil Henning63718b22018-10-31 10:34:48 +00004854 const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004855 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4856 SDValue AddrLo, AddrHi;
4857 // Push back extra arguments.
4858 if (i < DimIdx) {
4859 AddrLo = Op.getOperand(i);
4860 } else {
4861 AddrLo = Op.getOperand(i);
4862 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4863 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4864 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004865 ((NumGradients / 2) % 2 == 1 &&
4866 (i == DimIdx + (NumGradients / 2) - 1 ||
Ryan Taylor1f334d02018-08-28 15:07:30 +00004867 i == DimIdx + NumGradients - 1))) {
4868 AddrHi = DAG.getUNDEF(MVT::f16);
4869 } else {
4870 AddrHi = Op.getOperand(i + 1);
4871 i++;
4872 }
Neil Henning63718b22018-10-31 10:34:48 +00004873 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
Ryan Taylor1f334d02018-08-28 15:07:30 +00004874 {AddrLo, AddrHi});
4875 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4876 }
4877 VAddrs.push_back(AddrLo);
4878 }
4879 } else {
4880 for (unsigned i = 0; i < NumMIVAddrs; ++i)
4881 VAddrs.push_back(Op.getOperand(AddrIdx + i));
4882 }
4883
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004884 SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4885
4886 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4887 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4888 unsigned CtrlIdx; // Index of texfailctrl argument
4889 SDValue Unorm;
4890 if (!BaseOpcode->Sampler) {
4891 Unorm = True;
4892 CtrlIdx = AddrIdx + NumVAddrs + 1;
4893 } else {
4894 auto UnormConst =
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004895 cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004896
4897 Unorm = UnormConst->getZExtValue() ? True : False;
4898 CtrlIdx = AddrIdx + NumVAddrs + 3;
4899 }
4900
David Stuttardf77079f2019-01-14 11:55:24 +00004901 SDValue TFE;
4902 SDValue LWE;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004903 SDValue TexFail = Op.getOperand(CtrlIdx);
David Stuttardf77079f2019-01-14 11:55:24 +00004904 bool IsTexFail = false;
4905 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004906 return Op;
4907
David Stuttardf77079f2019-01-14 11:55:24 +00004908 if (IsTexFail) {
4909 if (!DMaskLanes) {
4910 // Expecting to get an error flag since TFC is on and dmask is 0.
4911 // Force dmask to be at least 1, otherwise the instruction will fail.
4912 DMask = 0x1;
4913 DMaskLanes = 1;
4914 NumVDataDwords = 1;
4915 }
4916 NumVDataDwords += 1;
4917 AdjustRetType = true;
4918 }
4919
4920 // Something earlier has tagged that the return type needs adjusting.
4921 // This happens if the instruction is a load or has set TexFailCtrl flags.
4922 if (AdjustRetType) {
4923 // NumVDataDwords reflects the true number of dwords required in the return type
4924 if (DMaskLanes == 0 && !BaseOpcode->Store) {
4925 // This is a no-op load. It can be eliminated.
4926 SDValue Undef = DAG.getUNDEF(Op.getValueType());
4927 if (isa<MemSDNode>(Op))
4928 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4929 return Undef;
4930 }
4931
4932 // Have to use a power-of-2 number of dwords.
4933 NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
4934
4935 EVT NewVT = NumVDataDwords > 1 ?
4936 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
4937 : MVT::f32;
4938
4939 ResultTypes[0] = NewVT;
4940 if (ResultTypes.size() == 3) {
4941 // The original result was an aggregate type used for TexFailCtrl results.
4942 // The actual instruction returns as a vector type which has now been
4943 // created. Remove the aggregate result.
4944 ResultTypes.erase(&ResultTypes[1]);
4945 }
4946 }
4947
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004948 SDValue GLC;
4949 SDValue SLC;
4950 if (BaseOpcode->Atomic) {
4951 GLC = True; // TODO no-return optimization
4952 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4953 return Op;
4954 } else {
4955 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4956 return Op;
4957 }
4958
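  // Build the operand list in the order the MIMG instruction expects: vdata
  // (stores and atomics only), vaddr, rsrc, sampler (if any), dmask, unorm,
  // glc, slc, r128/a16, tfe, lwe, da, d16 (if supported) and finally the
  // chain for memory opcodes.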
4959 SmallVector<SDValue, 14> Ops;
4960 if (BaseOpcode->Store || BaseOpcode->Atomic)
4961 Ops.push_back(VData); // vdata
4962 Ops.push_back(VAddr);
4963 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4964 if (BaseOpcode->Sampler)
4965 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4966 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4967 Ops.push_back(Unorm);
4968 Ops.push_back(GLC);
4969 Ops.push_back(SLC);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004970 Ops.push_back(IsA16 && // a16 or r128
4971 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
David Stuttardf77079f2019-01-14 11:55:24 +00004972 Ops.push_back(TFE); // tfe
4973 Ops.push_back(LWE); // lwe
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004974 Ops.push_back(DimInfo->DA ? True : False);
4975 if (BaseOpcode->HasD16)
4976 Ops.push_back(IsD16 ? True : False);
4977 if (isa<MemSDNode>(Op))
4978 Ops.push_back(Op.getOperand(0)); // chain
4979
4980 int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4981 int Opcode = -1;
4982
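  // Select the concrete MIMG opcode for the target encoding, keyed on the
  // vdata and vaddr dword counts; try the gfx8 encoding first and fall back
  // to gfx6.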
Tom Stellard5bfbae52018-07-11 20:59:01 +00004983 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004984 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004985 NumVDataDwords, NumVAddrDwords);
4986 if (Opcode == -1)
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004987 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004988 NumVDataDwords, NumVAddrDwords);
4989 assert(Opcode != -1);
4990
4991 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4992 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
Chandler Carruth66654b72018-08-14 23:30:32 +00004993 MachineMemOperand *MemRef = MemOp->getMemOperand();
4994 DAG.setNodeMemRefs(NewNode, {MemRef});
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004995 }
4996
4997 if (BaseOpcode->AtomicX2) {
4998 SmallVector<SDValue, 1> Elt;
4999 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5000 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
David Stuttardf77079f2019-01-14 11:55:24 +00005001 } else if (!BaseOpcode->Store) {
5002 return constructRetValue(DAG, NewNode,
5003 OrigResultTypes, IsTexFail,
5004 Subtarget->hasUnpackedD16VMem(), IsD16,
5005 DMaskLanes, NumVDataDwords, DL,
5006 *DAG.getContext());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005007 }
5008
5009 return SDValue(NewNode, 0);
5010}
5011
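// Lower an s_buffer_load style read: with a uniform offset this becomes a
// scalar SBUFFER_LOAD, otherwise fall back to one or more MUBUF buffer loads
// on the (assumed unswizzled) resource.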
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005012SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5013 SDValue Offset, SDValue GLC,
5014 SelectionDAG &DAG) const {
5015 MachineFunction &MF = DAG.getMachineFunction();
5016 MachineMemOperand *MMO = MF.getMachineMemOperand(
5017 MachinePointerInfo(),
5018 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5019 MachineMemOperand::MOInvariant,
5020 VT.getStoreSize(), VT.getStoreSize());
5021
5022 if (!Offset->isDivergent()) {
5023 SDValue Ops[] = {
5024 Rsrc,
5025 Offset, // Offset
5026 GLC // glc
5027 };
5028 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5029 DAG.getVTList(VT), Ops, VT, MMO);
5030 }
5031
5032 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5033 // assume that the buffer is unswizzled.
5034 SmallVector<SDValue, 4> Loads;
5035 unsigned NumLoads = 1;
5036 MVT LoadVT = VT.getSimpleVT();
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005037 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
Simon Pilgrim44dfd812018-12-07 21:44:25 +00005038 assert((LoadVT.getScalarType() == MVT::i32 ||
5039 LoadVT.getScalarType() == MVT::f32) &&
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005040 isPowerOf2_32(NumElts));
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005041
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005042 if (NumElts == 8 || NumElts == 16) {
5043 NumLoads = NumElts == 16 ? 4 : 2;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005044 LoadVT = MVT::v4i32;
5045 }
5046
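  // Assemble the MUBUF operands; the voffset/soffset/immediate offset slots
  // are filled in by setBufferOffsets below.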
5047 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5048 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5049 SDValue Ops[] = {
5050 DAG.getEntryNode(), // Chain
5051 Rsrc, // rsrc
5052 DAG.getConstant(0, DL, MVT::i32), // vindex
5053 {}, // voffset
5054 {}, // soffset
5055 {}, // offset
5056 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5057 DAG.getConstant(0, DL, MVT::i1), // idxen
5058 };
5059
5060 // Use the alignment to ensure that the required offsets will fit into the
5061 // immediate offsets.
5062 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5063
5064 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5065 for (unsigned i = 0; i < NumLoads; ++i) {
5066 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5067 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5068 Ops, LoadVT, MMO));
5069 }
5070
5071 if (VT == MVT::v8i32 || VT == MVT::v16i32)
5072 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5073
5074 return Loads[0];
5075}
5076
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005077SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5078 SelectionDAG &DAG) const {
5079 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00005080 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005081
5082 EVT VT = Op.getValueType();
5083 SDLoc DL(Op);
5084 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5085
Sanjay Patela2607012015-09-16 16:31:21 +00005086 // TODO: Should this propagate fast-math-flags?
5087
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005088 switch (IntrinsicID) {
Tom Stellard2f3f9852017-01-25 01:25:13 +00005089 case Intrinsic::amdgcn_implicit_buffer_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005090 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
Matt Arsenault10fc0622017-06-26 03:01:31 +00005091 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005092 return getPreloadedValue(DAG, *MFI, VT,
5093 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
Tom Stellard2f3f9852017-01-25 01:25:13 +00005094 }
Tom Stellard48f29f22015-11-26 00:43:29 +00005095 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00005096 case Intrinsic::amdgcn_queue_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005097 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005098 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005099 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005100 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00005101 DAG.getContext()->diagnose(BadIntrin);
5102 return DAG.getUNDEF(VT);
5103 }
5104
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005105 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5106 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5107 return getPreloadedValue(DAG, *MFI, VT, RegID);
Matt Arsenault48ab5262016-04-25 19:27:18 +00005108 }
Jan Veselyfea814d2016-06-21 20:46:20 +00005109 case Intrinsic::amdgcn_implicitarg_ptr: {
Matt Arsenault9166ce82017-07-28 15:52:08 +00005110 if (MFI->isEntryFunction())
5111 return getImplicitArgPtr(DAG, DL);
Matt Arsenault817c2532017-08-03 23:12:44 +00005112 return getPreloadedValue(DAG, *MFI, VT,
5113 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
Jan Veselyfea814d2016-06-21 20:46:20 +00005114 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005115 case Intrinsic::amdgcn_kernarg_segment_ptr: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005116 return getPreloadedValue(DAG, *MFI, VT,
5117 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005118 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005119 case Intrinsic::amdgcn_dispatch_id: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005120 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005121 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005122 case Intrinsic::amdgcn_rcp:
5123 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5124 case Intrinsic::amdgcn_rsq:
5125 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005126 case Intrinsic::amdgcn_rsq_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005127 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005128 return emitRemovedIntrinsicError(DAG, DL, VT);
5129
5130 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005131 case Intrinsic::amdgcn_rcp_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005132 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault32fc5272016-07-26 16:45:45 +00005133 return emitRemovedIntrinsicError(DAG, DL, VT);
5134 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00005135 case Intrinsic::amdgcn_rsq_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005136 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00005137 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00005138
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005139 Type *Type = VT.getTypeForEVT(*DAG.getContext());
5140 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5141 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5142
5143 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5144 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5145 DAG.getConstantFP(Max, DL, VT));
5146 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5147 DAG.getConstantFP(Min, DL, VT));
5148 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005149 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005150 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005151 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005152
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005153 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005154 SI::KernelInputOffsets::NGROUPS_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005155 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005156 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005157 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005158
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005159 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005160 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005161 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005162 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005163 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005164
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005165 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005166 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005167 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005168 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005169 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005170
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005171 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005172 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005173 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005174 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005175 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005176
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005177 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005178 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005179 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005180 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005181 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005182
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005183 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005184 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005185 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005186 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005187 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005188
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005189 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5190 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005191 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005192 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005193 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005194
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005195 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5196 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005197 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005198 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005199 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005200
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005201 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5202 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00005203 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005204 case Intrinsic::r600_read_tgid_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005205 return getPreloadedValue(DAG, *MFI, VT,
5206 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Matt Arsenault43976df2016-01-30 04:25:19 +00005207 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005208 case Intrinsic::r600_read_tgid_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005209 return getPreloadedValue(DAG, *MFI, VT,
5210 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
Matt Arsenault43976df2016-01-30 04:25:19 +00005211 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005212 case Intrinsic::r600_read_tgid_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005213 return getPreloadedValue(DAG, *MFI, VT,
5214 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Reid Kleckner4dc0b1a2018-11-01 19:54:45 +00005215 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005216 case Intrinsic::r600_read_tidig_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005217 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5218 SDLoc(DAG.getEntryNode()),
5219 MFI->getArgInfo().WorkItemIDX);
Matt Arsenault43976df2016-01-30 04:25:19 +00005220 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005221 case Intrinsic::r600_read_tidig_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005222 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5223 SDLoc(DAG.getEntryNode()),
5224 MFI->getArgInfo().WorkItemIDY);
Matt Arsenault43976df2016-01-30 04:25:19 +00005225 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005226 case Intrinsic::r600_read_tidig_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005227 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5228 SDLoc(DAG.getEntryNode()),
5229 MFI->getArgInfo().WorkItemIDZ);
Tim Renouf904343f2018-08-25 14:53:17 +00005230 case Intrinsic::amdgcn_s_buffer_load: {
5231 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005232 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5233 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005234 }
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00005235 case Intrinsic::amdgcn_fdiv_fast:
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00005236 return lowerFDIV_FAST(Op, DAG);
Tom Stellard2187bb82016-12-06 23:52:13 +00005237 case Intrinsic::amdgcn_interp_mov: {
5238 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5239 SDValue Glue = M0.getValue(1);
5240 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5241 Op.getOperand(2), Op.getOperand(3), Glue);
5242 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00005243 case Intrinsic::amdgcn_interp_p1: {
5244 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5245 SDValue Glue = M0.getValue(1);
5246 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5247 Op.getOperand(2), Op.getOperand(3), Glue);
5248 }
5249 case Intrinsic::amdgcn_interp_p2: {
5250 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5251 SDValue Glue = SDValue(M0.getNode(), 1);
5252 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5253 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5254 Glue);
5255 }
Tim Corringham824ca3f2019-01-28 13:48:59 +00005256 case Intrinsic::amdgcn_interp_p1_f16: {
5257 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5258 SDValue Glue = M0.getValue(1);
5259 if (getSubtarget()->getLDSBankCount() == 16) {
5260 // 16 bank LDS
5261 SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5262 DAG.getConstant(2, DL, MVT::i32), // P0
5263 Op.getOperand(2), // Attrchan
5264 Op.getOperand(3), // Attr
5265 Glue);
5266 SDValue Ops[] = {
5267 Op.getOperand(1), // Src0
5268 Op.getOperand(2), // Attrchan
5269 Op.getOperand(3), // Attr
5270 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5271 S, // Src2 - holds two f16 values selected by high
5272 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5273 Op.getOperand(4), // high
5274 DAG.getConstant(0, DL, MVT::i1), // $clamp
5275 DAG.getConstant(0, DL, MVT::i32) // $omod
5276 };
5277 return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5278 } else {
5279 // 32 bank LDS
5280 SDValue Ops[] = {
5281 Op.getOperand(1), // Src0
5282 Op.getOperand(2), // Attrchan
5283 Op.getOperand(3), // Attr
5284 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5285 Op.getOperand(4), // high
5286 DAG.getConstant(0, DL, MVT::i1), // $clamp
5287 DAG.getConstant(0, DL, MVT::i32), // $omod
5288 Glue
5289 };
5290 return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5291 }
5292 }
5293 case Intrinsic::amdgcn_interp_p2_f16: {
5294 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5295 SDValue Glue = SDValue(M0.getNode(), 1);
5296 SDValue Ops[] = {
5297 Op.getOperand(2), // Src0
5298 Op.getOperand(3), // Attrchan
5299 Op.getOperand(4), // Attr
5300 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5301 Op.getOperand(1), // Src2
5302 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5303 Op.getOperand(5), // high
5304 DAG.getConstant(0, DL, MVT::i1), // $clamp
5305 Glue
5306 };
5307 return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5308 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005309 case Intrinsic::amdgcn_sin:
5310 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5311
5312 case Intrinsic::amdgcn_cos:
5313 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5314
5315 case Intrinsic::amdgcn_log_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005316 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005317 return SDValue();
5318
5319 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005320 MF.getFunction(), "intrinsic not supported on subtarget",
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005321 DL.getDebugLoc());
5322 DAG.getContext()->diagnose(BadIntrin);
5323 return DAG.getUNDEF(VT);
5324 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005325 case Intrinsic::amdgcn_ldexp:
5326 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5327 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00005328
5329 case Intrinsic::amdgcn_fract:
5330 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5331
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005332 case Intrinsic::amdgcn_class:
5333 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5334 Op.getOperand(1), Op.getOperand(2));
5335 case Intrinsic::amdgcn_div_fmas:
5336 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5337 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5338 Op.getOperand(4));
5339
5340 case Intrinsic::amdgcn_div_fixup:
5341 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5342 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5343
5344 case Intrinsic::amdgcn_trig_preop:
5345 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5346 Op.getOperand(1), Op.getOperand(2));
5347 case Intrinsic::amdgcn_div_scale: {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005348 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005349
5350 // Translate to the operands expected by the machine instruction. The
5351 // first parameter must be the same as the first instruction.
5352 SDValue Numerator = Op.getOperand(1);
5353 SDValue Denominator = Op.getOperand(2);
5354
5355 // Note this order is opposite of the machine instruction's operations,
5356 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5357 // intrinsic has the numerator as the first operand to match a normal
5358 // division operation.
5359
5360 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5361
5362 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5363 Denominator, Numerator);
5364 }
Wei Ding07e03712016-07-28 16:42:13 +00005365 case Intrinsic::amdgcn_icmp: {
Marek Olsak33eb4d92019-01-15 02:13:18 +00005366 // There is a Pat that handles this variant, so return it as-is.
5367 if (Op.getOperand(1).getValueType() == MVT::i1 &&
5368 Op.getConstantOperandVal(2) == 0 &&
5369 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5370 return Op;
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005371 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005372 }
5373 case Intrinsic::amdgcn_fcmp: {
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005374 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005375 }
Matt Arsenaultf84e5d92017-01-31 03:07:46 +00005376 case Intrinsic::amdgcn_fmed3:
5377 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5378 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Farhana Aleenc370d7b2018-07-16 18:19:59 +00005379 case Intrinsic::amdgcn_fdot2:
5380 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00005381 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5382 Op.getOperand(4));
Matt Arsenault32fc5272016-07-26 16:45:45 +00005383 case Intrinsic::amdgcn_fmul_legacy:
5384 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5385 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005386 case Intrinsic::amdgcn_sffbh:
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005387 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaultf5262252017-02-22 23:04:58 +00005388 case Intrinsic::amdgcn_sbfe:
5389 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5390 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5391 case Intrinsic::amdgcn_ubfe:
5392 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5393 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Marek Olsak13e47412018-01-31 20:18:04 +00005394 case Intrinsic::amdgcn_cvt_pkrtz:
5395 case Intrinsic::amdgcn_cvt_pknorm_i16:
5396 case Intrinsic::amdgcn_cvt_pknorm_u16:
5397 case Intrinsic::amdgcn_cvt_pk_i16:
5398 case Intrinsic::amdgcn_cvt_pk_u16: {
5399 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
Matt Arsenault1f17c662017-02-22 00:27:34 +00005400 EVT VT = Op.getValueType();
Marek Olsak13e47412018-01-31 20:18:04 +00005401 unsigned Opcode;
5402
5403 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5404 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5405 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5406 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5407 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5408 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5409 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5410 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5411 else
5412 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5413
Matt Arsenault709374d2018-08-01 20:13:58 +00005414 if (isTypeLegal(VT))
5415 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5416
Marek Olsak13e47412018-01-31 20:18:04 +00005417 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Matt Arsenault1f17c662017-02-22 00:27:34 +00005418 Op.getOperand(1), Op.getOperand(2));
5419 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5420 }
Connor Abbott8c217d02017-08-04 18:36:49 +00005421 case Intrinsic::amdgcn_wqm: {
5422 SDValue Src = Op.getOperand(1);
5423 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5424 0);
5425 }
Connor Abbott92638ab2017-08-04 18:36:52 +00005426 case Intrinsic::amdgcn_wwm: {
5427 SDValue Src = Op.getOperand(1);
5428 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5429 0);
5430 }
Stanislav Mekhanoshindacda792018-06-26 20:04:19 +00005431 case Intrinsic::amdgcn_fmad_ftz:
5432 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5433 Op.getOperand(2), Op.getOperand(3));
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005434 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005435 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5436 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5437 return lowerImage(Op, ImageDimIntr, DAG);
5438
Matt Arsenault754dd3e2017-04-03 18:08:08 +00005439 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005440 }
5441}
5442
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005443SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5444 SelectionDAG &DAG) const {
5445 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Tom Stellard6f9ef142016-12-20 17:19:44 +00005446 SDLoc DL(Op);
David Stuttard70e8bc12017-06-22 16:29:22 +00005447
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005448 switch (IntrID) {
Marek Olsakc5cec5e2019-01-16 15:43:53 +00005449 case Intrinsic::amdgcn_ds_ordered_add:
5450 case Intrinsic::amdgcn_ds_ordered_swap: {
5451 MemSDNode *M = cast<MemSDNode>(Op);
5452 SDValue Chain = M->getOperand(0);
5453 SDValue M0 = M->getOperand(2);
5454 SDValue Value = M->getOperand(3);
5455 unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5456 unsigned WaveRelease = M->getConstantOperandVal(8);
5457 unsigned WaveDone = M->getConstantOperandVal(9);
5458 unsigned ShaderType;
5459 unsigned Instruction;
5460
5461 switch (IntrID) {
5462 case Intrinsic::amdgcn_ds_ordered_add:
5463 Instruction = 0;
5464 break;
5465 case Intrinsic::amdgcn_ds_ordered_swap:
5466 Instruction = 1;
5467 break;
5468 }
5469
5470 if (WaveDone && !WaveRelease)
5471 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5472
5473 switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5474 case CallingConv::AMDGPU_CS:
5475 case CallingConv::AMDGPU_KERNEL:
5476 ShaderType = 0;
5477 break;
5478 case CallingConv::AMDGPU_PS:
5479 ShaderType = 1;
5480 break;
5481 case CallingConv::AMDGPU_VS:
5482 ShaderType = 2;
5483 break;
5484 case CallingConv::AMDGPU_GS:
5485 ShaderType = 3;
5486 break;
5487 default:
5488 report_fatal_error("ds_ordered_count unsupported for this calling conv");
5489 }
5490
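    // Pack the ordered-count parameters into the 16-bit immediate: offset0
    // (the low byte) carries the index shifted into bits [7:2], offset1 (the
    // high byte) carries wave_release, wave_done, the shader type and the
    // instruction selector.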
5491 unsigned Offset0 = OrderedCountIndex << 2;
5492 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5493 (Instruction << 4);
5494 unsigned Offset = Offset0 | (Offset1 << 8);
5495
5496 SDValue Ops[] = {
5497 Chain,
5498 Value,
5499 DAG.getTargetConstant(Offset, DL, MVT::i16),
5500 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5501 };
5502 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5503 M->getVTList(), Ops, M->getMemoryVT(),
5504 M->getMemOperand());
5505 }
Matt Arsenaulta5840c32019-01-22 18:36:06 +00005506 case Intrinsic::amdgcn_ds_fadd: {
5507 MemSDNode *M = cast<MemSDNode>(Op);
5508 unsigned Opc;
5509 switch (IntrID) {
5510 case Intrinsic::amdgcn_ds_fadd:
5511 Opc = ISD::ATOMIC_LOAD_FADD;
5512 break;
5513 }
5514
5515 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5516 M->getOperand(0), M->getOperand(2), M->getOperand(3),
5517 M->getMemOperand());
5518 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005519 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005520 case Intrinsic::amdgcn_atomic_dec:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005521 case Intrinsic::amdgcn_ds_fmin:
5522 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005523 MemSDNode *M = cast<MemSDNode>(Op);
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005524 unsigned Opc;
5525 switch (IntrID) {
5526 case Intrinsic::amdgcn_atomic_inc:
5527 Opc = AMDGPUISD::ATOMIC_INC;
5528 break;
5529 case Intrinsic::amdgcn_atomic_dec:
5530 Opc = AMDGPUISD::ATOMIC_DEC;
5531 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005532 case Intrinsic::amdgcn_ds_fmin:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005533 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5534 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005535 case Intrinsic::amdgcn_ds_fmax:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005536 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5537 break;
5538 default:
5539 llvm_unreachable("Unknown intrinsic!");
5540 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005541 SDValue Ops[] = {
5542 M->getOperand(0), // Chain
5543 M->getOperand(2), // Ptr
5544 M->getOperand(3) // Value
5545 };
5546
5547 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5548 M->getMemoryVT(), M->getMemOperand());
5549 }
Tom Stellard6f9ef142016-12-20 17:19:44 +00005550 case Intrinsic::amdgcn_buffer_load:
5551 case Intrinsic::amdgcn_buffer_load_format: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005552 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5553 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5554 unsigned IdxEn = 1;
5555 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5556 IdxEn = Idx->getZExtValue() != 0;
Tom Stellard6f9ef142016-12-20 17:19:44 +00005557 SDValue Ops[] = {
5558 Op.getOperand(0), // Chain
5559 Op.getOperand(2), // rsrc
5560 Op.getOperand(3), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005561 SDValue(), // voffset -- will be set by setBufferOffsets
5562 SDValue(), // soffset -- will be set by setBufferOffsets
5563 SDValue(), // offset -- will be set by setBufferOffsets
5564 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5565 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Tom Stellard6f9ef142016-12-20 17:19:44 +00005566 };
Tom Stellard6f9ef142016-12-20 17:19:44 +00005567
Tim Renouf4f703f52018-08-21 11:07:10 +00005568 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005569 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5570 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
Tim Renouf4f703f52018-08-21 11:07:10 +00005571
5572 EVT VT = Op.getValueType();
5573 EVT IntVT = VT.changeTypeToInteger();
5574 auto *M = cast<MemSDNode>(Op);
5575 EVT LoadVT = Op.getValueType();
5576
5577 if (LoadVT.getScalarType() == MVT::f16)
5578 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5579 M, DAG, Ops);
5580 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5581 M->getMemOperand());
5582 }
5583 case Intrinsic::amdgcn_raw_buffer_load:
5584 case Intrinsic::amdgcn_raw_buffer_load_format: {
5585 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5586 SDValue Ops[] = {
5587 Op.getOperand(0), // Chain
5588 Op.getOperand(2), // rsrc
5589 DAG.getConstant(0, DL, MVT::i32), // vindex
5590 Offsets.first, // voffset
5591 Op.getOperand(4), // soffset
5592 Offsets.second, // offset
5593 Op.getOperand(5), // cachepolicy
5594 DAG.getConstant(0, DL, MVT::i1), // idxen
5595 };
5596
5597 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5598 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5599
5600 EVT VT = Op.getValueType();
5601 EVT IntVT = VT.changeTypeToInteger();
5602 auto *M = cast<MemSDNode>(Op);
5603 EVT LoadVT = Op.getValueType();
5604
5605 if (LoadVT.getScalarType() == MVT::f16)
5606 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5607 M, DAG, Ops);
5608 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5609 M->getMemOperand());
5610 }
5611 case Intrinsic::amdgcn_struct_buffer_load:
5612 case Intrinsic::amdgcn_struct_buffer_load_format: {
5613 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5614 SDValue Ops[] = {
5615 Op.getOperand(0), // Chain
5616 Op.getOperand(2), // rsrc
5617 Op.getOperand(3), // vindex
5618 Offsets.first, // voffset
5619 Op.getOperand(5), // soffset
5620 Offsets.second, // offset
5621 Op.getOperand(6), // cachepolicy
5622 DAG.getConstant(1, DL, MVT::i1), // idxen
5623 };
5624
5625 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5626 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5627
Tom Stellard6f9ef142016-12-20 17:19:44 +00005628 EVT VT = Op.getValueType();
5629 EVT IntVT = VT.changeTypeToInteger();
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005630 auto *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005631 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005632
Tim Renouf366a49d2018-08-02 23:33:01 +00005633 if (LoadVT.getScalarType() == MVT::f16)
5634 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5635 M, DAG, Ops);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005636 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5637 M->getMemOperand());
Tom Stellard6f9ef142016-12-20 17:19:44 +00005638 }
David Stuttard70e8bc12017-06-22 16:29:22 +00005639 case Intrinsic::amdgcn_tbuffer_load: {
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005640 MemSDNode *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005641 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005642
Tim Renouf35484c92018-08-21 11:06:05 +00005643 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5644 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5645 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5646 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5647 unsigned IdxEn = 1;
5648 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5649 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00005650 SDValue Ops[] = {
5651 Op.getOperand(0), // Chain
5652 Op.getOperand(2), // rsrc
5653 Op.getOperand(3), // vindex
5654 Op.getOperand(4), // voffset
5655 Op.getOperand(5), // soffset
5656 Op.getOperand(6), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00005657 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5658 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5659 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5660 };
5661
5662 if (LoadVT.getScalarType() == MVT::f16)
5663 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5664 M, DAG, Ops);
5665 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5666 Op->getVTList(), Ops, LoadVT,
5667 M->getMemOperand());
5668 }
5669 case Intrinsic::amdgcn_raw_tbuffer_load: {
5670 MemSDNode *M = cast<MemSDNode>(Op);
5671 EVT LoadVT = Op.getValueType();
5672 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5673
5674 SDValue Ops[] = {
5675 Op.getOperand(0), // Chain
5676 Op.getOperand(2), // rsrc
5677 DAG.getConstant(0, DL, MVT::i32), // vindex
5678 Offsets.first, // voffset
5679 Op.getOperand(4), // soffset
5680 Offsets.second, // offset
5681 Op.getOperand(5), // format
5682 Op.getOperand(6), // cachepolicy
5683 DAG.getConstant(0, DL, MVT::i1), // idxen
5684 };
5685
5686 if (LoadVT.getScalarType() == MVT::f16)
5687 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5688 M, DAG, Ops);
5689 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5690 Op->getVTList(), Ops, LoadVT,
5691 M->getMemOperand());
5692 }
5693 case Intrinsic::amdgcn_struct_tbuffer_load: {
5694 MemSDNode *M = cast<MemSDNode>(Op);
5695 EVT LoadVT = Op.getValueType();
5696 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5697
5698 SDValue Ops[] = {
5699 Op.getOperand(0), // Chain
5700 Op.getOperand(2), // rsrc
5701 Op.getOperand(3), // vindex
5702 Offsets.first, // voffset
5703 Op.getOperand(5), // soffset
5704 Offsets.second, // offset
5705 Op.getOperand(6), // format
5706 Op.getOperand(7), // cachepolicy
5707 DAG.getConstant(1, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00005708 };
5709
Tim Renouf366a49d2018-08-02 23:33:01 +00005710 if (LoadVT.getScalarType() == MVT::f16)
5711 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5712 M, DAG, Ops);
David Stuttard70e8bc12017-06-22 16:29:22 +00005713 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Matt Arsenault1349a042018-05-22 06:32:10 +00005714 Op->getVTList(), Ops, LoadVT,
5715 M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00005716 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005717 case Intrinsic::amdgcn_buffer_atomic_swap:
5718 case Intrinsic::amdgcn_buffer_atomic_add:
5719 case Intrinsic::amdgcn_buffer_atomic_sub:
5720 case Intrinsic::amdgcn_buffer_atomic_smin:
5721 case Intrinsic::amdgcn_buffer_atomic_umin:
5722 case Intrinsic::amdgcn_buffer_atomic_smax:
5723 case Intrinsic::amdgcn_buffer_atomic_umax:
5724 case Intrinsic::amdgcn_buffer_atomic_and:
5725 case Intrinsic::amdgcn_buffer_atomic_or:
5726 case Intrinsic::amdgcn_buffer_atomic_xor: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005727 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5728 unsigned IdxEn = 1;
5729 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5730 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005731 SDValue Ops[] = {
5732 Op.getOperand(0), // Chain
5733 Op.getOperand(2), // vdata
5734 Op.getOperand(3), // rsrc
5735 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005736 SDValue(), // voffset -- will be set by setBufferOffsets
5737 SDValue(), // soffset -- will be set by setBufferOffsets
5738 SDValue(), // offset -- will be set by setBufferOffsets
5739 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5740 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005741 };
Tim Renouf4f703f52018-08-21 11:07:10 +00005742 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005743 EVT VT = Op.getValueType();
5744
5745 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005746 unsigned Opcode = 0;
5747
5748 switch (IntrID) {
5749 case Intrinsic::amdgcn_buffer_atomic_swap:
5750 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5751 break;
5752 case Intrinsic::amdgcn_buffer_atomic_add:
5753 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5754 break;
5755 case Intrinsic::amdgcn_buffer_atomic_sub:
5756 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5757 break;
5758 case Intrinsic::amdgcn_buffer_atomic_smin:
5759 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5760 break;
5761 case Intrinsic::amdgcn_buffer_atomic_umin:
5762 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5763 break;
5764 case Intrinsic::amdgcn_buffer_atomic_smax:
5765 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5766 break;
5767 case Intrinsic::amdgcn_buffer_atomic_umax:
5768 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5769 break;
5770 case Intrinsic::amdgcn_buffer_atomic_and:
5771 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5772 break;
5773 case Intrinsic::amdgcn_buffer_atomic_or:
5774 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5775 break;
5776 case Intrinsic::amdgcn_buffer_atomic_xor:
5777 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5778 break;
5779 default:
5780 llvm_unreachable("unhandled atomic opcode");
5781 }
5782
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005783 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5784 M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005785 }
Tim Renouf4f703f52018-08-21 11:07:10 +00005786 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5787 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5788 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5789 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5790 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5791 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5792 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5793 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5794 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5795 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5796 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5797 SDValue Ops[] = {
5798 Op.getOperand(0), // Chain
5799 Op.getOperand(2), // vdata
5800 Op.getOperand(3), // rsrc
5801 DAG.getConstant(0, DL, MVT::i32), // vindex
5802 Offsets.first, // voffset
5803 Op.getOperand(5), // soffset
5804 Offsets.second, // offset
5805 Op.getOperand(6), // cachepolicy
5806 DAG.getConstant(0, DL, MVT::i1), // idxen
5807 };
5808 EVT VT = Op.getValueType();
Marek Olsak5cec6412017-11-09 01:52:48 +00005809
Tim Renouf4f703f52018-08-21 11:07:10 +00005810 auto *M = cast<MemSDNode>(Op);
5811 unsigned Opcode = 0;
5812
5813 switch (IntrID) {
5814 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5815 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5816 break;
5817 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5818 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5819 break;
5820 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5821 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5822 break;
5823 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5824 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5825 break;
5826 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5827 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5828 break;
5829 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5830 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5831 break;
5832 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5833 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5834 break;
5835 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5836 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5837 break;
5838 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5839 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5840 break;
5841 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5842 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5843 break;
5844 default:
5845 llvm_unreachable("unhandled atomic opcode");
5846 }
5847
5848 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5849 M->getMemOperand());
5850 }
5851 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5852 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5853 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5854 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5855 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5856 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5857 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5858 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5859 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5860 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5861 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5862 SDValue Ops[] = {
5863 Op.getOperand(0), // Chain
5864 Op.getOperand(2), // vdata
5865 Op.getOperand(3), // rsrc
5866 Op.getOperand(4), // vindex
5867 Offsets.first, // voffset
5868 Op.getOperand(6), // soffset
5869 Offsets.second, // offset
5870 Op.getOperand(7), // cachepolicy
5871 DAG.getConstant(1, DL, MVT::i1), // idxen
5872 };
5873 EVT VT = Op.getValueType();
5874
5875 auto *M = cast<MemSDNode>(Op);
5876 unsigned Opcode = 0;
5877
5878 switch (IntrID) {
5879 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5880 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5881 break;
5882 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5883 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5884 break;
5885 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5886 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5887 break;
5888 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5889 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5890 break;
5891 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5892 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5893 break;
5894 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5895 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5896 break;
5897 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5898 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5899 break;
5900 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5901 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5902 break;
5903 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5904 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5905 break;
5906 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5907 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5908 break;
5909 default:
5910 llvm_unreachable("unhandled atomic opcode");
5911 }
5912
5913 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5914 M->getMemOperand());
5915 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005916 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005917 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5918 unsigned IdxEn = 1;
5919 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5920 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005921 SDValue Ops[] = {
5922 Op.getOperand(0), // Chain
5923 Op.getOperand(2), // src
5924 Op.getOperand(3), // cmp
5925 Op.getOperand(4), // rsrc
5926 Op.getOperand(5), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005927 SDValue(), // voffset -- will be set by setBufferOffsets
5928 SDValue(), // soffset -- will be set by setBufferOffsets
5929 SDValue(), // offset -- will be set by setBufferOffsets
5930 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5931 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5932 };
5933 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5934 EVT VT = Op.getValueType();
5935 auto *M = cast<MemSDNode>(Op);
5936
5937 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5938 Op->getVTList(), Ops, VT, M->getMemOperand());
5939 }
5940 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5941 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5942 SDValue Ops[] = {
5943 Op.getOperand(0), // Chain
5944 Op.getOperand(2), // src
5945 Op.getOperand(3), // cmp
5946 Op.getOperand(4), // rsrc
5947 DAG.getConstant(0, DL, MVT::i32), // vindex
5948 Offsets.first, // voffset
5949 Op.getOperand(6), // soffset
5950 Offsets.second, // offset
5951 Op.getOperand(7), // cachepolicy
5952 DAG.getConstant(0, DL, MVT::i1), // idxen
5953 };
5954 EVT VT = Op.getValueType();
5955 auto *M = cast<MemSDNode>(Op);
5956
5957 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5958 Op->getVTList(), Ops, VT, M->getMemOperand());
5959 }
5960 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5961 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5962 SDValue Ops[] = {
5963 Op.getOperand(0), // Chain
5964 Op.getOperand(2), // src
5965 Op.getOperand(3), // cmp
5966 Op.getOperand(4), // rsrc
5967 Op.getOperand(5), // vindex
5968 Offsets.first, // voffset
5969 Op.getOperand(7), // soffset
5970 Offsets.second, // offset
5971 Op.getOperand(8), // cachepolicy
5972 DAG.getConstant(1, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005973 };
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005974 EVT VT = Op.getValueType();
5975 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005976
5977 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005978 Op->getVTList(), Ops, VT, M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005979 }
5980
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005981 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005982 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5983 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5984 return lowerImage(Op, ImageDimIntr, DAG);
Matt Arsenault1349a042018-05-22 06:32:10 +00005985
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005986 return SDValue();
5987 }
5988}
5989
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005990SDValue SITargetLowering::handleD16VData(SDValue VData,
5991 SelectionDAG &DAG) const {
5992 EVT StoreVT = VData.getValueType();
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005993
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00005994 // No change for f16 and legal vector D16 types.
Matt Arsenault1349a042018-05-22 06:32:10 +00005995 if (!StoreVT.isVector())
5996 return VData;
5997
5998 SDLoc DL(VData);
5999 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
6000
6001 if (Subtarget->hasUnpackedD16VMem()) {
6002 // We need to unpack the packed data to store.
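    // With unpacked D16 memory instructions each 16-bit element is taken from
    // its own 32-bit register, so bitcast to integers and zero-extend every
    // element to i32; UnrollVectorOp expands the vector extend per element.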
6003 EVT IntStoreVT = StoreVT.changeTypeToInteger();
6004 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6005
6006 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6007 StoreVT.getVectorNumElements());
6008 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6009 return DAG.UnrollVectorOp(ZExt.getNode());
6010 }
6011
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006012 assert(isTypeLegal(StoreVT));
6013 return VData;
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006014}
6015
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006016SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6017 SelectionDAG &DAG) const {
Tom Stellardfc92e772015-05-12 14:18:14 +00006018 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006019 SDValue Chain = Op.getOperand(0);
6020 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
David Stuttard70e8bc12017-06-22 16:29:22 +00006021 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006022
6023 switch (IntrinsicID) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00006024 case Intrinsic::amdgcn_exp: {
Matt Arsenault4165efd2017-01-17 07:26:53 +00006025 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6026 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6027 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6028 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6029
6030 const SDValue Ops[] = {
6031 Chain,
6032 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6033 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6034 Op.getOperand(4), // src0
6035 Op.getOperand(5), // src1
6036 Op.getOperand(6), // src2
6037 Op.getOperand(7), // src3
6038 DAG.getTargetConstant(0, DL, MVT::i1), // compr
6039 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6040 };
6041
6042 unsigned Opc = Done->isNullValue() ?
6043 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6044 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6045 }
6046 case Intrinsic::amdgcn_exp_compr: {
6047 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6048 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6049 SDValue Src0 = Op.getOperand(4);
6050 SDValue Src1 = Op.getOperand(5);
6051 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6052 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6053
6054 SDValue Undef = DAG.getUNDEF(MVT::f32);
6055 const SDValue Ops[] = {
6056 Chain,
6057 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6058 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6059 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6060 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6061 Undef, // src2
6062 Undef, // src3
6063 DAG.getTargetConstant(1, DL, MVT::i1), // compr
6064 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6065 };
6066
6067 unsigned Opc = Done->isNullValue() ?
6068 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6069 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6070 }
6071 case Intrinsic::amdgcn_s_sendmsg:
Matt Arsenaultd3e5cb72017-02-16 02:01:17 +00006072 case Intrinsic::amdgcn_s_sendmsghalt: {
6073 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6074 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Tom Stellardfc92e772015-05-12 14:18:14 +00006075 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6076 SDValue Glue = Chain.getValue(1);
Matt Arsenaulta78ca622017-02-15 22:17:09 +00006077 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Jan Veselyd48445d2017-01-04 18:06:55 +00006078 Op.getOperand(2), Glue);
6079 }
Marek Olsak2d825902017-04-28 20:21:58 +00006080 case Intrinsic::amdgcn_init_exec: {
6081 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6082 Op.getOperand(2));
6083 }
6084 case Intrinsic::amdgcn_init_exec_from_input: {
6085 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6086 Op.getOperand(2), Op.getOperand(3));
6087 }
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006088 case Intrinsic::amdgcn_s_barrier: {
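    // When optimizing, a workgroup that fits in a single wave needs no
    // s_barrier; keep only a WAVE_BARRIER pseudo so other operations are not
    // reordered across this point.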
6089 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00006090 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matthias Braunf1caa282017-12-15 22:22:58 +00006091 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006092 if (WGSize <= ST.getWavefrontSize())
6093 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6094 Op.getOperand(0)), 0);
6095 }
6096 return SDValue();
6097 }
David Stuttard70e8bc12017-06-22 16:29:22 +00006098 case Intrinsic::amdgcn_tbuffer_store: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006099 SDValue VData = Op.getOperand(2);
6100 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6101 if (IsD16)
6102 VData = handleD16VData(VData, DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006103 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6104 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6105 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6106 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6107 unsigned IdxEn = 1;
6108 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6109 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00006110 SDValue Ops[] = {
6111 Chain,
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006112 VData, // vdata
David Stuttard70e8bc12017-06-22 16:29:22 +00006113 Op.getOperand(3), // rsrc
6114 Op.getOperand(4), // vindex
6115 Op.getOperand(5), // voffset
6116 Op.getOperand(6), // soffset
6117 Op.getOperand(7), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006118 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6119 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6120 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6121 };
6122 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6123 AMDGPUISD::TBUFFER_STORE_FORMAT;
6124 MemSDNode *M = cast<MemSDNode>(Op);
6125 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6126 M->getMemoryVT(), M->getMemOperand());
6127 }
6128
6129 case Intrinsic::amdgcn_struct_tbuffer_store: {
6130 SDValue VData = Op.getOperand(2);
6131 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6132 if (IsD16)
6133 VData = handleD16VData(VData, DAG);
6134 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6135 SDValue Ops[] = {
6136 Chain,
6137 VData, // vdata
6138 Op.getOperand(3), // rsrc
6139 Op.getOperand(4), // vindex
6140 Offsets.first, // voffset
6141 Op.getOperand(6), // soffset
6142 Offsets.second, // offset
6143 Op.getOperand(7), // format
6144 Op.getOperand(8), // cachepolicy
6145 DAG.getConstant(1, DL, MVT::i1), // idxen
6146 };
6147 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6148 AMDGPUISD::TBUFFER_STORE_FORMAT;
6149 MemSDNode *M = cast<MemSDNode>(Op);
6150 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6151 M->getMemoryVT(), M->getMemOperand());
6152 }
6153
6154 case Intrinsic::amdgcn_raw_tbuffer_store: {
6155 SDValue VData = Op.getOperand(2);
6156 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6157 if (IsD16)
6158 VData = handleD16VData(VData, DAG);
6159 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6160 SDValue Ops[] = {
6161 Chain,
6162 VData, // vdata
6163 Op.getOperand(3), // rsrc
6164 DAG.getConstant(0, DL, MVT::i32), // vindex
6165 Offsets.first, // voffset
6166 Op.getOperand(5), // soffset
6167 Offsets.second, // offset
6168 Op.getOperand(6), // format
6169 Op.getOperand(7), // cachepolicy
6170 DAG.getConstant(0, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006171 };
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006172 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6173 AMDGPUISD::TBUFFER_STORE_FORMAT;
6174 MemSDNode *M = cast<MemSDNode>(Op);
6175 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6176 M->getMemoryVT(), M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00006177 }
6178
Marek Olsak5cec6412017-11-09 01:52:48 +00006179 case Intrinsic::amdgcn_buffer_store:
6180 case Intrinsic::amdgcn_buffer_store_format: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006181 SDValue VData = Op.getOperand(2);
6182 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6183 if (IsD16)
6184 VData = handleD16VData(VData, DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00006185 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6186 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6187 unsigned IdxEn = 1;
6188 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6189 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006190 SDValue Ops[] = {
6191 Chain,
Tim Renouf4f703f52018-08-21 11:07:10 +00006192 VData,
Marek Olsak5cec6412017-11-09 01:52:48 +00006193 Op.getOperand(3), // rsrc
6194 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006195 SDValue(), // voffset -- will be set by setBufferOffsets
6196 SDValue(), // soffset -- will be set by setBufferOffsets
6197 SDValue(), // offset -- will be set by setBufferOffsets
6198 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6199 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006200 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006201 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006202 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6203 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6204 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6205 MemSDNode *M = cast<MemSDNode>(Op);
6206 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6207 M->getMemoryVT(), M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006208 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006209
6210 case Intrinsic::amdgcn_raw_buffer_store:
6211 case Intrinsic::amdgcn_raw_buffer_store_format: {
6212 SDValue VData = Op.getOperand(2);
6213 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6214 if (IsD16)
6215 VData = handleD16VData(VData, DAG);
6216 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6217 SDValue Ops[] = {
6218 Chain,
6219 VData,
6220 Op.getOperand(3), // rsrc
6221 DAG.getConstant(0, DL, MVT::i32), // vindex
6222 Offsets.first, // voffset
6223 Op.getOperand(5), // soffset
6224 Offsets.second, // offset
6225 Op.getOperand(6), // cachepolicy
6226 DAG.getConstant(0, DL, MVT::i1), // idxen
6227 };
6228 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6229 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6230 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6231 MemSDNode *M = cast<MemSDNode>(Op);
6232 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6233 M->getMemoryVT(), M->getMemOperand());
6234 }
6235
6236 case Intrinsic::amdgcn_struct_buffer_store:
6237 case Intrinsic::amdgcn_struct_buffer_store_format: {
6238 SDValue VData = Op.getOperand(2);
6239 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6240 if (IsD16)
6241 VData = handleD16VData(VData, DAG);
6242 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6243 SDValue Ops[] = {
6244 Chain,
6245 VData,
6246 Op.getOperand(3), // rsrc
6247 Op.getOperand(4), // vindex
6248 Offsets.first, // voffset
6249 Op.getOperand(6), // soffset
6250 Offsets.second, // offset
6251 Op.getOperand(7), // cachepolicy
6252 DAG.getConstant(1, DL, MVT::i1), // idxen
6253 };
6254 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6255 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6256 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6257 MemSDNode *M = cast<MemSDNode>(Op);
6258 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6259 M->getMemoryVT(), M->getMemOperand());
6260 }
6261
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006262 default: {
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006263 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6264 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6265 return lowerImage(Op, ImageDimIntr, DAG);
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006266
Matt Arsenault754dd3e2017-04-03 18:08:08 +00006267 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006268 }
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006269 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006270}
6271
Tim Renouf4f703f52018-08-21 11:07:10 +00006272// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6273// offset (the offset that is included in bounds checking and swizzling, to be
6274// split between the instruction's voffset and immoffset fields) and soffset
6275// (the offset that is excluded from bounds checking and swizzling, to go in
6276// the instruction's soffset field). This function takes the first kind of
6277// offset and figures out how to split it between voffset and immoffset.
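// For example, a constant combined offset of 8200 is split into a voffset
// contribution of 8192 and an immoffset of 8, since the instruction's
// immoffset field can only hold values up to 4095.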
Tim Renouf35484c92018-08-21 11:06:05 +00006278std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6279 SDValue Offset, SelectionDAG &DAG) const {
6280 SDLoc DL(Offset);
6281 const unsigned MaxImm = 4095;
6282 SDValue N0 = Offset;
6283 ConstantSDNode *C1 = nullptr;
Piotr Sobczak378131b2019-01-02 09:47:41 +00006284
6285 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
Tim Renouf35484c92018-08-21 11:06:05 +00006286 N0 = SDValue();
Piotr Sobczak378131b2019-01-02 09:47:41 +00006287 else if (DAG.isBaseWithConstantOffset(N0)) {
6288 C1 = cast<ConstantSDNode>(N0.getOperand(1));
6289 N0 = N0.getOperand(0);
6290 }
Tim Renouf35484c92018-08-21 11:06:05 +00006291
6292 if (C1) {
6293 unsigned ImmOffset = C1->getZExtValue();
6294 // If the immediate value is too big for the immoffset field, keep only its
Tim Renoufa37679d2018-10-03 10:29:43 +00006295 // low 12 bits (value & 4095) there and move the remaining multiple of 4096
Tim Renouf35484c92018-08-21 11:06:05 +00006296 // into the voffset field, so that the copied/added voffset value stands more
6297 // chance of being CSEd with the copy/add for another similar load/store.
Tim Renoufa37679d2018-10-03 10:29:43 +00006298 // However, do not do that rounding down to a multiple of 4096 if that is a
6299 // negative number, as it appears to be illegal to have a negative offset
6300 // in the vgpr, even if adding the immediate offset makes it positive.
Tim Renouf35484c92018-08-21 11:06:05 +00006301 unsigned Overflow = ImmOffset & ~MaxImm;
6302 ImmOffset -= Overflow;
Tim Renoufa37679d2018-10-03 10:29:43 +00006303 if ((int32_t)Overflow < 0) {
6304 Overflow += ImmOffset;
6305 ImmOffset = 0;
6306 }
Tim Renouf35484c92018-08-21 11:06:05 +00006307 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6308 if (Overflow) {
6309 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6310 if (!N0)
6311 N0 = OverflowVal;
6312 else {
6313 SDValue Ops[] = { N0, OverflowVal };
6314 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6315 }
6316 }
6317 }
6318 if (!N0)
6319 N0 = DAG.getConstant(0, DL, MVT::i32);
6320 if (!C1)
6321 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6322 return {N0, SDValue(C1, 0)};
6323}
6324
Tim Renouf4f703f52018-08-21 11:07:10 +00006325// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6326// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6327// pointed to by Offsets.
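// If the combined offset cannot be split this way, the whole value is placed
// in the voffset slot and soffset/instoffset are set to zero.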
6328void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006329 SelectionDAG &DAG, SDValue *Offsets,
6330 unsigned Align) const {
Tim Renouf4f703f52018-08-21 11:07:10 +00006331 SDLoc DL(CombinedOffset);
6332 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6333 uint32_t Imm = C->getZExtValue();
6334 uint32_t SOffset, ImmOffset;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006335 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006336 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6337 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6338 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6339 return;
6340 }
6341 }
6342 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6343 SDValue N0 = CombinedOffset.getOperand(0);
6344 SDValue N1 = CombinedOffset.getOperand(1);
6345 uint32_t SOffset, ImmOffset;
6346 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006347 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6348 Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006349 Offsets[0] = N0;
6350 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6351 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6352 return;
6353 }
6354 }
6355 Offsets[0] = CombinedOffset;
6356 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6357 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6358}
6359
Matt Arsenault90083d32018-06-07 09:54:49 +00006360static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6361 ISD::LoadExtType ExtType, SDValue Op,
6362 const SDLoc &SL, EVT VT) {
6363 if (VT.bitsLT(Op.getValueType()))
6364 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6365
6366 switch (ExtType) {
6367 case ISD::SEXTLOAD:
6368 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6369 case ISD::ZEXTLOAD:
6370 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6371 case ISD::EXTLOAD:
6372 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6373 case ISD::NON_EXTLOAD:
6374 return Op;
6375 }
6376
6377 llvm_unreachable("invalid ext type");
6378}
6379
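// Widen a uniform, dword-aligned sub-dword load from constant (or invariant
// global) memory into a full 32-bit load, then truncate or extend the result
// back to the original type.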
6380SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6381 SelectionDAG &DAG = DCI.DAG;
6382 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6383 return SDValue();
6384
6385 // FIXME: Constant loads should all be marked invariant.
6386 unsigned AS = Ld->getAddressSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00006387 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6388 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Matt Arsenault90083d32018-06-07 09:54:49 +00006389 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6390 return SDValue();
6391
6392 // Don't do this early, since it may interfere with adjacent load merging for
6393 // illegal types. We can avoid losing alignment information for exotic types
6394 // pre-legalize.
6395 EVT MemVT = Ld->getMemoryVT();
6396 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6397 MemVT.getSizeInBits() >= 32)
6398 return SDValue();
6399
6400 SDLoc SL(Ld);
6401
6402 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6403 "unexpected vector extload");
6404
6405 // TODO: Drop only high part of range.
6406 SDValue Ptr = Ld->getBasePtr();
6407 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6408 MVT::i32, SL, Ld->getChain(), Ptr,
6409 Ld->getOffset(),
6410 Ld->getPointerInfo(), MVT::i32,
6411 Ld->getAlignment(),
6412 Ld->getMemOperand()->getFlags(),
6413 Ld->getAAInfo(),
6414 nullptr); // Drop ranges
6415
6416 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6417 if (MemVT.isFloatingPoint()) {
6418 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6419 "unexpected fp extload");
6420 TruncVT = MemVT.changeTypeToInteger();
6421 }
6422
6423 SDValue Cvt = NewLoad;
6424 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6425 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6426 DAG.getValueType(TruncVT));
6427 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6428 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6429 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6430 } else {
6431 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6432 }
6433
6434 EVT VT = Ld->getValueType(0);
6435 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6436
6437 DCI.AddToWorklist(Cvt.getNode());
6438
6439 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6440 // the appropriate extension from the 32-bit load.
6441 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6442 DCI.AddToWorklist(Cvt.getNode());
6443
6444 // Handle conversion back to floating point if necessary.
6445 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6446
6447 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6448}
6449
Tom Stellard81d871d2013-11-13 23:36:50 +00006450SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6451 SDLoc DL(Op);
6452 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006453 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00006454 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00006455
Matt Arsenaulta1436412016-02-10 18:21:45 +00006456 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
Matt Arsenault65ca292a2017-09-07 05:37:34 +00006457 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6458 return SDValue();
6459
Matt Arsenault6dfda962016-02-10 18:21:39 +00006460 // FIXME: Copied from PPC
6461 // First, load into 32 bits, then truncate to 1 bit.
6462
6463 SDValue Chain = Load->getChain();
6464 SDValue BasePtr = Load->getBasePtr();
6465 MachineMemOperand *MMO = Load->getMemOperand();
6466
Tom Stellard115a6152016-11-10 16:02:37 +00006467 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6468
Matt Arsenault6dfda962016-02-10 18:21:39 +00006469 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
Tom Stellard115a6152016-11-10 16:02:37 +00006470 BasePtr, RealMemVT, MMO);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006471
6472 SDValue Ops[] = {
Matt Arsenaulta1436412016-02-10 18:21:45 +00006473 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
Matt Arsenault6dfda962016-02-10 18:21:39 +00006474 NewLD.getValue(1)
6475 };
6476
6477 return DAG.getMergeValues(Ops, DL);
6478 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006479
Matt Arsenaulta1436412016-02-10 18:21:45 +00006480 if (!MemVT.isVector())
6481 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006482
Matt Arsenaulta1436412016-02-10 18:21:45 +00006483 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6484 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006485
Farhana Aleen89196642018-03-07 17:09:18 +00006486 unsigned Alignment = Load->getAlignment();
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006487 unsigned AS = Load->getAddressSpace();
6488 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
Farhana Aleen89196642018-03-07 17:09:18 +00006489 AS, Alignment)) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006490 SDValue Ops[2];
6491 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6492 return DAG.getMergeValues(Ops, DL);
6493 }
6494
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006495 MachineFunction &MF = DAG.getMachineFunction();
6496 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6497 // If there is a possibility that flat instructions access scratch memory
6498 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006499 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006500 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006501 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006502
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006503 unsigned NumElements = MemVT.getVectorNumElements();
Matt Arsenault6c041a32018-03-29 19:59:28 +00006504
Matt Arsenault0da63502018-08-31 05:49:54 +00006505 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6506 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +00006507 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006508 return SDValue();
6509 // Non-uniform loads will be selected to MUBUF instructions, so they
Alexander Timofeev18009562016-12-08 17:28:47 +00006510 // have the same legalization requirements as global and private
Matt Arsenaulta1436412016-02-10 18:21:45 +00006511 // loads.
6512 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006513 }
Matt Arsenault6c041a32018-03-29 19:59:28 +00006514
Matt Arsenault0da63502018-08-31 05:49:54 +00006515 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6516 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6517 AS == AMDGPUAS::GLOBAL_ADDRESS) {
Alexander Timofeev2e5eece2018-03-05 15:12:21 +00006518 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Farhana Aleen89196642018-03-07 17:09:18 +00006519 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +00006520 Alignment >= 4 && NumElements < 32)
Alexander Timofeev18009562016-12-08 17:28:47 +00006521 return SDValue();
6522 // Non-uniform loads will be selected to MUBUF instructions, so they
6523 // have the same legalization requirements as global and private
6524 // loads.
6525 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006526 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006527 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6528 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6529 AS == AMDGPUAS::GLOBAL_ADDRESS ||
6530 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006531 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006532 return SplitVectorLoad(Op, DAG);
6533 // v4 loads are supported for private and global memory.
6534 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006535 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006536 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006537 // Depending on the setting of the private_element_size field in the
6538 // resource descriptor, we can only make private accesses up to a certain
6539 // size.
6540 switch (Subtarget->getMaxPrivateElementSize()) {
6541 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006542 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006543 case 8:
6544 if (NumElements > 2)
6545 return SplitVectorLoad(Op, DAG);
6546 return SDValue();
6547 case 16:
6548 // Same as global/flat
6549 if (NumElements > 4)
6550 return SplitVectorLoad(Op, DAG);
6551 return SDValue();
6552 default:
6553 llvm_unreachable("unsupported private_element_size");
6554 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006555 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleena7cb3112018-03-09 17:41:39 +00006556 // Use ds_read_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006557 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
Farhana Aleena7cb3112018-03-09 17:41:39 +00006558 MemVT.getStoreSize() == 16)
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006559 return SDValue();
6560
Farhana Aleena7cb3112018-03-09 17:41:39 +00006561 if (NumElements > 2)
6562 return SplitVectorLoad(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006563
6564 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6565 // address is negative, then the instruction is incorrectly treated as
6566 // out-of-bounds even if base + offsets is in bounds. Split vectorized
6567 // loads here to avoid emitting ds_read2_b32. We may re-combine the
6568 // load later in the SILoadStoreOptimizer.
6569 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6570 NumElements == 2 && MemVT.getStoreSize() == 8 &&
6571 Load->getAlignment() < 8) {
6572 return SplitVectorLoad(Op, DAG);
6573 }
Tom Stellarde9373602014-01-22 19:24:14 +00006574 }
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006575 return SDValue();
Tom Stellard81d871d2013-11-13 23:36:50 +00006576}
6577
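// Lower a 64-bit select by bitcasting both operands to v2i32, selecting the
// low and high halves separately, and reassembling the result.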
Tom Stellard0ec134f2014-02-04 17:18:40 +00006578SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006579 EVT VT = Op.getValueType();
6580 assert(VT.getSizeInBits() == 64);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006581
6582 SDLoc DL(Op);
6583 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006584
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006585 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6586 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006587
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006588 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6589 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6590
6591 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6592 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006593
6594 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6595
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006596 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6597 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006598
6599 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6600
Ahmed Bougacha128f8732016-04-26 21:15:30 +00006601 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006602 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006603}
6604
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006605// Catch division cases where we can use shortcuts with rcp and rsq
6606// instructions.
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006607SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6608 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006609 SDLoc SL(Op);
6610 SDValue LHS = Op.getOperand(0);
6611 SDValue RHS = Op.getOperand(1);
6612 EVT VT = Op.getValueType();
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006613 const SDNodeFlags Flags = Op->getFlags();
Michael Berg7acc81b2018-05-04 18:48:20 +00006614 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006615
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006616 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6617 return SDValue();
6618
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006619 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006620 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
Matt Arsenault979902b2016-08-02 22:25:04 +00006621 if (CLHS->isExactlyValue(1.0)) {
6622 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6623 // the CI documentation have a worst case error of 1 ulp.
6624 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6625 // use it as long as we aren't trying to use denormals.
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006626 //
6627 // v_rcp_f16 and v_rsq_f16 DO support denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006628
Matt Arsenault979902b2016-08-02 22:25:04 +00006629 // 1.0 / sqrt(x) -> rsq(x)
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006630
Matt Arsenault979902b2016-08-02 22:25:04 +00006631 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6632 // error seems really high at 2^29 ULP.
6633 if (RHS.getOpcode() == ISD::FSQRT)
6634 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6635
6636 // 1.0 / x -> rcp(x)
6637 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6638 }
6639
6640 // Same as for 1.0, but expand the sign out of the constant.
6641 if (CLHS->isExactlyValue(-1.0)) {
6642 // -1.0 / x -> rcp (fneg x)
6643 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6644 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6645 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006646 }
6647 }
6648
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006649 if (Unsafe) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006650 // Turn into multiply by the reciprocal.
6651 // x / y -> x * (1.0 / y)
6652 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006653 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006654 }
6655
6656 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006657}
6658
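// Emit an FP binary/ternary op either as the plain ISD node or, when GlueChain
// carries a chain and glue (three results), as the chained AMDGPU *_W_CHAIN
// form so the operation stays ordered relative to the mode (SETREG) changes in
// LowerFDIV32.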
Tom Stellard8485fa02016-12-07 02:42:15 +00006659static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6660 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6661 if (GlueChain->getNumValues() <= 1) {
6662 return DAG.getNode(Opcode, SL, VT, A, B);
6663 }
6664
6665 assert(GlueChain->getNumValues() == 3);
6666
6667 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6668 switch (Opcode) {
6669 default: llvm_unreachable("no chain equivalent for opcode");
6670 case ISD::FMUL:
6671 Opcode = AMDGPUISD::FMUL_W_CHAIN;
6672 break;
6673 }
6674
6675 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6676 GlueChain.getValue(2));
6677}
6678
6679static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6680 EVT VT, SDValue A, SDValue B, SDValue C,
6681 SDValue GlueChain) {
6682 if (GlueChain->getNumValues() <= 1) {
6683 return DAG.getNode(Opcode, SL, VT, A, B, C);
6684 }
6685
6686 assert(GlueChain->getNumValues() == 3);
6687
6688 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6689 switch (Opcode) {
6690 default: llvm_unreachable("no chain equivalent for opcode");
6691 case ISD::FMA:
6692 Opcode = AMDGPUISD::FMA_W_CHAIN;
6693 break;
6694 }
6695
6696 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6697 GlueChain.getValue(2));
6698}
6699
Matt Arsenault4052a572016-12-22 03:05:41 +00006700SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006701 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6702 return FastLowered;
6703
Matt Arsenault4052a572016-12-22 03:05:41 +00006704 SDLoc SL(Op);
6705 SDValue Src0 = Op.getOperand(0);
6706 SDValue Src1 = Op.getOperand(1);
6707
6708 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6709 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6710
6711 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6712 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6713
6714 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6715 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6716
6717 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6718}
6719
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006720// Faster 2.5 ULP division that does not support denormals.
6721SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6722 SDLoc SL(Op);
6723 SDValue LHS = Op.getOperand(1);
6724 SDValue RHS = Op.getOperand(2);
6725
6726 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6727
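  // If |RHS| is larger than K0 = 0x6f800000 (2^96), pre-scale it by
  // K1 = 0x2f800000 (2^-32) before taking the reciprocal, and multiply the
  // quotient by the same factor afterwards; this keeps the reciprocal out of
  // the denormal range, which rcp does not handle (see below).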
6728 const APFloat K0Val(BitsToFloat(0x6f800000));
6729 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6730
6731 const APFloat K1Val(BitsToFloat(0x2f800000));
6732 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6733
6734 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6735
6736 EVT SetCCVT =
6737 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6738
6739 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6740
6741 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6742
6743 // TODO: Should this propagate fast-math-flags?
6744 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6745
6746 // rcp does not support denormals.
6747 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6748
6749 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6750
6751 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6752}
6753
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006754SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006755 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00006756 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006757
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006758 SDLoc SL(Op);
6759 SDValue LHS = Op.getOperand(0);
6760 SDValue RHS = Op.getOperand(1);
6761
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006762 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006763
Wei Dinged0f97f2016-06-09 19:17:15 +00006764 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006765
Tom Stellard8485fa02016-12-07 02:42:15 +00006766 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6767 RHS, RHS, LHS);
6768 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6769 LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006770
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00006771 // Denominator is scaled to not be denormal, so using rcp is ok.
Tom Stellard8485fa02016-12-07 02:42:15 +00006772 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6773 DenominatorScaled);
6774 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6775 DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006776
Tom Stellard8485fa02016-12-07 02:42:15 +00006777 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6778 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
6779 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006780
Tom Stellard8485fa02016-12-07 02:42:15 +00006781 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006782
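  // When FP32 denormals are flushed by default, temporarily switch the MODE
  // register to allow them around the FMA refinement sequence (the scaled
  // values produced by DIV_SCALE can be denormal), then restore the
  // flush-denormals mode after Fma4 below.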
Tom Stellard8485fa02016-12-07 02:42:15 +00006783 if (!Subtarget->hasFP32Denormals()) {
6784 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6785 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6786 SL, MVT::i32);
6787 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6788 DAG.getEntryNode(),
6789 EnableDenormValue, BitField);
6790 SDValue Ops[3] = {
6791 NegDivScale0,
6792 EnableDenorm.getValue(0),
6793 EnableDenorm.getValue(1)
6794 };
Matt Arsenault37fefd62016-06-10 02:18:02 +00006795
Tom Stellard8485fa02016-12-07 02:42:15 +00006796 NegDivScale0 = DAG.getMergeValues(Ops, SL);
6797 }
6798
6799 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6800 ApproxRcp, One, NegDivScale0);
6801
6802 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6803 ApproxRcp, Fma0);
6804
6805 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6806 Fma1, Fma1);
6807
6808 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6809 NumeratorScaled, Mul);
6810
6811 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6812
6813 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6814 NumeratorScaled, Fma3);
6815
6816 if (!Subtarget->hasFP32Denormals()) {
6817 const SDValue DisableDenormValue =
6818 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6819 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6820 Fma4.getValue(1),
6821 DisableDenormValue,
6822 BitField,
6823 Fma4.getValue(2));
6824
6825 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6826 DisableDenorm, DAG.getRoot());
6827 DAG.setRoot(OutputChain);
6828 }
Matt Arsenault37fefd62016-06-10 02:18:02 +00006829
Wei Dinged0f97f2016-06-09 19:17:15 +00006830 SDValue Scale = NumeratorScaled.getValue(1);
Tom Stellard8485fa02016-12-07 02:42:15 +00006831 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6832 Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00006833
Wei Dinged0f97f2016-06-09 19:17:15 +00006834 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006835}
6836
6837SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006838 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006839 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006840
6841 SDLoc SL(Op);
6842 SDValue X = Op.getOperand(0);
6843 SDValue Y = Op.getOperand(1);
6844
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006845 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006846
6847 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6848
6849 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6850
6851 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6852
6853 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6854
6855 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6856
6857 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6858
6859 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6860
6861 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6862
6863 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6864 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6865
6866 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6867 NegDivScale0, Mul, DivScale1);
6868
6869 SDValue Scale;
6870
Tom Stellard5bfbae52018-07-11 20:59:01 +00006871 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006872 // Work around a hardware bug on SI where the condition output from div_scale
6873 // is not usable.
6874
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006875 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00006876
6877 // Figure out if the scale to use for div_fmas.
6878 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6879 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6880 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6881 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6882
6883 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6884 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6885
6886 SDValue Scale0Hi
6887 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6888 SDValue Scale1Hi
6889 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6890
6891 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6892 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6893 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6894 } else {
6895 Scale = DivScale1.getValue(1);
6896 }
6897
6898 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6899 Fma4, Fma3, Mul, Scale);
6900
6901 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006902}
6903
6904SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6905 EVT VT = Op.getValueType();
6906
6907 if (VT == MVT::f32)
6908 return LowerFDIV32(Op, DAG);
6909
6910 if (VT == MVT::f64)
6911 return LowerFDIV64(Op, DAG);
6912
Matt Arsenault4052a572016-12-22 03:05:41 +00006913 if (VT == MVT::f16)
6914 return LowerFDIV16(Op, DAG);
6915
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006916 llvm_unreachable("Unexpected type for fdiv");
6917}
6918
Tom Stellard81d871d2013-11-13 23:36:50 +00006919SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6920 SDLoc DL(Op);
6921 StoreSDNode *Store = cast<StoreSDNode>(Op);
6922 EVT VT = Store->getMemoryVT();
6923
Matt Arsenault95245662016-02-11 05:32:46 +00006924 if (VT == MVT::i1) {
6925 return DAG.getTruncStore(Store->getChain(), DL,
6926 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6927 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00006928 }
6929
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006930 assert(VT.isVector() &&
6931 Store->getValue().getValueType().getScalarType() == MVT::i32);
6932
6933 unsigned AS = Store->getAddressSpace();
6934 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6935 AS, Store->getAlignment())) {
6936 return expandUnalignedStore(Store, DAG);
6937 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006938
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006939 MachineFunction &MF = DAG.getMachineFunction();
6940 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6941 // If there is a possibility that flat instructions access scratch memory
6942 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006943 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006944 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006945 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006946
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006947 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenault0da63502018-08-31 05:49:54 +00006948 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6949 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006950 if (NumElements > 4)
6951 return SplitVectorStore(Op, DAG);
6952 return SDValue();
Matt Arsenault0da63502018-08-31 05:49:54 +00006953 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006954 switch (Subtarget->getMaxPrivateElementSize()) {
6955 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006956 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006957 case 8:
6958 if (NumElements > 2)
6959 return SplitVectorStore(Op, DAG);
6960 return SDValue();
6961 case 16:
6962 if (NumElements > 4)
6963 return SplitVectorStore(Op, DAG);
6964 return SDValue();
6965 default:
6966 llvm_unreachable("unsupported private_element_size");
6967 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006968 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006969 // Use ds_write_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006970 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006971 VT.getStoreSize() == 16)
6972 return SDValue();
6973
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006974 if (NumElements > 2)
6975 return SplitVectorStore(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006976
6977 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6978 // address is negative, then the instruction is incorrectly treated as
6979 // out-of-bounds even if base + offsets is in bounds. Split vectorized
6980 // stores here to avoid emitting ds_write2_b32. We may re-combine the
6981 // store later in the SILoadStoreOptimizer.
6982 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6983 NumElements == 2 && VT.getStoreSize() == 8 &&
6984 Store->getAlignment() < 8) {
6985 return SplitVectorStore(Op, DAG);
6986 }
6987
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00006988 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006989 } else {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006990 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00006991 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006992}
6993
Matt Arsenaultad14ce82014-07-19 18:44:39 +00006994SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006995 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00006996 EVT VT = Op.getValueType();
6997 SDValue Arg = Op.getOperand(0);
David Stuttard20de3e92018-09-14 10:27:19 +00006998 SDValue TrigVal;
6999
Sanjay Patela2607012015-09-16 16:31:21 +00007000 // TODO: Should this propagate fast-math-flags?
David Stuttard20de3e92018-09-14 10:27:19 +00007001
7002 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7003
7004 if (Subtarget->hasTrigReducedRange()) {
7005 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7006 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7007 } else {
7008 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7009 }
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007010
7011 switch (Op.getOpcode()) {
7012 case ISD::FCOS:
David Stuttard20de3e92018-09-14 10:27:19 +00007013 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007014 case ISD::FSIN:
David Stuttard20de3e92018-09-14 10:27:19 +00007015 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007016 default:
7017 llvm_unreachable("Wrong trig opcode");
7018 }
7019}
7020
Tom Stellard354a43c2016-04-01 18:27:37 +00007021SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7022 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7023 assert(AtomicNode->isCompareAndSwap());
7024 unsigned AS = AtomicNode->getAddressSpace();
7025
7026 // No custom lowering required for local address space
Matt Arsenault0da63502018-08-31 05:49:54 +00007027 if (!isFlatGlobalAddrSpace(AS))
Tom Stellard354a43c2016-04-01 18:27:37 +00007028 return Op;
7029
7030 // Non-local address space requires custom lowering for atomic compare
7031 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
7032 SDLoc DL(Op);
7033 SDValue ChainIn = Op.getOperand(0);
7034 SDValue Addr = Op.getOperand(1);
7035 SDValue Old = Op.getOperand(2);
7036 SDValue New = Op.getOperand(3);
7037 EVT VT = Op.getValueType();
7038 MVT SimpleVT = VT.getSimpleVT();
7039 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7040
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007041 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00007042 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00007043
7044 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7045 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00007046}
7047
Tom Stellard75aadc22012-12-11 21:25:42 +00007048//===----------------------------------------------------------------------===//
7049// Custom DAG optimizations
7050//===----------------------------------------------------------------------===//
7051
Matt Arsenault364a6742014-06-11 17:50:44 +00007052SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00007053 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00007054 EVT VT = N->getValueType(0);
7055 EVT ScalarVT = VT.getScalarType();
7056 if (ScalarVT != MVT::f32)
7057 return SDValue();
7058
7059 SelectionDAG &DAG = DCI.DAG;
7060 SDLoc DL(N);
7061
7062 SDValue Src = N->getOperand(0);
7063 EVT SrcVT = Src.getValueType();
7064
7065 // TODO: We could try to match extracting the higher bytes, which would be
7066 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7067 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7068 // about in practice.
Craig Topper80d3bb32018-03-06 19:44:52 +00007069 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
Matt Arsenault364a6742014-06-11 17:50:44 +00007070 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7071 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7072 DCI.AddToWorklist(Cvt.getNode());
7073 return Cvt;
7074 }
7075 }
7076
Matt Arsenault364a6742014-06-11 17:50:44 +00007077 return SDValue();
7078}
7079
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007080// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7081
7082// This is a variant of
7083// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7084//
7085// The normal DAG combiner will do this, but only if the add has one use since
7086// that would increase the number of instructions.
7087//
7088// This prevents us from seeing a constant offset that can be folded into a
7089// memory instruction's addressing mode. If we know the resulting add offset of
7090// a pointer can be folded into an addressing offset, we can replace the pointer
7091// operand with the add of the new constant offset. This eliminates one of the uses,
7092// and may allow the remaining use to also be simplified.
7093//
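// For example, if the pointer is (shl (add x, 16), 2), it is rewritten as
// (add (shl x, 2), 64), and the 64 can then be folded into the memory
// instruction's immediate offset when the addressing mode allows it.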
7094SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7095 unsigned AddrSpace,
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007096 EVT MemVT,
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007097 DAGCombinerInfo &DCI) const {
7098 SDValue N0 = N->getOperand(0);
7099 SDValue N1 = N->getOperand(1);
7100
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007101 // We only do this to handle cases where it's profitable when there are
7102 // multiple uses of the add, so defer to the standard combine.
Matt Arsenaultc8903122017-11-14 23:46:42 +00007103 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7104 N0->hasOneUse())
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007105 return SDValue();
7106
7107 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7108 if (!CN1)
7109 return SDValue();
7110
7111 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7112 if (!CAdd)
7113 return SDValue();
7114
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007115 // If the resulting offset is too large, we can't fold it into the addressing
7116 // mode offset.
7117 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007118 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7119
7120 AddrMode AM;
7121 AM.HasBaseReg = true;
7122 AM.BaseOffs = Offset.getSExtValue();
7123 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007124 return SDValue();
7125
7126 SelectionDAG &DAG = DCI.DAG;
7127 SDLoc SL(N);
7128 EVT VT = N->getValueType(0);
7129
7130 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007131 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007132
Matt Arsenaulte5e0c742017-11-13 05:33:35 +00007133 SDNodeFlags Flags;
7134 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7135 (N0.getOpcode() == ISD::OR ||
7136 N0->getFlags().hasNoUnsignedWrap()));
7137
7138 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007139}
7140
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007141SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7142 DAGCombinerInfo &DCI) const {
7143 SDValue Ptr = N->getBasePtr();
7144 SelectionDAG &DAG = DCI.DAG;
7145 SDLoc SL(N);
7146
7147 // TODO: We could also do this for multiplies.
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007148 if (Ptr.getOpcode() == ISD::SHL) {
7149 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7150 N->getMemoryVT(), DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007151 if (NewPtr) {
7152 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7153
7154 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7155 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7156 }
7157 }
7158
7159 return SDValue();
7160}
7161
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007162static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7163 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7164 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7165 (Opc == ISD::XOR && Val == 0);
7166}
7167
7168// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
7169// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7170// integer combine opportunities since most 64-bit operations are decomposed
7171// this way. TODO: We won't want this for SALU especially if it is an inline
7172// immediate.
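// For example, (and i64:x, 0xffffffff) splits so the low half becomes
// (and lo_32(x), 0xffffffff) and the high half (and hi_32(x), 0), which fold
// to lo_32(x) and 0 without materializing a 64-bit mask.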
7173SDValue SITargetLowering::splitBinaryBitConstantOp(
7174 DAGCombinerInfo &DCI,
7175 const SDLoc &SL,
7176 unsigned Opc, SDValue LHS,
7177 const ConstantSDNode *CRHS) const {
7178 uint64_t Val = CRHS->getZExtValue();
7179 uint32_t ValLo = Lo_32(Val);
7180 uint32_t ValHi = Hi_32(Val);
7181 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7182
7183 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7184 bitOpWithConstantIsReducible(Opc, ValHi)) ||
7185 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7186 // If we need to materialize a 64-bit immediate, it will be split up later
7187 // anyway. Avoid creating the harder to understand 64-bit immediate
7188 // materialization.
7189 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7190 }
7191
7192 return SDValue();
7193}
7194
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007195// Returns true if the argument is a boolean value which is not serialized into
7196// memory or an argument and does not require v_cndmask_b32 to be deserialized.
7197static bool isBoolSGPR(SDValue V) {
7198 if (V.getValueType() != MVT::i1)
7199 return false;
7200 switch (V.getOpcode()) {
7201 default: break;
7202 case ISD::SETCC:
7203 case ISD::AND:
7204 case ISD::OR:
7205 case ISD::XOR:
7206 case AMDGPUISD::FP_CLASS:
7207 return true;
7208 }
7209 return false;
7210}
7211
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007212// If a constant has all zeroes or all ones within each byte return it.
7213// Otherwise return 0.
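// For example, 0x00ff00ff is returned as is, while 0x00f000ff returns 0
// because byte 2 (0xf0) is neither all zeroes nor all ones.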
7214static uint32_t getConstantPermuteMask(uint32_t C) {
7215 // 0xff for any zero byte in the mask
7216 uint32_t ZeroByteMask = 0;
7217 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7218 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7219 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7220 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7221 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7222 if ((NonZeroByteMask & C) != NonZeroByteMask)
7223 return 0; // Partial bytes selected.
7224 return C;
7225}
7226
7227// Check if a node selects whole bytes from its operand 0 starting at a byte
7228// boundary while masking the rest. Returns the select mask as used by v_perm_b32,
7229// or ~0 if it did not succeed.
7230// Note byte select encoding:
7231// value 0-3 selects corresponding source byte;
7232// value 0xc selects zero;
7233// value 0xff selects 0xff.
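// For example, (and x, 0x0000ffff) keeps bytes 0-1 of x and zeroes bytes 2-3,
// which corresponds to the select mask 0x0c0c0100.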
7234static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7235 assert(V.getValueSizeInBits() == 32);
7236
7237 if (V.getNumOperands() != 2)
7238 return ~0;
7239
7240 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7241 if (!N1)
7242 return ~0;
7243
7244 uint32_t C = N1->getZExtValue();
7245
7246 switch (V.getOpcode()) {
7247 default:
7248 break;
7249 case ISD::AND:
7250 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7251 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7252 }
7253 break;
7254
7255 case ISD::OR:
7256 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7257 return (0x03020100 & ~ConstMask) | ConstMask;
7258 }
7259 break;
7260
7261 case ISD::SHL:
7262 if (C % 8)
7263 return ~0;
7264
7265 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7266
7267 case ISD::SRL:
7268 if (C % 8)
7269 return ~0;
7270
7271 return uint32_t(0x0c0c0c0c03020100ull >> C);
7272 }
7273
7274 return ~0;
7275}
7276
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007277SDValue SITargetLowering::performAndCombine(SDNode *N,
7278 DAGCombinerInfo &DCI) const {
7279 if (DCI.isBeforeLegalize())
7280 return SDValue();
7281
7282 SelectionDAG &DAG = DCI.DAG;
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007283 EVT VT = N->getValueType(0);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007284 SDValue LHS = N->getOperand(0);
7285 SDValue RHS = N->getOperand(1);
7286
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007287
Stanislav Mekhanoshin53a21292017-05-23 19:54:48 +00007288 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7289 if (VT == MVT::i64 && CRHS) {
7290 if (SDValue Split
7291 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7292 return Split;
7293 }
7294
7295 if (CRHS && VT == MVT::i32) {
7296 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7297 // nb = number of trailing zeroes in mask
7298 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7299 // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
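// For example, (and (srl x, 8), 0xff00) becomes (shl (bfe x, 16, 8), 8):
// bits 16-23 of x are extracted and then shifted back up to bits 8-15.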
7300 uint64_t Mask = CRHS->getZExtValue();
7301 unsigned Bits = countPopulation(Mask);
7302 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7303 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7304 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7305 unsigned Shift = CShift->getZExtValue();
7306 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7307 unsigned Offset = NB + Shift;
7308 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7309 SDLoc SL(N);
7310 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7311 LHS->getOperand(0),
7312 DAG.getConstant(Offset, SL, MVT::i32),
7313 DAG.getConstant(Bits, SL, MVT::i32));
7314 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7315 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7316 DAG.getValueType(NarrowVT));
7317 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7318 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7319 return Shl;
7320 }
7321 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007322 }
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007323
7324 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7325 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7326 isa<ConstantSDNode>(LHS.getOperand(2))) {
7327 uint32_t Sel = getConstantPermuteMask(Mask);
7328 if (!Sel)
7329 return SDValue();
7330
7331 // Select 0xc for all zero bytes
7332 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7333 SDLoc DL(N);
7334 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7335 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7336 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007337 }
7338
7339 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7340 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7341 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007342 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7343 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7344
7345 SDValue X = LHS.getOperand(0);
7346 SDValue Y = RHS.getOperand(0);
7347 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7348 return SDValue();
7349
7350 if (LCC == ISD::SETO) {
7351 if (X != LHS.getOperand(1))
7352 return SDValue();
7353
7354 if (RCC == ISD::SETUNE) {
7355 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7356 if (!C1 || !C1->isInfinity() || C1->isNegative())
7357 return SDValue();
7358
7359 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7360 SIInstrFlags::N_SUBNORMAL |
7361 SIInstrFlags::N_ZERO |
7362 SIInstrFlags::P_ZERO |
7363 SIInstrFlags::P_SUBNORMAL |
7364 SIInstrFlags::P_NORMAL;
7365
7366 static_assert(((~(SIInstrFlags::S_NAN |
7367 SIInstrFlags::Q_NAN |
7368 SIInstrFlags::N_INFINITY |
7369 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7370 "mask not equal");
7371
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007372 SDLoc DL(N);
7373 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7374 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007375 }
7376 }
7377 }
7378
Matt Arsenault3dcf4ce2018-08-10 18:58:56 +00007379 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7380 std::swap(LHS, RHS);
7381
7382 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7383 RHS.hasOneUse()) {
7384 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7385 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(s_nan | q_nan)
7386 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (s_nan | q_nan)
7387 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7388 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7389 (RHS.getOperand(0) == LHS.getOperand(0) &&
7390 LHS.getOperand(0) == LHS.getOperand(1))) {
7391 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7392 unsigned NewMask = LCC == ISD::SETO ?
7393 Mask->getZExtValue() & ~OrdMask :
7394 Mask->getZExtValue() & OrdMask;
7395
7396 SDLoc DL(N);
7397 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7398 DAG.getConstant(NewMask, DL, MVT::i32));
7399 }
7400 }
7401
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007402 if (VT == MVT::i32 &&
7403 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7404 // and x, (sext cc from i1) => select cc, x, 0
7405 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7406 std::swap(LHS, RHS);
7407 if (isBoolSGPR(RHS.getOperand(0)))
7408 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7409 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7410 }
7411
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007412 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7413 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7414 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7415 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7416 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7417 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7418 if (LHSMask != ~0u && RHSMask != ~0u) {
7419 // Canonicalize the expression in an attempt to have fewer unique masks
7420 // and therefore fewer registers used to hold the masks.
7421 if (LHSMask > RHSMask) {
7422 std::swap(LHSMask, RHSMask);
7423 std::swap(LHS, RHS);
7424 }
7425
7426 // Select 0xc for each lane used from the source operand. Zero has the 0xc mask
7427 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
7428 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7429 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7430
7431 // Check if we need to combine values from two sources within a byte.
7432 if (!(LHSUsedLanes & RHSUsedLanes) &&
7433 // If we select high and lower word keep it for SDWA.
7434 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7435 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7436 // Each byte in each mask is either a selector mask 0-3, or has higher
7437 // bits set in either of the masks, which can be 0xff for 0xff or 0x0c for
7438 // zero. If 0x0c appears in either mask, the result byte must remain 0x0c.
7439 // Otherwise the mask which is not 0xff wins. Anding both masks gives the
7440 // correct result except for those 0x0c bytes, which are fixed up below.
7441 uint32_t Mask = LHSMask & RHSMask;
7442 for (unsigned I = 0; I < 32; I += 8) {
7443 uint32_t ByteSel = 0xff << I;
7444 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7445 Mask &= (0x0c << I) & 0xffffffff;
7446 }
7447
7448 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7449 // or 0x0c.
7450 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7451 SDLoc DL(N);
7452
7453 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7454 LHS.getOperand(0), RHS.getOperand(0),
7455 DAG.getConstant(Sel, DL, MVT::i32));
7456 }
7457 }
7458 }
7459
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007460 return SDValue();
7461}
7462
Matt Arsenaultf2290332015-01-06 23:00:39 +00007463SDValue SITargetLowering::performOrCombine(SDNode *N,
7464 DAGCombinerInfo &DCI) const {
7465 SelectionDAG &DAG = DCI.DAG;
7466 SDValue LHS = N->getOperand(0);
7467 SDValue RHS = N->getOperand(1);
7468
Matt Arsenault3b082382016-04-12 18:24:38 +00007469 EVT VT = N->getValueType(0);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007470 if (VT == MVT::i1) {
7471 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7472 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7473 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7474 SDValue Src = LHS.getOperand(0);
7475 if (Src != RHS.getOperand(0))
7476 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007477
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007478 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7479 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7480 if (!CLHS || !CRHS)
7481 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007482
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007483 // Only 10 bits are used.
7484 static const uint32_t MaxMask = 0x3ff;
Matt Arsenault3b082382016-04-12 18:24:38 +00007485
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007486 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7487 SDLoc DL(N);
7488 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7489 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7490 }
Matt Arsenault3b082382016-04-12 18:24:38 +00007491
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007492 return SDValue();
7493 }
7494
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007495 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7496 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7497 LHS.getOpcode() == AMDGPUISD::PERM &&
7498 isa<ConstantSDNode>(LHS.getOperand(2))) {
7499 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7500 if (!Sel)
7501 return SDValue();
7502
7503 Sel |= LHS.getConstantOperandVal(2);
7504 SDLoc DL(N);
7505 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7506 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7507 }
7508
7509 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7510 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7511 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7512 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7513 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7514 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7515 if (LHSMask != ~0u && RHSMask != ~0u) {
7516 // Canonicalize the expression in an attempt to have fewer unique masks
7517 // and therefore fewer registers used to hold the masks.
7518 if (LHSMask > RHSMask) {
7519 std::swap(LHSMask, RHSMask);
7520 std::swap(LHS, RHS);
7521 }
7522
7523 // Select 0xc for each lane used from the source operand. Zero has the 0xc mask
7524 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
7525 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7526 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7527
7528 // Check if we need to combine values from two sources within a byte.
7529 if (!(LHSUsedLanes & RHSUsedLanes) &&
7530 // If we select high and lower word keep it for SDWA.
7531 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7532 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7533 // Kill zero bytes selected by other mask. Zero value is 0xc.
7534 LHSMask &= ~RHSUsedLanes;
7535 RHSMask &= ~LHSUsedLanes;
7536 // Add 4 to each active LHS lane
7537 LHSMask |= LHSUsedLanes & 0x04040404;
7538 // Combine masks
7539 uint32_t Sel = LHSMask | RHSMask;
7540 SDLoc DL(N);
7541
7542 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7543 LHS.getOperand(0), RHS.getOperand(0),
7544 DAG.getConstant(Sel, DL, MVT::i32));
7545 }
7546 }
7547 }
7548
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007549 if (VT != MVT::i64)
7550 return SDValue();
7551
7552 // TODO: This could be a generic combine with a predicate for extracting the
7553 // high half of an integer being free.
7554
7555 // (or i64:x, (zero_extend i32:y)) ->
7556 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7557 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7558 RHS.getOpcode() != ISD::ZERO_EXTEND)
7559 std::swap(LHS, RHS);
7560
7561 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7562 SDValue ExtSrc = RHS.getOperand(0);
7563 EVT SrcVT = ExtSrc.getValueType();
7564 if (SrcVT == MVT::i32) {
7565 SDLoc SL(N);
7566 SDValue LowLHS, HiBits;
7567 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7568 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7569
7570 DCI.AddToWorklist(LowOr.getNode());
7571 DCI.AddToWorklist(HiBits.getNode());
7572
7573 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7574 LowOr, HiBits);
7575 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
Matt Arsenault3b082382016-04-12 18:24:38 +00007576 }
7577 }
7578
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007579 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7580 if (CRHS) {
7581 if (SDValue Split
7582 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7583 return Split;
7584 }
Matt Arsenaultf2290332015-01-06 23:00:39 +00007585
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007586 return SDValue();
7587}
Matt Arsenaultf2290332015-01-06 23:00:39 +00007588
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007589SDValue SITargetLowering::performXorCombine(SDNode *N,
7590 DAGCombinerInfo &DCI) const {
7591 EVT VT = N->getValueType(0);
7592 if (VT != MVT::i64)
7593 return SDValue();
Matt Arsenaultf2290332015-01-06 23:00:39 +00007594
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007595 SDValue LHS = N->getOperand(0);
7596 SDValue RHS = N->getOperand(1);
7597
7598 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7599 if (CRHS) {
7600 if (SDValue Split
7601 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7602 return Split;
Matt Arsenaultf2290332015-01-06 23:00:39 +00007603 }
7604
7605 return SDValue();
7606}
7607
Matt Arsenault5cf42712017-04-06 20:58:30 +00007608// Instructions that will be lowered with a final instruction that zeros the
7609// high result bits.
7610// XXX - probably only need to list legal operations.
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007611static bool fp16SrcZerosHighBits(unsigned Opc) {
7612 switch (Opc) {
Matt Arsenault5cf42712017-04-06 20:58:30 +00007613 case ISD::FADD:
7614 case ISD::FSUB:
7615 case ISD::FMUL:
7616 case ISD::FDIV:
7617 case ISD::FREM:
7618 case ISD::FMA:
7619 case ISD::FMAD:
7620 case ISD::FCANONICALIZE:
7621 case ISD::FP_ROUND:
7622 case ISD::UINT_TO_FP:
7623 case ISD::SINT_TO_FP:
7624 case ISD::FABS:
7625 // Fabs is lowered to a bit operation, but it's an and which will clear the
7626 // high bits anyway.
7627 case ISD::FSQRT:
7628 case ISD::FSIN:
7629 case ISD::FCOS:
7630 case ISD::FPOWI:
7631 case ISD::FPOW:
7632 case ISD::FLOG:
7633 case ISD::FLOG2:
7634 case ISD::FLOG10:
7635 case ISD::FEXP:
7636 case ISD::FEXP2:
7637 case ISD::FCEIL:
7638 case ISD::FTRUNC:
7639 case ISD::FRINT:
7640 case ISD::FNEARBYINT:
7641 case ISD::FROUND:
7642 case ISD::FFLOOR:
7643 case ISD::FMINNUM:
7644 case ISD::FMAXNUM:
7645 case AMDGPUISD::FRACT:
7646 case AMDGPUISD::CLAMP:
7647 case AMDGPUISD::COS_HW:
7648 case AMDGPUISD::SIN_HW:
7649 case AMDGPUISD::FMIN3:
7650 case AMDGPUISD::FMAX3:
7651 case AMDGPUISD::FMED3:
7652 case AMDGPUISD::FMAD_FTZ:
7653 case AMDGPUISD::RCP:
7654 case AMDGPUISD::RSQ:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007655 case AMDGPUISD::RCP_IFLAG:
Matt Arsenault5cf42712017-04-06 20:58:30 +00007656 case AMDGPUISD::LDEXP:
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007657 return true;
Matt Arsenault5cf42712017-04-06 20:58:30 +00007658 default:
7659 // fcopysign, select and others may be lowered to 32-bit bit operations
7660 // which don't zero the high bits.
7661 return false;
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007662 }
7663}
7664
7665SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7666 DAGCombinerInfo &DCI) const {
7667 if (!Subtarget->has16BitInsts() ||
7668 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7669 return SDValue();
7670
7671 EVT VT = N->getValueType(0);
7672 if (VT != MVT::i32)
7673 return SDValue();
7674
7675 SDValue Src = N->getOperand(0);
7676 if (Src.getValueType() != MVT::i16)
7677 return SDValue();
7678
7679 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7680 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7681 if (Src.getOpcode() == ISD::BITCAST) {
7682 SDValue BCSrc = Src.getOperand(0);
7683 if (BCSrc.getValueType() == MVT::f16 &&
7684 fp16SrcZerosHighBits(BCSrc.getOpcode()))
7685 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7686 }
7687
7688 return SDValue();
7689}
7690
Matt Arsenaultf2290332015-01-06 23:00:39 +00007691SDValue SITargetLowering::performClassCombine(SDNode *N,
7692 DAGCombinerInfo &DCI) const {
7693 SelectionDAG &DAG = DCI.DAG;
7694 SDValue Mask = N->getOperand(1);
7695
7696 // fp_class x, 0 -> false
7697 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7698 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007699 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00007700 }
7701
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00007702 if (N->getOperand(0).isUndef())
7703 return DAG.getUNDEF(MVT::i1);
7704
Matt Arsenaultf2290332015-01-06 23:00:39 +00007705 return SDValue();
7706}
7707
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007708SDValue SITargetLowering::performRcpCombine(SDNode *N,
7709 DAGCombinerInfo &DCI) const {
7710 EVT VT = N->getValueType(0);
7711 SDValue N0 = N->getOperand(0);
7712
7713 if (N0.isUndef())
7714 return N0;
7715
7716 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7717 N0.getOpcode() == ISD::SINT_TO_FP)) {
7718 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7719 N->getFlags());
7720 }
7721
7722 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7723}
7724
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007725bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7726 unsigned MaxDepth) const {
7727 unsigned Opcode = Op.getOpcode();
7728 if (Opcode == ISD::FCANONICALIZE)
7729 return true;
7730
7731 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7732 auto F = CFP->getValueAPF();
7733 if (F.isNaN() && F.isSignaling())
7734 return false;
7735 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7736 }
7737
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007738 // If source is a result of another standard FP operation it is already in
7739 // canonical form.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007740 if (MaxDepth == 0)
7741 return false;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007742
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007743 switch (Opcode) {
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007744 // These will flush denorms if required.
7745 case ISD::FADD:
7746 case ISD::FSUB:
7747 case ISD::FMUL:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007748 case ISD::FCEIL:
7749 case ISD::FFLOOR:
7750 case ISD::FMA:
7751 case ISD::FMAD:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007752 case ISD::FSQRT:
7753 case ISD::FDIV:
7754 case ISD::FREM:
Matt Arsenaultce6d61f2018-08-06 21:51:52 +00007755 case ISD::FP_ROUND:
7756 case ISD::FP_EXTEND:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007757 case AMDGPUISD::FMUL_LEGACY:
7758 case AMDGPUISD::FMAD_FTZ:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007759 case AMDGPUISD::RCP:
7760 case AMDGPUISD::RSQ:
7761 case AMDGPUISD::RSQ_CLAMP:
7762 case AMDGPUISD::RCP_LEGACY:
7763 case AMDGPUISD::RSQ_LEGACY:
7764 case AMDGPUISD::RCP_IFLAG:
7765 case AMDGPUISD::TRIG_PREOP:
7766 case AMDGPUISD::DIV_SCALE:
7767 case AMDGPUISD::DIV_FMAS:
7768 case AMDGPUISD::DIV_FIXUP:
7769 case AMDGPUISD::FRACT:
7770 case AMDGPUISD::LDEXP:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007771 case AMDGPUISD::CVT_PKRTZ_F16_F32:
Matt Arsenault940e6072018-08-10 19:20:17 +00007772 case AMDGPUISD::CVT_F32_UBYTE0:
7773 case AMDGPUISD::CVT_F32_UBYTE1:
7774 case AMDGPUISD::CVT_F32_UBYTE2:
7775 case AMDGPUISD::CVT_F32_UBYTE3:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007776 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007777
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007778 // It can/will be lowered or combined as a bit operation.
7779 // Need to check their input recursively to handle.
7780 case ISD::FNEG:
7781 case ISD::FABS:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007782 case ISD::FCOPYSIGN:
7783 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007784
7785 case ISD::FSIN:
7786 case ISD::FCOS:
7787 case ISD::FSINCOS:
7788 return Op.getValueType().getScalarType() != MVT::f16;
7789
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007790 case ISD::FMINNUM:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007791 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00007792 case ISD::FMINNUM_IEEE:
7793 case ISD::FMAXNUM_IEEE:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00007794 case AMDGPUISD::CLAMP:
7795 case AMDGPUISD::FMED3:
7796 case AMDGPUISD::FMAX3:
7797 case AMDGPUISD::FMIN3: {
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007798 // FIXME: Shouldn't treat the generic operations differently based on these.
Matt Arsenault687ec752018-10-22 16:27:27 +00007799 // However, we aren't really required to flush the result from
7800 // minnum/maxnum.
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007801
Matt Arsenault687ec752018-10-22 16:27:27 +00007802 // snans will be quieted, so we only need to worry about denormals.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007803 if (Subtarget->supportsMinMaxDenormModes() ||
Matt Arsenault687ec752018-10-22 16:27:27 +00007804 denormalsEnabledForType(Op.getValueType()))
7805 return true;
7806
7807 // Flushing may be required.
7808 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7809 // targets we need to check their inputs recursively.
7810
7811 // FIXME: Does this apply with clamp? It's implemented with max.
7812 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
7813 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
7814 return false;
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007815 }
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007816
Matt Arsenault687ec752018-10-22 16:27:27 +00007817 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007818 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007819 case ISD::SELECT: {
7820 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7821 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007822 }
Matt Arsenaulte94ee832018-08-06 22:45:51 +00007823 case ISD::BUILD_VECTOR: {
7824 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7825 SDValue SrcOp = Op.getOperand(i);
7826 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7827 return false;
7828 }
7829
7830 return true;
7831 }
7832 case ISD::EXTRACT_VECTOR_ELT:
7833 case ISD::EXTRACT_SUBVECTOR: {
7834 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7835 }
7836 case ISD::INSERT_VECTOR_ELT: {
7837 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7838 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7839 }
7840 case ISD::UNDEF:
7841 // Could be anything.
7842 return false;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007843
Matt Arsenault687ec752018-10-22 16:27:27 +00007844 case ISD::BITCAST: {
7845 // Hack round the mess we make when legalizing extract_vector_elt
7846 SDValue Src = Op.getOperand(0);
7847 if (Src.getValueType() == MVT::i16 &&
7848 Src.getOpcode() == ISD::TRUNCATE) {
7849 SDValue TruncSrc = Src.getOperand(0);
7850 if (TruncSrc.getValueType() == MVT::i32 &&
7851 TruncSrc.getOpcode() == ISD::BITCAST &&
7852 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
7853 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
7854 }
7855 }
7856
7857 return false;
7858 }
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007859 case ISD::INTRINSIC_WO_CHAIN: {
7860 unsigned IntrinsicID
7861 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7862 // TODO: Handle more intrinsics
7863 switch (IntrinsicID) {
7864 case Intrinsic::amdgcn_cvt_pkrtz:
Matt Arsenault940e6072018-08-10 19:20:17 +00007865 case Intrinsic::amdgcn_cubeid:
7866 case Intrinsic::amdgcn_frexp_mant:
7867 case Intrinsic::amdgcn_fdot2:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007868 return true;
7869 default:
7870 break;
7871 }
Matt Arsenault5bb9d792018-08-10 17:57:12 +00007872
7873 LLVM_FALLTHROUGH;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00007874 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00007875 default:
7876 return denormalsEnabledForType(Op.getValueType()) &&
7877 DAG.isKnownNeverSNaN(Op);
7878 }
7879
7880 llvm_unreachable("invalid operation");
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007881}
7882
Matt Arsenault9cd90712016-04-14 01:42:16 +00007883// Constant fold canonicalize.
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007884SDValue SITargetLowering::getCanonicalConstantFP(
7885 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7886 // Flush denormals to 0 if not enabled.
7887 if (C.isDenormal() && !denormalsEnabledForType(VT))
7888 return DAG.getConstantFP(0.0, SL, VT);
7889
7890 if (C.isNaN()) {
7891 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7892 if (C.isSignaling()) {
7893 // Quiet a signaling NaN.
7894 // FIXME: Is this supposed to preserve payload bits?
7895 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7896 }
7897
7898 // Make sure it is the canonical NaN bitpattern.
7899 //
7900 // TODO: Can we use -1 as the canonical NaN value since it's an inline
7901 // immediate?
7902 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7903 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7904 }
7905
7906 // Already canonical.
7907 return DAG.getConstantFP(C, SL, VT);
7908}
7909
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007910static bool vectorEltWillFoldAway(SDValue Op) {
7911 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7912}
7913
Matt Arsenault9cd90712016-04-14 01:42:16 +00007914SDValue SITargetLowering::performFCanonicalizeCombine(
7915 SDNode *N,
7916 DAGCombinerInfo &DCI) const {
Matt Arsenault9cd90712016-04-14 01:42:16 +00007917 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007918 SDValue N0 = N->getOperand(0);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007919 EVT VT = N->getValueType(0);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00007920
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007921 // fcanonicalize undef -> qnan
7922 if (N0.isUndef()) {
Matt Arsenault4aec86d2018-07-31 13:34:31 +00007923 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7924 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7925 }
7926
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007927 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
Matt Arsenault9cd90712016-04-14 01:42:16 +00007928 EVT VT = N->getValueType(0);
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007929 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
Matt Arsenault9cd90712016-04-14 01:42:16 +00007930 }
7931
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007932 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7933 // (fcanonicalize k)
7934 //
7935 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7936
7937 // TODO: This could be better with wider vectors that will be split to v2f16,
7938 // and to consider uses since there aren't that many packed operations.
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007939 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7940 isTypeLegal(MVT::v2f16)) {
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007941 SDLoc SL(N);
7942 SDValue NewElts[2];
7943 SDValue Lo = N0.getOperand(0);
7944 SDValue Hi = N0.getOperand(1);
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007945 EVT EltVT = Lo.getValueType();
7946
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007947 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
7948 for (unsigned I = 0; I != 2; ++I) {
7949 SDValue Op = N0.getOperand(I);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007950 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7951 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7952 CFP->getValueAPF());
7953 } else if (Op.isUndef()) {
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007954 // Handled below based on what the other operand is.
7955 NewElts[I] = Op;
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007956 } else {
7957 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7958 }
7959 }
7960
Matt Arsenaultb5acec12018-08-12 08:42:54 +00007961 // If one half is undef and the other is constant, prefer a splat vector rather
7962 // than the normal qNaN. If it's a register, prefer 0.0 since that's
7963 // cheaper to use and may be free with a packed operation.
7964 if (NewElts[0].isUndef()) {
7965 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
7966 ? NewElts[1]
7967 : DAG.getConstantFP(0.0f, SL, EltVT);
7968 }
7969
7970 if (NewElts[1].isUndef()) {
7971 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7972 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7973 }
7974
Matt Arsenaulta29e7622018-08-06 22:30:44 +00007975 return DAG.getBuildVector(VT, SL, NewElts);
7976 }
7977 }
7978
Matt Arsenault687ec752018-10-22 16:27:27 +00007979 unsigned SrcOpc = N0.getOpcode();
7980
7981 // If it's free to do so, push canonicalizes further up the source, which may
7982 // find a canonical source.
7983 //
7984 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
7985 // sNaNs.
7986 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
7987 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
7988 if (CRHS && N0.hasOneUse()) {
7989 SDLoc SL(N);
7990 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
7991 N0.getOperand(0));
7992 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
7993 DCI.AddToWorklist(Canon0.getNode());
7994
7995 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
7996 }
7997 }
7998
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00007999 return isCanonicalized(DAG, N0) ? N0 : SDValue();
Matt Arsenault9cd90712016-04-14 01:42:16 +00008000}
8001
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008002static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8003 switch (Opc) {
8004 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008005 case ISD::FMAXNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008006 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008007 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008008 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008009 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008010 return AMDGPUISD::UMAX3;
8011 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008012 case ISD::FMINNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008013 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008014 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008015 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008016 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008017 return AMDGPUISD::UMIN3;
8018 default:
8019 llvm_unreachable("Not a min/max opcode");
8020 }
8021}
8022
Matt Arsenault10268f92017-02-27 22:40:39 +00008023SDValue SITargetLowering::performIntMed3ImmCombine(
8024 SelectionDAG &DAG, const SDLoc &SL,
8025 SDValue Op0, SDValue Op1, bool Signed) const {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008026 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8027 if (!K1)
8028 return SDValue();
8029
8030 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8031 if (!K0)
8032 return SDValue();
8033
Matt Arsenaultf639c322016-01-28 20:53:42 +00008034 if (Signed) {
8035 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8036 return SDValue();
8037 } else {
8038 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8039 return SDValue();
8040 }
8041
8042 EVT VT = K0->getValueType(0);
Matt Arsenault10268f92017-02-27 22:40:39 +00008043 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8044 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8045 return DAG.getNode(Med3Opc, SL, VT,
8046 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8047 }
Tom Stellard115a6152016-11-10 16:02:37 +00008048
Matt Arsenault10268f92017-02-27 22:40:39 +00008049 // If there isn't a 16-bit med3 operation, convert to 32-bit.
Tom Stellard115a6152016-11-10 16:02:37 +00008050 MVT NVT = MVT::i32;
8051 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8052
Matt Arsenault10268f92017-02-27 22:40:39 +00008053 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8054 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8055 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
Tom Stellard115a6152016-11-10 16:02:37 +00008056
Matt Arsenault10268f92017-02-27 22:40:39 +00008057 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8058 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008059}
8060
Matt Arsenault6b114d22017-08-30 01:20:17 +00008061static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8062 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8063 return C;
8064
8065 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8066 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8067 return C;
8068 }
8069
8070 return nullptr;
8071}
8072
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008073SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8074 const SDLoc &SL,
8075 SDValue Op0,
8076 SDValue Op1) const {
Matt Arsenault6b114d22017-08-30 01:20:17 +00008077 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008078 if (!K1)
8079 return SDValue();
8080
Matt Arsenault6b114d22017-08-30 01:20:17 +00008081 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
Matt Arsenaultf639c322016-01-28 20:53:42 +00008082 if (!K0)
8083 return SDValue();
8084
8085 // Ordered >= (although NaN inputs should have folded away by now).
8086 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8087 if (Cmp == APFloat::cmpGreaterThan)
8088 return SDValue();
8089
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008090 // TODO: Check IEEE bit enabled?
Matt Arsenault6b114d22017-08-30 01:20:17 +00008091 EVT VT = Op0.getValueType();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008092 if (Subtarget->enableDX10Clamp()) {
8093 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8094 // hardware fmed3 behavior converting to a min.
8095 // FIXME: Should this be allowing -0.0?
8096 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8097 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8098 }
8099
Matt Arsenault6b114d22017-08-30 01:20:17 +00008100 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8101 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8102 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8103 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8104 // then give the other result, which is different from med3 with a NaN
8105 // input.
8106 SDValue Var = Op0.getOperand(0);
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +00008107 if (!DAG.isKnownNeverSNaN(Var))
Matt Arsenault6b114d22017-08-30 01:20:17 +00008108 return SDValue();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008109
Matt Arsenaultebf46142018-09-18 02:34:54 +00008110 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8111
8112 if ((!K0->hasOneUse() ||
8113 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8114 (!K1->hasOneUse() ||
8115 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8116 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8117 Var, SDValue(K0, 0), SDValue(K1, 0));
8118 }
Matt Arsenault6b114d22017-08-30 01:20:17 +00008119 }
Matt Arsenaultf639c322016-01-28 20:53:42 +00008120
Matt Arsenault6b114d22017-08-30 01:20:17 +00008121 return SDValue();
Matt Arsenaultf639c322016-01-28 20:53:42 +00008122}
8123
8124SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8125 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008126 SelectionDAG &DAG = DCI.DAG;
8127
Matt Arsenault79a45db2017-02-22 23:53:37 +00008128 EVT VT = N->getValueType(0);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008129 unsigned Opc = N->getOpcode();
8130 SDValue Op0 = N->getOperand(0);
8131 SDValue Op1 = N->getOperand(1);
8132
8133 // Only do this if the inner op has one use since this will just increase
8134 // register pressure for no benefit.
8135
Matt Arsenault79a45db2017-02-22 23:53:37 +00008136
8137 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
Farhana Aleene80aeac2018-04-03 23:00:30 +00008138 !VT.isVector() && VT != MVT::f64 &&
Matt Arsenaultee324ff2017-05-17 19:25:06 +00008139 ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
Matt Arsenault5b39b342016-01-28 20:53:48 +00008140 // max(max(a, b), c) -> max3(a, b, c)
8141 // min(min(a, b), c) -> min3(a, b, c)
8142 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8143 SDLoc DL(N);
8144 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8145 DL,
8146 N->getValueType(0),
8147 Op0.getOperand(0),
8148 Op0.getOperand(1),
8149 Op1);
8150 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008151
Matt Arsenault5b39b342016-01-28 20:53:48 +00008152 // Try commuted.
8153 // max(a, max(b, c)) -> max3(a, b, c)
8154 // min(a, min(b, c)) -> min3(a, b, c)
8155 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8156 SDLoc DL(N);
8157 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8158 DL,
8159 N->getValueType(0),
8160 Op0,
8161 Op1.getOperand(0),
8162 Op1.getOperand(1));
8163 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008164 }
8165
Matt Arsenaultf639c322016-01-28 20:53:42 +00008166 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
8167 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8168 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8169 return Med3;
8170 }
8171
8172 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8173 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8174 return Med3;
8175 }
8176
8177 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
Matt Arsenault5b39b342016-01-28 20:53:48 +00008178 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008179 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
Matt Arsenault5b39b342016-01-28 20:53:48 +00008180 (Opc == AMDGPUISD::FMIN_LEGACY &&
8181 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenault79a45db2017-02-22 23:53:37 +00008182 (VT == MVT::f32 || VT == MVT::f64 ||
Matt Arsenault6b114d22017-08-30 01:20:17 +00008183 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8184 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008185 Op0.hasOneUse()) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008186 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8187 return Res;
8188 }
8189
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008190 return SDValue();
8191}
8192
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008193static bool isClampZeroToOne(SDValue A, SDValue B) {
8194 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8195 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8196 // FIXME: Should this be allowing -0.0?
8197 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8198 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8199 }
8200 }
8201
8202 return false;
8203}
8204
8205// FIXME: Should only worry about snans for version with chain.
8206SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8207 DAGCombinerInfo &DCI) const {
8208 EVT VT = N->getValueType(0);
8209 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8210 // NaNs. With a NaN input, the order of the operands may change the result.
8211
8212 SelectionDAG &DAG = DCI.DAG;
8213 SDLoc SL(N);
8214
8215 SDValue Src0 = N->getOperand(0);
8216 SDValue Src1 = N->getOperand(1);
8217 SDValue Src2 = N->getOperand(2);
8218
8219 if (isClampZeroToOne(Src0, Src1)) {
8220 // const_a, const_b, x -> clamp is safe in all cases including signaling
8221 // nans.
8222 // FIXME: Should this be allowing -0.0?
8223 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8224 }
8225
8226 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8227 // handling no dx10-clamp?
8228 if (Subtarget->enableDX10Clamp()) {
8229 // If NaNs are clamped to 0, we are free to reorder the inputs.
8230
8231 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8232 std::swap(Src0, Src1);
8233
8234 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8235 std::swap(Src1, Src2);
8236
8237 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8238 std::swap(Src0, Src1);
8239
8240 if (isClampZeroToOne(Src1, Src2))
8241 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8242 }
8243
8244 return SDValue();
8245}
8246
Matt Arsenault1f17c662017-02-22 00:27:34 +00008247SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8248 DAGCombinerInfo &DCI) const {
8249 SDValue Src0 = N->getOperand(0);
8250 SDValue Src1 = N->getOperand(1);
8251 if (Src0.isUndef() && Src1.isUndef())
8252 return DCI.DAG.getUNDEF(N->getValueType(0));
8253 return SDValue();
8254}
8255
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008256SDValue SITargetLowering::performExtractVectorEltCombine(
8257 SDNode *N, DAGCombinerInfo &DCI) const {
8258 SDValue Vec = N->getOperand(0);
Matt Arsenault8cbb4882017-09-20 21:01:24 +00008259 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008260
8261 EVT VecVT = Vec.getValueType();
8262 EVT EltVT = VecVT.getVectorElementType();
8263
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008264 if ((Vec.getOpcode() == ISD::FNEG ||
8265 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008266 SDLoc SL(N);
8267 EVT EltVT = N->getValueType(0);
8268 SDValue Idx = N->getOperand(1);
8269 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8270 Vec.getOperand(0), Idx);
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008271 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008272 }
8273
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008274 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8275 // =>
8276 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8277 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8278 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008279 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008280 SDLoc SL(N);
8281 EVT EltVT = N->getValueType(0);
8282 SDValue Idx = N->getOperand(1);
8283 unsigned Opc = Vec.getOpcode();
8284
8285 switch(Opc) {
8286 default:
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008287 break;
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008288 // TODO: Support other binary operations.
8289 case ISD::FADD:
Matt Arsenaulta8160732018-08-15 21:34:06 +00008290 case ISD::FSUB:
8291 case ISD::FMUL:
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008292 case ISD::ADD:
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008293 case ISD::UMIN:
8294 case ISD::UMAX:
8295 case ISD::SMIN:
8296 case ISD::SMAX:
8297 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008298 case ISD::FMINNUM:
8299 case ISD::FMAXNUM_IEEE:
8300 case ISD::FMINNUM_IEEE: {
Matt Arsenaulta8160732018-08-15 21:34:06 +00008301 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8302 Vec.getOperand(0), Idx);
8303 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8304 Vec.getOperand(1), Idx);
8305
8306 DCI.AddToWorklist(Elt0.getNode());
8307 DCI.AddToWorklist(Elt1.getNode());
8308 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8309 }
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008310 }
8311 }
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008312
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008313 unsigned VecSize = VecVT.getSizeInBits();
8314 unsigned EltSize = EltVT.getSizeInBits();
8315
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008316 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8317  // This eliminates a non-constant index and the subsequent movrel or scratch access.
8318  // Sub-dword element vectors of 2 dwords or smaller have a better implementation.
8319  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8320  // instructions.
8321 if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8322 !isa<ConstantSDNode>(N->getOperand(1))) {
8323 SDLoc SL(N);
8324 SDValue Idx = N->getOperand(1);
8325 EVT IdxVT = Idx.getValueType();
8326 SDValue V;
8327 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8328 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8329 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8330 if (I == 0)
8331 V = Elt;
8332 else
8333 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8334 }
8335 return V;
8336 }
8337
8338 if (!DCI.isBeforeLegalize())
8339 return SDValue();
8340
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008341 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8342 // elements. This exposes more load reduction opportunities by replacing
8343 // multiple small extract_vector_elements with a single 32-bit extract.
8344 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
Matt Arsenaultbf07a502018-08-31 15:39:52 +00008345 if (isa<MemSDNode>(Vec) &&
8346 EltSize <= 16 &&
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008347 EltVT.isByteSized() &&
8348 VecSize > 32 &&
8349 VecSize % 32 == 0 &&
8350 Idx) {
8351 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8352
8353 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8354 unsigned EltIdx = BitIndex / 32;
8355 unsigned LeftoverBitIdx = BitIndex % 32;
8356 SDLoc SL(N);
8357
8358 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8359 DCI.AddToWorklist(Cast.getNode());
8360
8361 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8362 DAG.getConstant(EltIdx, SL, MVT::i32));
8363 DCI.AddToWorklist(Elt.getNode());
8364 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8365 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8366 DCI.AddToWorklist(Srl.getNode());
8367
8368 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8369 DCI.AddToWorklist(Trunc.getNode());
8370 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8371 }
8372
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008373 return SDValue();
8374}
8375
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008376SDValue
8377SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8378 DAGCombinerInfo &DCI) const {
8379 SDValue Vec = N->getOperand(0);
8380 SDValue Idx = N->getOperand(2);
8381 EVT VecVT = Vec.getValueType();
8382 EVT EltVT = VecVT.getVectorElementType();
8383 unsigned VecSize = VecVT.getSizeInBits();
8384 unsigned EltSize = EltVT.getSizeInBits();
8385
8386 // INSERT_VECTOR_ELT (<n x e>, var-idx)
8387 // => BUILD_VECTOR n x select (e, const-idx)
8388  // This eliminates a non-constant index and the subsequent movrel or scratch access.
8389  // Sub-dword element vectors of 2 dwords or smaller have a better implementation.
8390  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8391  // instructions.
8392 if (isa<ConstantSDNode>(Idx) ||
8393 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8394 return SDValue();
8395
8396 SelectionDAG &DAG = DCI.DAG;
8397 SDLoc SL(N);
8398 SDValue Ins = N->getOperand(1);
8399 EVT IdxVT = Idx.getValueType();
8400
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008401 SmallVector<SDValue, 16> Ops;
8402 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8403 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8404 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8405 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8406 Ops.push_back(V);
8407 }
8408
8409 return DAG.getBuildVector(VecVT, SL, Ops);
8410}
8411
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008412unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8413 const SDNode *N0,
8414 const SDNode *N1) const {
8415 EVT VT = N0->getValueType(0);
8416
Matt Arsenault770ec862016-12-22 03:55:35 +00008417 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8418 // support denormals ever.
8419 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8420 (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8421 return ISD::FMAD;
8422
8423 const TargetOptions &Options = DAG.getTarget().Options;
Amara Emersond28f0cd42017-05-01 15:17:51 +00008424 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
Michael Berg7acc81b2018-05-04 18:48:20 +00008425 (N0->getFlags().hasAllowContract() &&
8426 N1->getFlags().hasAllowContract())) &&
Matt Arsenault770ec862016-12-22 03:55:35 +00008427 isFMAFasterThanFMulAndFAdd(VT)) {
8428 return ISD::FMA;
8429 }
8430
8431 return 0;
8432}
8433
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008434// For a reassociable opcode, perform:
8435// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
8436SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8437 SelectionDAG &DAG) const {
8438 EVT VT = N->getValueType(0);
8439 if (VT != MVT::i32 && VT != MVT::i64)
8440 return SDValue();
8441
8442 unsigned Opc = N->getOpcode();
8443 SDValue Op0 = N->getOperand(0);
8444 SDValue Op1 = N->getOperand(1);
8445
8446 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8447 return SDValue();
8448
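  // Canonicalize so Op0 is the uniform operand; the divergent operand, which
  // must itself be another Opc node, ends up in Op1.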
8449 if (Op0->isDivergent())
8450 std::swap(Op0, Op1);
8451
8452 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8453 return SDValue();
8454
8455 SDValue Op2 = Op1.getOperand(1);
8456 Op1 = Op1.getOperand(0);
8457 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
8458 return SDValue();
8459
8460 if (Op1->isDivergent())
8461 std::swap(Op1, Op2);
8462
8463 // If either operand is constant this will conflict with
8464 // DAGCombiner::ReassociateOps().
Stanislav Mekhanoshinda1628e2019-02-26 20:56:25 +00008465 if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
8466 DAG.isConstantIntBuildVectorOrConstantInt(Op1))
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008467 return SDValue();
8468
8469 SDLoc SL(N);
8470 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
8471 return DAG.getNode(Opc, SL, VT, Add1, Op2);
8472}
8473
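// Build a MAD_I64_I32 / MAD_U64_U32 node (32-bit multiply with a 64-bit
// accumulate) and truncate the 64-bit result to VT.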
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008474static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8475 EVT VT,
8476 SDValue N0, SDValue N1, SDValue N2,
8477 bool Signed) {
8478 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8479 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8480 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8481 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8482}
8483
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008484SDValue SITargetLowering::performAddCombine(SDNode *N,
8485 DAGCombinerInfo &DCI) const {
8486 SelectionDAG &DAG = DCI.DAG;
8487 EVT VT = N->getValueType(0);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008488 SDLoc SL(N);
8489 SDValue LHS = N->getOperand(0);
8490 SDValue RHS = N->getOperand(1);
8491
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008492 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8493 && Subtarget->hasMad64_32() &&
8494 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8495 VT.getScalarSizeInBits() <= 64) {
8496 if (LHS.getOpcode() != ISD::MUL)
8497 std::swap(LHS, RHS);
8498
8499 SDValue MulLHS = LHS.getOperand(0);
8500 SDValue MulRHS = LHS.getOperand(1);
8501 SDValue AddRHS = RHS;
8502
8503 // TODO: Maybe restrict if SGPR inputs.
8504 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8505 numBitsUnsigned(MulRHS, DAG) <= 32) {
8506 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8507 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8508 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8509 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8510 }
8511
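    // Otherwise, try the signed form when both factors are known to need fewer
    // than 32 signed bits.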
8512 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8513 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8514 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8515 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8516 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8517 }
8518
8519 return SDValue();
8520 }
8521
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008522 if (SDValue V = reassociateScalarOps(N, DAG)) {
8523 return V;
8524 }
8525
Farhana Aleen07e61232018-05-02 18:16:39 +00008526 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008527 return SDValue();
8528
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008529 // add x, zext (setcc) => addcarry x, 0, setcc
8530 // add x, sext (setcc) => subcarry x, 0, setcc
8531 unsigned Opc = LHS.getOpcode();
8532 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008533 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008534 std::swap(RHS, LHS);
8535
8536 Opc = RHS.getOpcode();
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008537 switch (Opc) {
8538 default: break;
8539 case ISD::ZERO_EXTEND:
8540 case ISD::SIGN_EXTEND:
8541 case ISD::ANY_EXTEND: {
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008542 auto Cond = RHS.getOperand(0);
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00008543 if (!isBoolSGPR(Cond))
Stanislav Mekhanoshin3ed38c62017-06-21 23:46:22 +00008544 break;
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008545 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8546 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8547 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8548 return DAG.getNode(Opc, SL, VTList, Args);
8549 }
8550 case ISD::ADDCARRY: {
8551 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8552 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8553 if (!C || C->getZExtValue() != 0) break;
8554 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8555 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8556 }
8557 }
8558 return SDValue();
8559}
8560
8561SDValue SITargetLowering::performSubCombine(SDNode *N,
8562 DAGCombinerInfo &DCI) const {
8563 SelectionDAG &DAG = DCI.DAG;
8564 EVT VT = N->getValueType(0);
8565
8566 if (VT != MVT::i32)
8567 return SDValue();
8568
8569 SDLoc SL(N);
8570 SDValue LHS = N->getOperand(0);
8571 SDValue RHS = N->getOperand(1);
8572
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008573 if (LHS.getOpcode() == ISD::SUBCARRY) {
8574 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8575 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
Stanislav Mekhanoshin42e229e2019-02-21 02:58:00 +00008576 if (!C || !C->isNullValue())
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008577 return SDValue();
8578 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8579 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8580 }
8581 return SDValue();
8582}
8583
8584SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8585 DAGCombinerInfo &DCI) const {
8586
8587 if (N->getValueType(0) != MVT::i32)
8588 return SDValue();
8589
8590 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8591 if (!C || C->getZExtValue() != 0)
8592 return SDValue();
8593
8594 SelectionDAG &DAG = DCI.DAG;
8595 SDValue LHS = N->getOperand(0);
8596
8597 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8598 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8599 unsigned LHSOpc = LHS.getOpcode();
8600 unsigned Opc = N->getOpcode();
8601 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8602 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8603 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8604 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008605 }
8606 return SDValue();
8607}
8608
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008609SDValue SITargetLowering::performFAddCombine(SDNode *N,
8610 DAGCombinerInfo &DCI) const {
8611 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8612 return SDValue();
8613
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008614 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault770ec862016-12-22 03:55:35 +00008615 EVT VT = N->getValueType(0);
Matt Arsenault770ec862016-12-22 03:55:35 +00008616
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008617 SDLoc SL(N);
8618 SDValue LHS = N->getOperand(0);
8619 SDValue RHS = N->getOperand(1);
8620
8621 // These should really be instruction patterns, but writing patterns with
8622  // source modifiers is a pain.
8623
8624 // fadd (fadd (a, a), b) -> mad 2.0, a, b
8625 if (LHS.getOpcode() == ISD::FADD) {
8626 SDValue A = LHS.getOperand(0);
8627 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008628 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008629 if (FusedOp != 0) {
8630 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008631 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008632 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008633 }
8634 }
8635
8636 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8637 if (RHS.getOpcode() == ISD::FADD) {
8638 SDValue A = RHS.getOperand(0);
8639 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008640 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008641 if (FusedOp != 0) {
8642 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008643 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008644 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008645 }
8646 }
8647
8648 return SDValue();
8649}
8650
8651SDValue SITargetLowering::performFSubCombine(SDNode *N,
8652 DAGCombinerInfo &DCI) const {
8653 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8654 return SDValue();
8655
8656 SelectionDAG &DAG = DCI.DAG;
8657 SDLoc SL(N);
8658 EVT VT = N->getValueType(0);
8659 assert(!VT.isVector());
8660
8661 // Try to get the fneg to fold into the source modifier. This undoes generic
8662 // DAG combines and folds them into the mad.
8663 //
8664 // Only do this if we are not trying to support denormals. v_mad_f32 does
8665 // not support denormals ever.
Matt Arsenault770ec862016-12-22 03:55:35 +00008666 SDValue LHS = N->getOperand(0);
8667 SDValue RHS = N->getOperand(1);
8668 if (LHS.getOpcode() == ISD::FADD) {
8669 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8670 SDValue A = LHS.getOperand(0);
8671 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008672 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008673 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008674 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8675 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8676
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008677 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008678 }
8679 }
Matt Arsenault770ec862016-12-22 03:55:35 +00008680 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008681
Matt Arsenault770ec862016-12-22 03:55:35 +00008682 if (RHS.getOpcode() == ISD::FADD) {
8683 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008684
Matt Arsenault770ec862016-12-22 03:55:35 +00008685 SDValue A = RHS.getOperand(0);
8686 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008687 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008688 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008689 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008690 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008691 }
8692 }
8693 }
8694
8695 return SDValue();
8696}
8697
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008698SDValue SITargetLowering::performFMACombine(SDNode *N,
8699 DAGCombinerInfo &DCI) const {
8700 SelectionDAG &DAG = DCI.DAG;
8701 EVT VT = N->getValueType(0);
8702 SDLoc SL(N);
8703
Stanislav Mekhanoshin0e858b02019-02-09 00:34:21 +00008704 if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008705 return SDValue();
8706
8707 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8708 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
8709 SDValue Op1 = N->getOperand(0);
8710 SDValue Op2 = N->getOperand(1);
8711 SDValue FMA = N->getOperand(2);
8712
8713 if (FMA.getOpcode() != ISD::FMA ||
8714 Op1.getOpcode() != ISD::FP_EXTEND ||
8715 Op2.getOpcode() != ISD::FP_EXTEND)
8716 return SDValue();
8717
8718  // fdot2_f32_f16 always flushes fp32 denormal operands and the fp32 output to
8719  // zero, regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8720  // is sufficient to allow generating fdot2.
8721 const TargetOptions &Options = DAG.getTarget().Options;
8722 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8723 (N->getFlags().hasAllowContract() &&
8724 FMA->getFlags().hasAllowContract())) {
8725 Op1 = Op1.getOperand(0);
8726 Op2 = Op2.getOperand(0);
8727 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8728 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8729 return SDValue();
8730
8731 SDValue Vec1 = Op1.getOperand(0);
8732 SDValue Idx1 = Op1.getOperand(1);
8733 SDValue Vec2 = Op2.getOperand(0);
8734
8735 SDValue FMAOp1 = FMA.getOperand(0);
8736 SDValue FMAOp2 = FMA.getOperand(1);
8737 SDValue FMAAcc = FMA.getOperand(2);
8738
8739 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8740 FMAOp2.getOpcode() != ISD::FP_EXTEND)
8741 return SDValue();
8742
8743 FMAOp1 = FMAOp1.getOperand(0);
8744 FMAOp2 = FMAOp2.getOperand(0);
8745 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8746 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8747 return SDValue();
8748
8749 SDValue Vec3 = FMAOp1.getOperand(0);
8750 SDValue Vec4 = FMAOp2.getOperand(0);
8751 SDValue Idx2 = FMAOp1.getOperand(1);
8752
8753 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8754 // Idx1 and Idx2 cannot be the same.
8755 Idx1 == Idx2)
8756 return SDValue();
8757
8758 if (Vec1 == Vec2 || Vec3 == Vec4)
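    // The two factors of each product must come from distinct vectors.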
8759 return SDValue();
8760
8761 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8762 return SDValue();
8763
8764 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00008765 (Vec1 == Vec4 && Vec2 == Vec3)) {
8766 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8767 DAG.getTargetConstant(0, SL, MVT::i1));
8768 }
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008769 }
8770 return SDValue();
8771}
8772
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008773SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8774 DAGCombinerInfo &DCI) const {
8775 SelectionDAG &DAG = DCI.DAG;
8776 SDLoc SL(N);
8777
8778 SDValue LHS = N->getOperand(0);
8779 SDValue RHS = N->getOperand(1);
8780 EVT VT = LHS.getValueType();
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00008781 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8782
8783 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8784 if (!CRHS) {
8785 CRHS = dyn_cast<ConstantSDNode>(LHS);
8786 if (CRHS) {
8787 std::swap(LHS, RHS);
8788 CC = getSetCCSwappedOperands(CC);
8789 }
8790 }
8791
Stanislav Mekhanoshin3b117942018-06-16 03:46:59 +00008792 if (CRHS) {
8793 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8794 isBoolSGPR(LHS.getOperand(0))) {
8795 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8796 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8797 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8798 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
8799 if ((CRHS->isAllOnesValue() &&
8800 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8801 (CRHS->isNullValue() &&
8802 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8803 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8804 DAG.getConstant(-1, SL, MVT::i1));
8805 if ((CRHS->isAllOnesValue() &&
8806 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8807 (CRHS->isNullValue() &&
8808 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8809 return LHS.getOperand(0);
8810 }
8811
8812 uint64_t CRHSVal = CRHS->getZExtValue();
8813 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8814 LHS.getOpcode() == ISD::SELECT &&
8815 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8816 isa<ConstantSDNode>(LHS.getOperand(2)) &&
8817 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8818 isBoolSGPR(LHS.getOperand(0))) {
8819 // Given CT != FT:
8820 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8821 // setcc (select cc, CT, CF), CF, ne => cc
8822 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8823 // setcc (select cc, CT, CF), CT, eq => cc
8824 uint64_t CT = LHS.getConstantOperandVal(1);
8825 uint64_t CF = LHS.getConstantOperandVal(2);
8826
8827 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8828 (CT == CRHSVal && CC == ISD::SETNE))
8829 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8830 DAG.getConstant(-1, SL, MVT::i1));
8831 if ((CF == CRHSVal && CC == ISD::SETNE) ||
8832 (CT == CRHSVal && CC == ISD::SETEQ))
8833 return LHS.getOperand(0);
8834 }
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00008835 }
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008836
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00008837 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8838 VT != MVT::f16))
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008839 return SDValue();
8840
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008841 // Match isinf/isfinite pattern
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008842 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008843 // (fcmp one (fabs x), inf) -> (fp_class x,
8844 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8845 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008846 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8847 if (!CRHS)
8848 return SDValue();
8849
8850 const APFloat &APF = CRHS->getValueAPF();
8851 if (APF.isInfinity() && !APF.isNegative()) {
Matt Arsenault8ad00d32018-08-10 18:58:41 +00008852 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8853 SIInstrFlags::N_INFINITY;
8854 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8855 SIInstrFlags::P_ZERO |
8856 SIInstrFlags::N_NORMAL |
8857 SIInstrFlags::P_NORMAL |
8858 SIInstrFlags::N_SUBNORMAL |
8859 SIInstrFlags::P_SUBNORMAL;
8860 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00008861 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8862 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008863 }
8864 }
8865
8866 return SDValue();
8867}
8868
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008869SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8870 DAGCombinerInfo &DCI) const {
8871 SelectionDAG &DAG = DCI.DAG;
8872 SDLoc SL(N);
8873 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8874
8875 SDValue Src = N->getOperand(0);
8876 SDValue Srl = N->getOperand(0);
8877 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8878 Srl = Srl.getOperand(0);
8879
8880 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8881 if (Srl.getOpcode() == ISD::SRL) {
8882 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8883 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8884 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8885
8886 if (const ConstantSDNode *C =
8887 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8888 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8889 EVT(MVT::i32));
8890
8891 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8892 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8893 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8894 MVT::f32, Srl);
8895 }
8896 }
8897 }
8898
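  // Only the byte selected by this cvt_f32_ubyteN is demanded from the source,
  // so try to simplify the source based on that.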
8899 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8900
Craig Topperd0af7e82017-04-28 05:31:46 +00008901 KnownBits Known;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008902 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8903 !DCI.isBeforeLegalizeOps());
8904 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Stanislav Mekhanoshined0d6c62019-01-09 02:24:22 +00008905 if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008906 DCI.CommitTargetLoweringOpt(TLO);
8907 }
8908
8909 return SDValue();
8910}
8911
Tom Stellard1b95fed2018-05-24 05:28:34 +00008912SDValue SITargetLowering::performClampCombine(SDNode *N,
8913 DAGCombinerInfo &DCI) const {
8914 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8915 if (!CSrc)
8916 return SDValue();
8917
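  // Fold clamp of a constant: values below 0.0 (and NaN when DX10 clamp is
  // enabled) become 0.0, values above 1.0 become 1.0, anything else is returned
  // unchanged.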
8918 const APFloat &F = CSrc->getValueAPF();
8919 APFloat Zero = APFloat::getZero(F.getSemantics());
8920 APFloat::cmpResult Cmp0 = F.compare(Zero);
8921 if (Cmp0 == APFloat::cmpLessThan ||
8922 (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8923 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8924 }
8925
8926 APFloat One(F.getSemantics(), "1.0");
8927 APFloat::cmpResult Cmp1 = F.compare(One);
8928 if (Cmp1 == APFloat::cmpGreaterThan)
8929 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8930
8931 return SDValue(CSrc, 0);
8932}
8933
8934
Tom Stellard75aadc22012-12-11 21:25:42 +00008935SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8936 DAGCombinerInfo &DCI) const {
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00008937 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
8938 return SDValue();
8939
Tom Stellard75aadc22012-12-11 21:25:42 +00008940 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00008941 default:
8942 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008943 case ISD::ADD:
8944 return performAddCombine(N, DCI);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008945 case ISD::SUB:
8946 return performSubCombine(N, DCI);
8947 case ISD::ADDCARRY:
8948 case ISD::SUBCARRY:
8949 return performAddCarrySubCarryCombine(N, DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008950 case ISD::FADD:
8951 return performFAddCombine(N, DCI);
8952 case ISD::FSUB:
8953 return performFSubCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00008954 case ISD::SETCC:
8955 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00008956 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008957 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008958 case ISD::FMAXNUM_IEEE:
8959 case ISD::FMINNUM_IEEE:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008960 case ISD::SMAX:
8961 case ISD::SMIN:
8962 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00008963 case ISD::UMIN:
8964 case AMDGPUISD::FMIN_LEGACY:
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00008965 case AMDGPUISD::FMAX_LEGACY:
8966 return performMinMaxCombine(N, DCI);
Farhana Aleenc370d7b2018-07-16 18:19:59 +00008967 case ISD::FMA:
8968 return performFMACombine(N, DCI);
Matt Arsenault90083d32018-06-07 09:54:49 +00008969 case ISD::LOAD: {
8970    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
8971      return Widened;
8972 LLVM_FALLTHROUGH;
8973 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00008974 case ISD::STORE:
8975 case ISD::ATOMIC_LOAD:
8976 case ISD::ATOMIC_STORE:
8977 case ISD::ATOMIC_CMP_SWAP:
8978 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8979 case ISD::ATOMIC_SWAP:
8980 case ISD::ATOMIC_LOAD_ADD:
8981 case ISD::ATOMIC_LOAD_SUB:
8982 case ISD::ATOMIC_LOAD_AND:
8983 case ISD::ATOMIC_LOAD_OR:
8984 case ISD::ATOMIC_LOAD_XOR:
8985 case ISD::ATOMIC_LOAD_NAND:
8986 case ISD::ATOMIC_LOAD_MIN:
8987 case ISD::ATOMIC_LOAD_MAX:
8988 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00008989 case ISD::ATOMIC_LOAD_UMAX:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00008990 case ISD::ATOMIC_LOAD_FADD:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00008991 case AMDGPUISD::ATOMIC_INC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00008992 case AMDGPUISD::ATOMIC_DEC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00008993 case AMDGPUISD::ATOMIC_LOAD_FMIN:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00008994 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00008995 if (DCI.isBeforeLegalize())
8996 break;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008997 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00008998 case ISD::AND:
8999 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009000 case ISD::OR:
9001 return performOrCombine(N, DCI);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00009002 case ISD::XOR:
9003 return performXorCombine(N, DCI);
Matt Arsenault8edfaee2017-03-31 19:53:03 +00009004 case ISD::ZERO_EXTEND:
9005 return performZeroExtendCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009006 case AMDGPUISD::FP_CLASS:
9007 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00009008 case ISD::FCANONICALIZE:
9009 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009010 case AMDGPUISD::RCP:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009011 return performRcpCombine(N, DCI);
9012 case AMDGPUISD::FRACT:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009013 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00009014 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009015 case AMDGPUISD::RSQ_LEGACY:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009016 case AMDGPUISD::RCP_IFLAG:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009017 case AMDGPUISD::RSQ_CLAMP:
9018 case AMDGPUISD::LDEXP: {
9019 SDValue Src = N->getOperand(0);
9020 if (Src.isUndef())
9021 return Src;
9022 break;
9023 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009024 case ISD::SINT_TO_FP:
9025 case ISD::UINT_TO_FP:
9026 return performUCharToFloatCombine(N, DCI);
9027 case AMDGPUISD::CVT_F32_UBYTE0:
9028 case AMDGPUISD::CVT_F32_UBYTE1:
9029 case AMDGPUISD::CVT_F32_UBYTE2:
9030 case AMDGPUISD::CVT_F32_UBYTE3:
9031 return performCvtF32UByteNCombine(N, DCI);
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00009032 case AMDGPUISD::FMED3:
9033 return performFMed3Combine(N, DCI);
Matt Arsenault1f17c662017-02-22 00:27:34 +00009034 case AMDGPUISD::CVT_PKRTZ_F16_F32:
9035 return performCvtPkRTZCombine(N, DCI);
Tom Stellard1b95fed2018-05-24 05:28:34 +00009036 case AMDGPUISD::CLAMP:
9037 return performClampCombine(N, DCI);
Matt Arsenaulteb522e62017-02-27 22:15:25 +00009038 case ISD::SCALAR_TO_VECTOR: {
9039 SelectionDAG &DAG = DCI.DAG;
9040 EVT VT = N->getValueType(0);
9041
9042 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9043 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9044 SDLoc SL(N);
9045 SDValue Src = N->getOperand(0);
9046 EVT EltVT = Src.getValueType();
9047 if (EltVT == MVT::f16)
9048 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9049
9050 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9051 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9052 }
9053
9054 break;
9055 }
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00009056 case ISD::EXTRACT_VECTOR_ELT:
9057 return performExtractVectorEltCombine(N, DCI);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00009058 case ISD::INSERT_VECTOR_ELT:
9059 return performInsertVectorEltCombine(N, DCI);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009060 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00009061 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00009062}
Christian Konigd910b7d2013-02-26 17:52:16 +00009063
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009064/// Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00009065static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009066 switch (Idx) {
9067 default: return 0;
9068 case AMDGPU::sub0: return 0;
9069 case AMDGPU::sub1: return 1;
9070 case AMDGPU::sub2: return 2;
9071 case AMDGPU::sub3: return 3;
David Stuttardf77079f2019-01-14 11:55:24 +00009072 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
Christian Konig8e06e2a2013-04-10 08:39:08 +00009073 }
9074}
9075
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009076/// Adjust the writemask of MIMG instructions
Matt Arsenault68f05052017-12-04 22:18:27 +00009077SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9078 SelectionDAG &DAG) const {
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009079 unsigned Opcode = Node->getMachineOpcode();
9080
9081 // Subtract 1 because the vdata output is not a MachineSDNode operand.
9082 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9083 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9084 return Node; // not implemented for D16
9085
David Stuttardf77079f2019-01-14 11:55:24 +00009086 SDNode *Users[5] = { nullptr };
Tom Stellard54774e52013-10-23 02:53:47 +00009087 unsigned Lane = 0;
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009088 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009089 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00009090 unsigned NewDmask = 0;
David Stuttardf77079f2019-01-14 11:55:24 +00009091 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9092 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9093 bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9094 Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
9095 unsigned TFCLane = 0;
Matt Arsenault856777d2017-12-08 20:00:57 +00009096 bool HasChain = Node->getNumValues() > 1;
9097
9098 if (OldDmask == 0) {
9099 // These are folded out, but on the chance it happens don't assert.
9100 return Node;
9101 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009102
David Stuttardf77079f2019-01-14 11:55:24 +00009103 unsigned OldBitsSet = countPopulation(OldDmask);
9104 // Work out which is the TFE/LWE lane if that is enabled.
9105 if (UsesTFC) {
9106 TFCLane = OldBitsSet;
9107 }
9108
Christian Konig8e06e2a2013-04-10 08:39:08 +00009109 // Try to figure out the used register components
9110 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9111 I != E; ++I) {
9112
Matt Arsenault93e65ea2017-02-22 21:16:41 +00009113 // Don't look at users of the chain.
9114 if (I.getUse().getResNo() != 0)
9115 continue;
9116
Christian Konig8e06e2a2013-04-10 08:39:08 +00009117 // Abort if we can't understand the usage
9118 if (!I->isMachineOpcode() ||
9119 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
Matt Arsenault68f05052017-12-04 22:18:27 +00009120 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009121
Francis Visoiu Mistrih9d7bb0c2017-11-28 17:15:09 +00009122 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
Tom Stellard54774e52013-10-23 02:53:47 +00009123 // Note that subregs are packed, i.e. Lane==0 is the first bit set
9124 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9125 // set, etc.
Christian Konig8b1ed282013-04-10 08:39:16 +00009126 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00009127
David Stuttardf77079f2019-01-14 11:55:24 +00009128 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9129 if (UsesTFC && Lane == TFCLane) {
9130 Users[Lane] = *I;
9131 } else {
9132 // Set which texture component corresponds to the lane.
9133 unsigned Comp;
9134 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9135 Comp = countTrailingZeros(Dmask);
9136 Dmask &= ~(1 << Comp);
9137 }
9138
9139 // Abort if we have more than one user per component.
9140 if (Users[Lane])
9141 return Node;
9142
9143 Users[Lane] = *I;
9144 NewDmask |= 1 << Comp;
Tom Stellard54774e52013-10-23 02:53:47 +00009145 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009146 }
9147
David Stuttardf77079f2019-01-14 11:55:24 +00009148 // Don't allow 0 dmask, as hardware assumes one channel enabled.
9149 bool NoChannels = !NewDmask;
9150 if (NoChannels) {
9151    // If the original dmask has only one channel set, there is nothing to do.
9152 if (OldBitsSet == 1)
9153 return Node;
9154 // Use an arbitrary dmask - required for the instruction to work
9155 NewDmask = 1;
9156 }
Tom Stellard54774e52013-10-23 02:53:47 +00009157 // Abort if there's no change
9158 if (NewDmask == OldDmask)
Matt Arsenault68f05052017-12-04 22:18:27 +00009159 return Node;
9160
9161 unsigned BitsSet = countPopulation(NewDmask);
9162
David Stuttardf77079f2019-01-14 11:55:24 +00009163 // Check for TFE or LWE - increase the number of channels by one to account
9164  // for the extra return value.
9165  // This will need adjustment for D16 if it is also included in
9166  // adjustWritemask (this function), but at present D16 is excluded.
9167 unsigned NewChannels = BitsSet + UsesTFC;
9168
9169 int NewOpcode =
9170 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
Matt Arsenault68f05052017-12-04 22:18:27 +00009171 assert(NewOpcode != -1 &&
9172 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9173 "failed to find equivalent MIMG op");
Christian Konig8e06e2a2013-04-10 08:39:08 +00009174
9175 // Adjust the writemask in the node
Matt Arsenault68f05052017-12-04 22:18:27 +00009176 SmallVector<SDValue, 12> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009177 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009178 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009179 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Christian Konig8e06e2a2013-04-10 08:39:08 +00009180
Matt Arsenault68f05052017-12-04 22:18:27 +00009181 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9182
David Stuttardf77079f2019-01-14 11:55:24 +00009183 MVT ResultVT = NewChannels == 1 ?
9184 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9185 NewChannels == 5 ? 8 : NewChannels);
Matt Arsenault856777d2017-12-08 20:00:57 +00009186 SDVTList NewVTList = HasChain ?
9187 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9188
Matt Arsenault68f05052017-12-04 22:18:27 +00009189
9190 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9191 NewVTList, Ops);
Matt Arsenaultecad0d532017-12-08 20:00:45 +00009192
Matt Arsenault856777d2017-12-08 20:00:57 +00009193 if (HasChain) {
9194 // Update chain.
Chandler Carruth66654b72018-08-14 23:30:32 +00009195 DAG.setNodeMemRefs(NewNode, Node->memoperands());
Matt Arsenault856777d2017-12-08 20:00:57 +00009196 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9197 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009198
David Stuttardf77079f2019-01-14 11:55:24 +00009199 if (NewChannels == 1) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009200 assert(Node->hasNUsesOfValue(1, 0));
9201 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9202 SDLoc(Node), Users[Lane]->getValueType(0),
9203 SDValue(NewNode, 0));
Christian Konig8b1ed282013-04-10 08:39:16 +00009204 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
Matt Arsenault68f05052017-12-04 22:18:27 +00009205 return nullptr;
Christian Konig8b1ed282013-04-10 08:39:16 +00009206 }
9207
Christian Konig8e06e2a2013-04-10 08:39:08 +00009208 // Update the users of the node with the new indices
David Stuttardf77079f2019-01-14 11:55:24 +00009209 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009210 SDNode *User = Users[i];
David Stuttardf77079f2019-01-14 11:55:24 +00009211 if (!User) {
9212 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9213 // Users[0] is still nullptr because channel 0 doesn't really have a use.
9214 if (i || !NoChannels)
9215 continue;
9216 } else {
9217 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9218 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9219 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009220
9221 switch (Idx) {
9222 default: break;
9223 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9224 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9225 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
David Stuttardf77079f2019-01-14 11:55:24 +00009226 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009227 }
9228 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009229
9230 DAG.RemoveDeadNode(Node);
9231 return nullptr;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009232}
9233
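// Returns true if Op is a frame index, possibly wrapped in an AssertZext.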
Tom Stellardc98ee202015-07-16 19:40:07 +00009234static bool isFrameIndexOp(SDValue Op) {
9235 if (Op.getOpcode() == ISD::AssertZext)
9236 Op = Op.getOperand(0);
9237
9238 return isa<FrameIndexSDNode>(Op);
9239}
9240
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009241/// Legalize target independent instructions (e.g. INSERT_SUBREG)
Tom Stellard3457a842014-10-09 19:06:00 +00009242/// with frame index operands.
9243/// LLVM assumes that inputs to these instructions are registers.
Matt Arsenault0d0d6c22017-04-12 21:58:23 +00009244SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9245 SelectionDAG &DAG) const {
9246 if (Node->getOpcode() == ISD::CopyToReg) {
9247 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9248 SDValue SrcVal = Node->getOperand(2);
9249
9250 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9251 // to try understanding copies to physical registers.
9252 if (SrcVal.getValueType() == MVT::i1 &&
9253 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9254 SDLoc SL(Node);
9255 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9256 SDValue VReg = DAG.getRegister(
9257 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9258
9259 SDNode *Glued = Node->getGluedNode();
9260 SDValue ToVReg
9261 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9262 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9263 SDValue ToResultReg
9264 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9265 VReg, ToVReg.getValue(1));
9266 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9267 DAG.RemoveDeadNode(Node);
9268 return ToResultReg.getNode();
9269 }
9270 }
Tom Stellard8dd392e2014-10-09 18:09:15 +00009271
9272 SmallVector<SDValue, 8> Ops;
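  // Rebuild the operand list, materializing each frame index operand into an
  // SGPR with S_MOV_B32 so the instruction only sees register inputs.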
Tom Stellard3457a842014-10-09 19:06:00 +00009273 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00009274 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00009275 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009276 continue;
9277 }
9278
Tom Stellard3457a842014-10-09 19:06:00 +00009279 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009280 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00009281 Node->getOperand(i).getValueType(),
9282 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009283 }
9284
Mark Searles4e3d6162017-10-16 23:38:53 +00009285 return DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009286}
9287
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009288/// Fold the instructions after selecting them.
Matt Arsenault68f05052017-12-04 22:18:27 +00009289/// Returns null if users were already updated.
Christian Konig8e06e2a2013-04-10 08:39:08 +00009290SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9291 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009292 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009293 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00009294
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00009295 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009296 !TII->isGather4(Opcode)) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009297 return adjustWritemask(Node, DAG);
9298 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009299
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009300 if (Opcode == AMDGPU::INSERT_SUBREG ||
9301 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00009302 legalizeTargetIndependentNode(Node, DAG);
9303 return Node;
9304 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009305
9306 switch (Opcode) {
9307 case AMDGPU::V_DIV_SCALE_F32:
9308 case AMDGPU::V_DIV_SCALE_F64: {
9309 // Satisfy the operand register constraint when one of the inputs is
9310 // undefined. Ordinarily each undef value will have its own implicit_def of
9311 // a vreg, so force these to use a single register.
9312 SDValue Src0 = Node->getOperand(0);
9313 SDValue Src1 = Node->getOperand(1);
9314 SDValue Src2 = Node->getOperand(2);
9315
9316 if ((Src0.isMachineOpcode() &&
9317 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9318 (Src0 == Src1 || Src0 == Src2))
9319 break;
9320
9321 MVT VT = Src0.getValueType().getSimpleVT();
9322 const TargetRegisterClass *RC = getRegClassFor(VT);
9323
9324 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9325 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9326
9327 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9328 UndefReg, Src0, SDValue());
9329
9330 // src0 must be the same register as src1 or src2, even if the value is
9331 // undefined, so make sure we don't violate this constraint.
9332 if (Src0.isMachineOpcode() &&
9333 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9334 if (Src1.isMachineOpcode() &&
9335 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9336 Src0 = Src1;
9337 else if (Src2.isMachineOpcode() &&
9338 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9339 Src0 = Src2;
9340 else {
9341 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9342 Src0 = UndefReg;
9343 Src1 = UndefReg;
9344 }
9345 } else
9346 break;
9347
9348 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9349 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9350 Ops.push_back(Node->getOperand(I));
9351
9352 Ops.push_back(ImpDef.getValue(1));
9353 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9354 }
9355 default:
9356 break;
9357 }
9358
Tom Stellard654d6692015-01-08 15:08:17 +00009359 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009360}
Christian Konig8b1ed282013-04-10 08:39:16 +00009361
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009362/// Assign the register class depending on the number of
Christian Konig8b1ed282013-04-10 08:39:16 +00009363/// bits set in the writemask
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009364void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00009365 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009366 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009367
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009368 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009369
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009370 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009371 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009372 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009373 return;
9374 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00009375
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009376 // Replace unused atomics with the no return version.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009377 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009378 if (NoRetAtomicOp != -1) {
9379 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009380 MI.setDesc(TII->get(NoRetAtomicOp));
9381 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009382 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009383 }
9384
Tom Stellard354a43c2016-04-01 18:27:37 +00009385 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9386 // instruction, because the return type of these instructions is a vec2 of
9387 // the memory type, so it can be tied to the input operand.
9388 // This means these instructions always have a use, so we need to add a
9389 // special case to check if the atomic has only one extract_subreg use,
9390 // which itself has no uses.
9391 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00009392 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00009393 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9394 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009395 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00009396
9397 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009398 MI.setDesc(TII->get(NoRetAtomicOp));
9399 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009400
9401 // If we only remove the def operand from the atomic instruction, the
9402 // extract_subreg will be left with a use of a vreg without a def.
9403 // So we need to insert an implicit_def to avoid machine verifier
9404 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009405 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00009406 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9407 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009408 return;
9409 }
Christian Konig8b1ed282013-04-10 08:39:16 +00009410}
Tom Stellard0518ff82013-06-03 17:39:58 +00009411
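// Materialize a 32-bit immediate into an SGPR with S_MOV_B32.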
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009412static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9413 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009414 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00009415 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9416}
9417
9418MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009419 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00009420 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009421 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00009422
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009423  // Build the subregister half that holds the constants before building the
9424 // full 128-bit register. If we are building multiple resource descriptors,
9425 // this will allow CSEing of the 2-component register.
9426 const SDValue Ops0[] = {
9427 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9428 buildSMovImm32(DAG, DL, 0),
9429 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9430 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9431 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9432 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009433
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009434 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9435 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +00009436
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009437 // Combine the constants and the pointer.
9438 const SDValue Ops1[] = {
9439 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9440 Ptr,
9441 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9442 SubRegHi,
9443 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9444 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009445
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009446 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +00009447}
9448
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009449/// Return a resource descriptor with the 'Add TID' bit enabled
Benjamin Kramerdf005cb2015-08-08 18:27:36 +00009450/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9451/// of the resource descriptor) to create an offset, which is added to
9452/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009453MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9454 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009455 uint64_t RsrcDword2And3) const {
9456 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9457 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9458 if (RsrcDword1) {
9459 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009460 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9461 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009462 }
9463
9464 SDValue DataLo = buildSMovImm32(DAG, DL,
9465 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9466 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9467
9468 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009469 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009470 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009471 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009472 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009473 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009474 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009475 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009476 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009477 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009478 };
9479
9480 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9481}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::SReg_32_XM0RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      case 128:
        RC = &AMDGPU::SReg_128RegClass;
        break;
      case 256:
        RC = &AMDGPU::SReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::SReg_512RegClass;
        break;
      }
      break;
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::VReg_64RegClass;
        break;
      case 96:
        RC = &AMDGPU::VReg_96RegClass;
        break;
      case 128:
        RC = &AMDGPU::VReg_128RegClass;
        break;
      case 256:
        RC = &AMDGPU::VReg_256RegClass;
        break;
      case 512:
        RC = &AMDGPU::VReg_512RegClass;
        break;
      }
      break;
    }
    // We actually support i128, i16, and f16 as inline parameters
    // even if they are not reported as legal.
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::make_pair(0U, RC);
  }

  if (Constraint.size() > 1) {
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
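
// Illustrative usage (added, hedged; only the constraint letters come from the
// handlers above, the asm string itself is hypothetical): source-level inline
// assembly can request register kinds with these letters, e.g.
//   float r;
//   __asm("v_add_f32 %0, %1, %2" : "=v"(r) : "v"(a), "v"(b));
// places the operands in VGPRs, while an "s"-constrained operand would be
// assigned to an SGPR of a width matching its type.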

// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  // We have to assume the SP is needed in case there are calls in the function
  // during lowering. Calls are only detected after the function is
  // lowered. We're about to reserve registers, so don't bother using it if we
  // aren't really going to use it.
  bool NeedSP = !Info->isEntryFunction() ||
                MFI.hasVarSizedObjects() ||
                MFI.hasCalls();

  if (NeedSP) {
    unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);

    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                               Info->getStackPtrOffsetReg()));
    if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
      MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
  }

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
                       Info->getScratchWaveOffsetReg());
  }

  Info->limitOccupancy(MF);

  TargetLoweringBase::finalizeLowering(MF);
}
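
// Worked example (added; it follows directly from the NeedSP logic above): an
// entry-point kernel with no calls and no variable-sized stack objects never
// materializes a stack pointer, so AMDGPU::SP_REG keeps its placeholder value,
// whereas any function that makes calls gets the SGPR chosen by
// reservedStackPtrOffsetReg() substituted for SP_REG here.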

void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  if (getSubtarget()->enableHugePrivateBuffer())
    return;

  // Technically it may be possible to have a dispatch with a single workitem
  // that uses the full private memory size, but that's not really useful. We
  // can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
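
// Added note (hedged; a consequence of the setHighBits call above): marking
// the high bits of a frame index as known-zero lets the DAG prove that
// "FrameIndex + small_offset" cannot wrap into the sign bit, which in turn
// allows the offset to be folded into the MUBUF vaddr addressing mode.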

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}

bool SITargetLowering::isSDNodeSourceOfDivergence(
    const SDNode *N, FunctionLoweringInfo *FLI,
    LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineFunction *MF = FLI->MF;
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    unsigned Reg = R->getReg();
    if (TRI.isPhysicalRegister(Reg))
      return !TRI.isSGPRReg(MRI, Reg);

    if (MRI.isLiveIn(Reg)) {
      // Workitem IDs (workitem.id.x/y/z) and any other VGPR formal argument
      // are considered divergent.
      if (!TRI.isSGPRReg(MRI, Reg))
        return true;
      // Formal arguments of non-entry functions are conservatively
      // considered divergent.
      else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
        return true;
      return false;
    }
    const Value *V = FLI->getValueFromVirtualReg(Reg);
    if (V)
      return KDA->isDivergent(V);
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI.isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  // In some cases intrinsics that are a source of divergence have been
  // lowered to AMDGPUISD, so we need to check those too.
  case AMDGPUISD::INTERP_MOV:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
    return true;
  }
  return false;
}
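
// Illustrative examples (added; they restate the cases handled above rather
// than introduce new rules): a CopyFromReg of a live-in VGPR such as a
// workitem ID is divergent because every lane sees a different value, a
// CopyFromReg of an SGPR kernel argument is uniform, and a load through a
// flat or private pointer is conservatively treated as divergent since lanes
// may be touching per-lane scratch memory.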

bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
    return Subtarget->hasFP32Denormals();
  case MVT::f64:
    return Subtarget->hasFP64Denormals();
  case MVT::f16:
    return Subtarget->hasFP16Denormals();
  default:
    return false;
  }
}

bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    if (Subtarget->enableDX10Clamp())
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
                                                            SNaN, Depth);
}
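
// Added note (hedged, based on the DX10 clamp behavior assumed above): when
// DX10 clamp is enabled, the clamp maps a NaN input to 0.0, so an
// AMDGPUISD::CLAMP node can be reported as never-NaN without inspecting its
// operand; otherwise we can only say so when the operand itself is never NaN.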

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // We don't have a way to support 16-bit atomics now, so just leave them
    // as-is.
    if (Ty->isHalfTy())
      return AtomicExpansionKind::None;

    if (!Ty->isFloatTy())
      return AtomicExpansionKind::CmpXChg;

    // TODO: Do have these for flat. Older targets also had them for buffers.
    unsigned AS = RMW->getPointerAddressSpace();
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
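
// Worked example (added; the IR snippet is illustrative, the decision follows
// the FAdd case above): given
//   %r = atomicrmw fadd float addrspace(3)* %lds.ptr, float %v monotonic
// a subtarget with LDS FP atomics keeps the operation as a native atomic
// (AtomicExpansionKind::None), while the same fadd on a global or flat
// pointer, or any f64 fadd, is rewritten by the AtomicExpand pass into a
// cmpxchg loop; f16 fadd is left untouched here.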