Tom Stellard75aadc22012-12-11 21:25:42 +00001//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Tom Stellard75aadc22012-12-11 21:25:42 +00006//
7//===----------------------------------------------------------------------===//
8//
9/// \file
Adrian Prantl5f8f34e42018-05-01 15:54:18 +000010/// Custom DAG lowering for SI
Tom Stellard75aadc22012-12-11 21:25:42 +000011//
12//===----------------------------------------------------------------------===//
13
Sylvestre Ledrudf92dab2018-11-02 17:25:40 +000014#if defined(_MSC_VER) || defined(__MINGW32__)
NAKAMURA Takumi45e0a832014-07-20 11:15:07 +000015// Provide M_PI.
16#define _USE_MATH_DEFINES
NAKAMURA Takumi45e0a832014-07-20 11:15:07 +000017#endif
18
Chandler Carruth6bda14b2017-06-06 11:49:48 +000019#include "SIISelLowering.h"
Christian Konig99ee0f42013-03-07 09:04:14 +000020#include "AMDGPU.h"
Matt Arsenault41e2f2b2014-02-24 21:01:28 +000021#include "AMDGPUSubtarget.h"
Chandler Carruth6bda14b2017-06-06 11:49:48 +000022#include "AMDGPUTargetMachine.h"
Tom Stellard8485fa02016-12-07 02:42:15 +000023#include "SIDefines.h"
Tom Stellard75aadc22012-12-11 21:25:42 +000024#include "SIInstrInfo.h"
25#include "SIMachineFunctionInfo.h"
26#include "SIRegisterInfo.h"
Tom Stellard44b30b42018-05-22 02:03:23 +000027#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000028#include "Utils/AMDGPUBaseInfo.h"
29#include "llvm/ADT/APFloat.h"
30#include "llvm/ADT/APInt.h"
31#include "llvm/ADT/ArrayRef.h"
Alexey Samsonova253bf92014-08-27 19:36:53 +000032#include "llvm/ADT/BitVector.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000033#include "llvm/ADT/SmallVector.h"
Matt Arsenault71bcbd42017-08-11 20:42:08 +000034#include "llvm/ADT/Statistic.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000035#include "llvm/ADT/StringRef.h"
Matt Arsenault9a10cea2016-01-26 04:29:24 +000036#include "llvm/ADT/StringSwitch.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000037#include "llvm/ADT/Twine.h"
Wei Ding07e03712016-07-28 16:42:13 +000038#include "llvm/CodeGen/Analysis.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000039#include "llvm/CodeGen/CallingConvLower.h"
40#include "llvm/CodeGen/DAGCombine.h"
41#include "llvm/CodeGen/ISDOpcodes.h"
42#include "llvm/CodeGen/MachineBasicBlock.h"
43#include "llvm/CodeGen/MachineFrameInfo.h"
44#include "llvm/CodeGen/MachineFunction.h"
45#include "llvm/CodeGen/MachineInstr.h"
46#include "llvm/CodeGen/MachineInstrBuilder.h"
47#include "llvm/CodeGen/MachineMemOperand.h"
Matt Arsenault8623e8d2017-08-03 23:00:29 +000048#include "llvm/CodeGen/MachineModuleInfo.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000049#include "llvm/CodeGen/MachineOperand.h"
50#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000051#include "llvm/CodeGen/SelectionDAG.h"
52#include "llvm/CodeGen/SelectionDAGNodes.h"
David Blaikieb3bde2e2017-11-17 01:07:10 +000053#include "llvm/CodeGen/TargetCallingConv.h"
54#include "llvm/CodeGen/TargetRegisterInfo.h"
Craig Topper2fa14362018-03-29 17:21:10 +000055#include "llvm/CodeGen/ValueTypes.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000056#include "llvm/IR/Constants.h"
57#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/DebugLoc.h"
59#include "llvm/IR/DerivedTypes.h"
Oliver Stannard7e7d9832016-02-02 13:52:43 +000060#include "llvm/IR/DiagnosticInfo.h"
Benjamin Kramerd78bb462013-05-23 17:10:37 +000061#include "llvm/IR/Function.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000062#include "llvm/IR/GlobalValue.h"
63#include "llvm/IR/InstrTypes.h"
64#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Instructions.h"
Matt Arsenault7dc01c92017-03-15 23:15:12 +000066#include "llvm/IR/IntrinsicInst.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000067#include "llvm/IR/Type.h"
68#include "llvm/Support/Casting.h"
69#include "llvm/Support/CodeGen.h"
70#include "llvm/Support/CommandLine.h"
71#include "llvm/Support/Compiler.h"
72#include "llvm/Support/ErrorHandling.h"
Craig Topperd0af7e82017-04-28 05:31:46 +000073#include "llvm/Support/KnownBits.h"
David Blaikie13e77db2018-03-23 23:58:25 +000074#include "llvm/Support/MachineValueType.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000075#include "llvm/Support/MathExtras.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000076#include "llvm/Target/TargetOptions.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000077#include <cassert>
78#include <cmath>
79#include <cstdint>
80#include <iterator>
81#include <tuple>
82#include <utility>
83#include <vector>
Tom Stellard75aadc22012-12-11 21:25:42 +000084
85using namespace llvm;
86
Matt Arsenault71bcbd42017-08-11 20:42:08 +000087#define DEBUG_TYPE "si-lower"
88
89STATISTIC(NumTailCalls, "Number of tail calls");
90
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000091static cl::opt<bool> EnableVGPRIndexMode(
92 "amdgpu-vgpr-index-mode",
93 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
94 cl::init(false));
95
Matt Arsenault45b98182017-11-15 00:45:43 +000096static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
97 "amdgpu-frame-index-zero-bits",
98 cl::desc("High bits of frame index assumed to be zero"),
99 cl::init(5),
100 cl::ReallyHidden);
101
Stanislav Mekhanoshin93f15c92019-05-03 21:17:29 +0000102static cl::opt<bool> DisableLoopAlignment(
103 "amdgpu-disable-loop-alignment",
104 cl::desc("Do not align and prefetch loops"),
105 cl::init(false));
106
Tom Stellardf110f8f2016-04-14 16:27:03 +0000107static unsigned findFirstFreeSGPR(CCState &CCInfo) {
108 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
109 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
110 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
111 return AMDGPU::SGPR0 + Reg;
112 }
113 }
114 llvm_unreachable("Cannot allocate sgpr");
115}
116
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000117SITargetLowering::SITargetLowering(const TargetMachine &TM,
Tom Stellard5bfbae52018-07-11 20:59:01 +0000118 const GCNSubtarget &STI)
Tom Stellardc5a154d2018-06-28 23:47:12 +0000119 : AMDGPUTargetLowering(TM, STI),
120 Subtarget(&STI) {
Tom Stellard1bd80722014-04-30 15:31:33 +0000121 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
Tom Stellard436780b2014-05-15 14:41:57 +0000122 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
Christian Konig2214f142013-03-07 09:03:38 +0000123
Marek Olsak79c05872016-11-25 17:37:09 +0000124 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
Tom Stellard45c0b3a2015-01-07 20:59:25 +0000125 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
Tom Stellard75aadc22012-12-11 21:25:42 +0000126
Tom Stellard436780b2014-05-15 14:41:57 +0000127 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
128 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
129 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
Christian Konig2214f142013-03-07 09:03:38 +0000130
Tim Renouf361b5b22019-03-21 12:01:21 +0000131 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
132 addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
133
Matt Arsenault61001bb2015-11-25 19:58:34 +0000134 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
135 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
136
Tom Stellard436780b2014-05-15 14:41:57 +0000137 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
138 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
Christian Konig2214f142013-03-07 09:03:38 +0000139
Tim Renouf033f99a2019-03-22 10:11:21 +0000140 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
141 addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
142
Tom Stellardf0a21072014-11-18 20:39:39 +0000143 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
Christian Konig2214f142013-03-07 09:03:38 +0000144 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
145
Tom Stellardf0a21072014-11-18 20:39:39 +0000146 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
Christian Konig2214f142013-03-07 09:03:38 +0000147 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
Tom Stellard75aadc22012-12-11 21:25:42 +0000148
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000149 if (Subtarget->has16BitInsts()) {
Marek Olsak79c05872016-11-25 17:37:09 +0000150 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
151 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
Tom Stellard115a6152016-11-10 16:02:37 +0000152
Matt Arsenault1349a042018-05-22 06:32:10 +0000153 // Unless there are also VOP3P operations, no operations are really legal.
Matt Arsenault7596f132017-02-27 20:52:10 +0000154 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
155 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000156 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
157 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
Matt Arsenault7596f132017-02-27 20:52:10 +0000158 }
159
Tom Stellardc5a154d2018-06-28 23:47:12 +0000160 computeRegisterProperties(Subtarget->getRegisterInfo());
Tom Stellard75aadc22012-12-11 21:25:42 +0000161
Tom Stellard35bb18c2013-08-26 15:06:04 +0000162 // We need to custom lower vector loads and stores from local memory
Matt Arsenault71e66762016-05-21 02:27:49 +0000163 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
Tim Renouf361b5b22019-03-21 12:01:21 +0000164 setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
Tom Stellard35bb18c2013-08-26 15:06:04 +0000165 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
Tim Renouf033f99a2019-03-22 10:11:21 +0000166 setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
Tom Stellardaf775432013-10-23 00:44:32 +0000167 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
168 setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000169 setOperationAction(ISD::LOAD, MVT::i1, Custom);
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +0000170 setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
Matt Arsenault2b957b52016-05-02 20:07:26 +0000171
Matt Arsenaultbcdfee72016-05-02 20:13:51 +0000172 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
Tim Renouf361b5b22019-03-21 12:01:21 +0000173 setOperationAction(ISD::STORE, MVT::v3i32, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000174 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
Tim Renouf033f99a2019-03-22 10:11:21 +0000175 setOperationAction(ISD::STORE, MVT::v5i32, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000176 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
177 setOperationAction(ISD::STORE, MVT::v16i32, Custom);
178 setOperationAction(ISD::STORE, MVT::i1, Custom);
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +0000179 setOperationAction(ISD::STORE, MVT::v32i32, Custom);
Matt Arsenaultbcdfee72016-05-02 20:13:51 +0000180
Jan Vesely06200bd2017-01-06 21:00:46 +0000181 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
182 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
183 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
184 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
185 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
186 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
187 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
188 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
189 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
190 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
191
Matt Arsenault71e66762016-05-21 02:27:49 +0000192 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
193 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000194
195 setOperationAction(ISD::SELECT, MVT::i1, Promote);
Tom Stellard0ec134f2014-02-04 17:18:40 +0000196 setOperationAction(ISD::SELECT, MVT::i64, Custom);
Tom Stellardda99c6e2014-03-24 16:07:30 +0000197 setOperationAction(ISD::SELECT, MVT::f64, Promote);
198 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
Tom Stellard81d871d2013-11-13 23:36:50 +0000199
Tom Stellard3ca1bfc2014-06-10 16:01:22 +0000200 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
201 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
202 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
203 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
Matt Arsenault71e66762016-05-21 02:27:49 +0000204 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
Tom Stellard754f80f2013-04-05 23:31:51 +0000205
Tom Stellardd1efda82016-01-20 21:48:24 +0000206 setOperationAction(ISD::SETCC, MVT::i1, Promote);
Tom Stellard83747202013-07-18 21:43:53 +0000207 setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
208 setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
Matt Arsenault18f56be2016-12-22 16:27:11 +0000209 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
Tom Stellard83747202013-07-18 21:43:53 +0000210
Matt Arsenault71e66762016-05-21 02:27:49 +0000211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
212 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
Matt Arsenaulte306a322014-10-21 16:25:08 +0000213
Matt Arsenault4e466652014-04-16 01:41:30 +0000214 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
215 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
Matt Arsenault4e466652014-04-16 01:41:30 +0000216 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
217 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
Matt Arsenault4e466652014-04-16 01:41:30 +0000218 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
219 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
Matt Arsenault4e466652014-04-16 01:41:30 +0000220 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
221
Matt Arsenault754dd3e2017-04-03 18:08:08 +0000222 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
Tom Stellard9fa17912013-08-14 23:24:45 +0000223 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
Tom Stellard9fa17912013-08-14 23:24:45 +0000224 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
Matt Arsenaultb3a80e52018-08-15 21:25:20 +0000225 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
226 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
Marek Olsak13e47412018-01-31 20:18:04 +0000227 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
Matt Arsenault754dd3e2017-04-03 18:08:08 +0000228 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
229
Changpeng Fang44dfa1d2018-01-12 21:12:19 +0000230 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
231 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
David Stuttardf77079f2019-01-14 11:55:24 +0000232 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000233 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
Ryan Taylor00e063a2019-03-19 16:07:00 +0000234 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
235 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
Matt Arsenault754dd3e2017-04-03 18:08:08 +0000236
237 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
Matt Arsenault4165efd2017-01-17 07:26:53 +0000238 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
239 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +0000240 setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
Ryan Taylor00e063a2019-03-19 16:07:00 +0000241 setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
242 setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000243
Matt Arsenaulte54e1c32014-06-23 18:00:44 +0000244 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000245 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
Tom Stellardbc4497b2016-02-12 23:45:29 +0000246 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
247 setOperationAction(ISD::BR_CC, MVT::i64, Expand);
248 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
249 setOperationAction(ISD::BR_CC, MVT::f64, Expand);
Tom Stellardafcf12f2013-09-12 02:55:14 +0000250
Matt Arsenaultee3f0ac2017-01-30 18:11:38 +0000251 setOperationAction(ISD::UADDO, MVT::i32, Legal);
252 setOperationAction(ISD::USUBO, MVT::i32, Legal);
253
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +0000254 setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
255 setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
256
Matt Arsenaulte7191392018-08-08 16:58:33 +0000257 setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
258 setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
259 setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
260
Matt Arsenault84445dd2017-11-30 22:51:26 +0000261#if 0
262 setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
263 setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
264#endif
265
Benjamin Kramer867bfc52015-03-07 17:41:00 +0000266 // We only support LOAD/STORE and vector manipulation ops for vectors
267 // with > 4 elements.
Matt Arsenault7596f132017-02-27 20:52:10 +0000268 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
Stanislav Mekhanoshin44451b32018-08-31 22:43:36 +0000269 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
Tom Stellard967bf582014-02-13 23:34:15 +0000270 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
Matt Arsenault71e66762016-05-21 02:27:49 +0000271 switch (Op) {
Tom Stellard967bf582014-02-13 23:34:15 +0000272 case ISD::LOAD:
273 case ISD::STORE:
274 case ISD::BUILD_VECTOR:
275 case ISD::BITCAST:
276 case ISD::EXTRACT_VECTOR_ELT:
277 case ISD::INSERT_VECTOR_ELT:
Tom Stellard967bf582014-02-13 23:34:15 +0000278 case ISD::INSERT_SUBVECTOR:
279 case ISD::EXTRACT_SUBVECTOR:
Matt Arsenault61001bb2015-11-25 19:58:34 +0000280 case ISD::SCALAR_TO_VECTOR:
Tom Stellard967bf582014-02-13 23:34:15 +0000281 break;
Tom Stellardc0503db2014-08-09 01:06:56 +0000282 case ISD::CONCAT_VECTORS:
283 setOperationAction(Op, VT, Custom);
284 break;
Tom Stellard967bf582014-02-13 23:34:15 +0000285 default:
Matt Arsenaultd504a742014-05-15 21:44:05 +0000286 setOperationAction(Op, VT, Expand);
Tom Stellard967bf582014-02-13 23:34:15 +0000287 break;
288 }
289 }
290 }
291
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000292 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
293
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000294 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
295 // is expanded to avoid having two separate loops in case the index is a VGPR.
296
Matt Arsenault61001bb2015-11-25 19:58:34 +0000297 // Most operations are naturally 32-bit vector operations. We only support
298 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
299 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
300 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
301 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
302
303 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
304 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
305
306 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
307 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
308
309 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
310 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
311 }
312
Matt Arsenault71e66762016-05-21 02:27:49 +0000313 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
314 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
315 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
316 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +0000317
Matt Arsenault67a98152018-05-16 11:47:30 +0000318 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
319 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
320
Matt Arsenault3aef8092017-01-23 23:09:58 +0000321 // Avoid stack access for these.
322 // TODO: Generalize to more vector types.
323 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
324 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
Matt Arsenault67a98152018-05-16 11:47:30 +0000325 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
326 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
327
Matt Arsenault3aef8092017-01-23 23:09:58 +0000328 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
329 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
Matt Arsenault9224c002018-06-05 19:52:46 +0000330 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
331 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
332 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
333
334 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
335 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
336 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
Matt Arsenault3aef8092017-01-23 23:09:58 +0000337
Matt Arsenault67a98152018-05-16 11:47:30 +0000338 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
340 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
341 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
342
Tim Renouf361b5b22019-03-21 12:01:21 +0000343 // Deal with vec3 vector operations when widened to vec4.
344 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand);
345 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand);
346 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
347 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);
348
Tim Renouf033f99a2019-03-22 10:11:21 +0000349 // Deal with vec5 vector operations when widened to vec8.
350 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Expand);
351 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Expand);
352 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Expand);
353 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Expand);
354
Tom Stellard354a43c2016-04-01 18:27:37 +0000355 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
356 // and output demarshalling
357 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
358 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
359
360 // We can't return success/failure, only the old value,
361 // so let LLVM add the comparison.
362 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
363 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
364
Tom Stellardc5a154d2018-06-28 23:47:12 +0000365 if (Subtarget->hasFlatAddressSpace()) {
Matt Arsenault99c14522016-04-25 19:27:24 +0000366 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
367 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
368 }
369
Matt Arsenault71e66762016-05-21 02:27:49 +0000370 setOperationAction(ISD::BSWAP, MVT::i32, Legal);
371 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
372
373 // On SI this is s_memtime; on VI it is s_memrealtime.
374 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
Matt Arsenault3e025382017-04-24 17:49:13 +0000375 setOperationAction(ISD::TRAP, MVT::Other, Custom);
376 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000377
Tom Stellardc5a154d2018-06-28 23:47:12 +0000378 if (Subtarget->has16BitInsts()) {
379 setOperationAction(ISD::FLOG, MVT::f16, Custom);
Matt Arsenault7121bed2018-08-16 17:07:52 +0000380 setOperationAction(ISD::FEXP, MVT::f16, Custom);
Tom Stellardc5a154d2018-06-28 23:47:12 +0000381 setOperationAction(ISD::FLOG10, MVT::f16, Custom);
382 }
383
384 // v_mad_f32 does not support denormals according to some sources.
385 if (!Subtarget->hasFP32Denormals())
386 setOperationAction(ISD::FMAD, MVT::f32, Legal);
387
388 if (!Subtarget->hasBFI()) {
389 // fcopysign can be done in a single instruction with BFI.
390 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
391 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
392 }
393
394 if (!Subtarget->hasBCNT(32))
395 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
396
397 if (!Subtarget->hasBCNT(64))
398 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
399
400 if (Subtarget->hasFFBH())
401 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
402
403 if (Subtarget->hasFFBL())
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
405
406 // We only really have 32-bit BFE instructions (and 16-bit on VI).
407 //
408 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
409 // effort to match them now. We want this to be false for i64 cases when the
410 // extraction isn't restricted to the upper or lower half. Ideally we would
411 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
412 // span the midpoint are probably relatively rare, so don't worry about them
413 // for now.
414 if (Subtarget->hasBFE())
415 setHasExtractBitsInsn(true);
416
Matt Arsenault687ec752018-10-22 16:27:27 +0000417 setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
418 setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
419 setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
420 setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
421
422
423 // These are really only legal for ieee_mode functions. We should be avoiding
424 // them for functions that don't have ieee_mode enabled, so just say they are
425 // legal.
426 setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
427 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
428 setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
429 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
430
Matt Arsenault71e66762016-05-21 02:27:49 +0000431
Tom Stellard5bfbae52018-07-11 20:59:01 +0000432 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault71e66762016-05-21 02:27:49 +0000433 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
434 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
435 setOperationAction(ISD::FRINT, MVT::f64, Legal);
Tom Stellardc5a154d2018-06-28 23:47:12 +0000436 } else {
437 setOperationAction(ISD::FCEIL, MVT::f64, Custom);
438 setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
439 setOperationAction(ISD::FRINT, MVT::f64, Custom);
440 setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
Matt Arsenault71e66762016-05-21 02:27:49 +0000441 }
442
443 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
444
445 setOperationAction(ISD::FSIN, MVT::f32, Custom);
446 setOperationAction(ISD::FCOS, MVT::f32, Custom);
447 setOperationAction(ISD::FDIV, MVT::f32, Custom);
448 setOperationAction(ISD::FDIV, MVT::f64, Custom);
449
Tom Stellard115a6152016-11-10 16:02:37 +0000450 if (Subtarget->has16BitInsts()) {
451 setOperationAction(ISD::Constant, MVT::i16, Legal);
452
453 setOperationAction(ISD::SMIN, MVT::i16, Legal);
454 setOperationAction(ISD::SMAX, MVT::i16, Legal);
455
456 setOperationAction(ISD::UMIN, MVT::i16, Legal);
457 setOperationAction(ISD::UMAX, MVT::i16, Legal);
458
Tom Stellard115a6152016-11-10 16:02:37 +0000459 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
460 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
461
462 setOperationAction(ISD::ROTR, MVT::i16, Promote);
463 setOperationAction(ISD::ROTL, MVT::i16, Promote);
464
465 setOperationAction(ISD::SDIV, MVT::i16, Promote);
466 setOperationAction(ISD::UDIV, MVT::i16, Promote);
467 setOperationAction(ISD::SREM, MVT::i16, Promote);
468 setOperationAction(ISD::UREM, MVT::i16, Promote);
469
470 setOperationAction(ISD::BSWAP, MVT::i16, Promote);
471 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
472
473 setOperationAction(ISD::CTTZ, MVT::i16, Promote);
474 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
475 setOperationAction(ISD::CTLZ, MVT::i16, Promote);
476 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
Jan Veselyb283ea02018-03-02 02:50:22 +0000477 setOperationAction(ISD::CTPOP, MVT::i16, Promote);
Tom Stellard115a6152016-11-10 16:02:37 +0000478
479 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
480
481 setOperationAction(ISD::BR_CC, MVT::i16, Expand);
482
483 setOperationAction(ISD::LOAD, MVT::i16, Custom);
484
485 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
486
Tom Stellard115a6152016-11-10 16:02:37 +0000487 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
488 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
489 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
490 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
Tom Stellardb4c8e8e2016-11-12 00:19:11 +0000491
Konstantin Zhuravlyov3f0cdc72016-11-17 04:00:46 +0000492 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
493 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
494 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
495 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
Tom Stellardb4c8e8e2016-11-12 00:19:11 +0000496
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000497 // F16 - Constant Actions.
Matt Arsenaulte96d0372016-12-08 20:14:46 +0000498 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000499
500 // F16 - Load/Store Actions.
501 setOperationAction(ISD::LOAD, MVT::f16, Promote);
502 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
503 setOperationAction(ISD::STORE, MVT::f16, Promote);
504 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
505
506 // F16 - VOP1 Actions.
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +0000507 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000508 setOperationAction(ISD::FCOS, MVT::f16, Promote);
509 setOperationAction(ISD::FSIN, MVT::f16, Promote);
Konstantin Zhuravlyov3f0cdc72016-11-17 04:00:46 +0000510 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
511 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
512 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
513 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
Matt Arsenaultb5d23272017-03-24 20:04:18 +0000514 setOperationAction(ISD::FROUND, MVT::f16, Custom);
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000515
516 // F16 - VOP2 Actions.
Konstantin Zhuravlyov662e01d2016-11-17 03:49:01 +0000517 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
Konstantin Zhuravlyov2a87a422016-11-16 03:16:26 +0000518 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
Matt Arsenault687ec752018-10-22 16:27:27 +0000519
Matt Arsenault4052a572016-12-22 03:05:41 +0000520 setOperationAction(ISD::FDIV, MVT::f16, Custom);
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000521
522 // F16 - VOP3 Actions.
523 setOperationAction(ISD::FMA, MVT::f16, Legal);
Stanislav Mekhanoshin28a19362019-05-04 04:20:37 +0000524 if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000525 setOperationAction(ISD::FMAD, MVT::f16, Legal);
Tom Stellard115a6152016-11-10 16:02:37 +0000526
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000527 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
Matt Arsenault7596f132017-02-27 20:52:10 +0000528 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
529 switch (Op) {
530 case ISD::LOAD:
531 case ISD::STORE:
532 case ISD::BUILD_VECTOR:
533 case ISD::BITCAST:
534 case ISD::EXTRACT_VECTOR_ELT:
535 case ISD::INSERT_VECTOR_ELT:
536 case ISD::INSERT_SUBVECTOR:
537 case ISD::EXTRACT_SUBVECTOR:
538 case ISD::SCALAR_TO_VECTOR:
539 break;
540 case ISD::CONCAT_VECTORS:
541 setOperationAction(Op, VT, Custom);
542 break;
543 default:
544 setOperationAction(Op, VT, Expand);
545 break;
546 }
547 }
548 }
549
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000550 // XXX - Do these do anything? Vector constants turn into build_vector.
551 setOperationAction(ISD::Constant, MVT::v2i16, Legal);
552 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
553
Matt Arsenaultdfb88df2018-05-13 10:04:38 +0000554 setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
555 setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
556
Matt Arsenault7596f132017-02-27 20:52:10 +0000557 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
558 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
559 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
560 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
561
562 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
563 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
564 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
565 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000566
567 setOperationAction(ISD::AND, MVT::v2i16, Promote);
568 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
569 setOperationAction(ISD::OR, MVT::v2i16, Promote);
570 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
571 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
572 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000573
Matt Arsenault1349a042018-05-22 06:32:10 +0000574 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
575 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
576 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
577 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
578
579 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
580 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
581 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
582 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
583
584 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
585 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
586 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
587 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
588
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000589 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
590 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
591 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
592
Matt Arsenault1349a042018-05-22 06:32:10 +0000593 if (!Subtarget->hasVOP3PInsts()) {
594 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
595 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
596 }
597
598 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
599 // This isn't really legal, but this avoids the legalizer unrolling it (and
600 // allows matching fneg (fabs x) patterns)
601 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
Matt Arsenault687ec752018-10-22 16:27:27 +0000602
603 setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
604 setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
605 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
606 setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
607
608 setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
609 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
610
611 setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
612 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
Matt Arsenault1349a042018-05-22 06:32:10 +0000613 }
614
615 if (Subtarget->hasVOP3PInsts()) {
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000616 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
617 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
618 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
619 setOperationAction(ISD::SHL, MVT::v2i16, Legal);
620 setOperationAction(ISD::SRL, MVT::v2i16, Legal);
621 setOperationAction(ISD::SRA, MVT::v2i16, Legal);
622 setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
623 setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
624 setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
625 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
626
627 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000628 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
629 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
Matt Arsenault687ec752018-10-22 16:27:27 +0000630
631 setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
632 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
633
Matt Arsenault540512c2018-04-26 19:21:37 +0000634 setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000635
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000636 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
637 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000638
639 setOperationAction(ISD::SHL, MVT::v4i16, Custom);
640 setOperationAction(ISD::SRA, MVT::v4i16, Custom);
641 setOperationAction(ISD::SRL, MVT::v4i16, Custom);
642 setOperationAction(ISD::ADD, MVT::v4i16, Custom);
643 setOperationAction(ISD::SUB, MVT::v4i16, Custom);
644 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
645
646 setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
647 setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
648 setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
649 setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
650
651 setOperationAction(ISD::FADD, MVT::v4f16, Custom);
652 setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
Matt Arsenault687ec752018-10-22 16:27:27 +0000653
654 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
655 setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
656
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000657 setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
658 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
Matt Arsenault36cdcfa2018-08-02 13:43:42 +0000659 setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000660
Matt Arsenault7121bed2018-08-16 17:07:52 +0000661 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000662 setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
663 setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
Matt Arsenault1349a042018-05-22 06:32:10 +0000664 }
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000665
Matt Arsenault02dc7e12018-06-15 15:15:46 +0000666 setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
667 setOperationAction(ISD::FABS, MVT::v4f16, Custom);
668
Matt Arsenault1349a042018-05-22 06:32:10 +0000669 if (Subtarget->has16BitInsts()) {
670 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
673 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
Matt Arsenault4a486232017-04-19 20:53:07 +0000674 } else {
Matt Arsenault1349a042018-05-22 06:32:10 +0000675 // Legalization hack.
Matt Arsenault4a486232017-04-19 20:53:07 +0000676 setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
677 setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
Matt Arsenaulte9524f12018-06-06 21:28:11 +0000678
679 setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
680 setOperationAction(ISD::FABS, MVT::v2f16, Custom);
Matt Arsenault4a486232017-04-19 20:53:07 +0000681 }
682
683 for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
684 setOperationAction(ISD::SELECT, VT, Custom);
Matt Arsenault7596f132017-02-27 20:52:10 +0000685 }
686
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +0000687 setTargetDAGCombine(ISD::ADD);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +0000688 setTargetDAGCombine(ISD::ADDCARRY);
689 setTargetDAGCombine(ISD::SUB);
690 setTargetDAGCombine(ISD::SUBCARRY);
Matt Arsenault02cb0ff2014-09-29 14:59:34 +0000691 setTargetDAGCombine(ISD::FADD);
Matt Arsenault8675db12014-08-29 16:01:14 +0000692 setTargetDAGCombine(ISD::FSUB);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +0000693 setTargetDAGCombine(ISD::FMINNUM);
694 setTargetDAGCombine(ISD::FMAXNUM);
Matt Arsenault687ec752018-10-22 16:27:27 +0000695 setTargetDAGCombine(ISD::FMINNUM_IEEE);
696 setTargetDAGCombine(ISD::FMAXNUM_IEEE);
Farhana Aleenc370d7b2018-07-16 18:19:59 +0000697 setTargetDAGCombine(ISD::FMA);
Matt Arsenault5881f4e2015-06-09 00:52:37 +0000698 setTargetDAGCombine(ISD::SMIN);
699 setTargetDAGCombine(ISD::SMAX);
700 setTargetDAGCombine(ISD::UMIN);
701 setTargetDAGCombine(ISD::UMAX);
Tom Stellard75aadc22012-12-11 21:25:42 +0000702 setTargetDAGCombine(ISD::SETCC);
Matt Arsenaultd0101a22015-01-06 23:00:46 +0000703 setTargetDAGCombine(ISD::AND);
Matt Arsenaultf2290332015-01-06 23:00:39 +0000704 setTargetDAGCombine(ISD::OR);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +0000705 setTargetDAGCombine(ISD::XOR);
Konstantin Zhuravlyovfda33ea2016-10-21 22:10:03 +0000706 setTargetDAGCombine(ISD::SINT_TO_FP);
Matt Arsenault364a6742014-06-11 17:50:44 +0000707 setTargetDAGCombine(ISD::UINT_TO_FP);
Matt Arsenault9cd90712016-04-14 01:42:16 +0000708 setTargetDAGCombine(ISD::FCANONICALIZE);
Matt Arsenaulteb522e62017-02-27 22:15:25 +0000709 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
Matt Arsenault8edfaee2017-03-31 19:53:03 +0000710 setTargetDAGCombine(ISD::ZERO_EXTEND);
Ryan Taylor00e063a2019-03-19 16:07:00 +0000711 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +0000712 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +0000713 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
Matt Arsenault364a6742014-06-11 17:50:44 +0000714
Matt Arsenaultb2baffa2014-08-15 17:49:05 +0000715 // All memory operations. Some folding on the pointer operand is done to help
716 // match the constant offsets in the addressing modes.
717 setTargetDAGCombine(ISD::LOAD);
718 setTargetDAGCombine(ISD::STORE);
719 setTargetDAGCombine(ISD::ATOMIC_LOAD);
720 setTargetDAGCombine(ISD::ATOMIC_STORE);
721 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
722 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
723 setTargetDAGCombine(ISD::ATOMIC_SWAP);
724 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
725 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
726 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
727 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
728 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
729 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
730 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
731 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
732 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
733 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
Matt Arsenaulta5840c32019-01-22 18:36:06 +0000734 setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +0000735
Christian Konigeecebd02013-03-26 14:04:02 +0000736 setSchedulingPreference(Sched::RegPressure);
Tom Stellard75aadc22012-12-11 21:25:42 +0000737}
738
Tom Stellard5bfbae52018-07-11 20:59:01 +0000739const GCNSubtarget *SITargetLowering::getSubtarget() const {
Tom Stellardc5a154d2018-06-28 23:47:12 +0000740 return Subtarget;
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000741}
742
Tom Stellard0125f2a2013-06-25 02:39:35 +0000743//===----------------------------------------------------------------------===//
744// TargetLowering queries
745//===----------------------------------------------------------------------===//
746
Tom Stellardb12f4de2018-05-22 19:37:55 +0000747// v_mad_mix* support a conversion from f16 to f32.
748//
749// There is only one special case, when denormals are enabled, where this would
750// still be OK to use, but we don't currently handle it.
751bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
752 EVT DestVT, EVT SrcVT) const {
753 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
754 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
755 DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
756 SrcVT.getScalarType() == MVT::f16;
757}
758
Zvi Rackover1b736822017-07-26 08:06:58 +0000759bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
Matt Arsenault7dc01c92017-03-15 23:15:12 +0000760 // SI has some legal vector types, but no legal vector operations. Say no
761 // shuffles are legal in order to prefer scalarizing some vector operations.
762 return false;
763}
764
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000765MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
766 CallingConv::ID CC,
767 EVT VT) const {
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000768 // TODO: Consider splitting all arguments into 32-bit pieces.
769 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000770 EVT ScalarVT = VT.getScalarType();
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000771 unsigned Size = ScalarVT.getSizeInBits();
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000772 if (Size == 32)
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000773 return ScalarVT.getSimpleVT();
Matt Arsenault0395da72018-07-31 19:17:47 +0000774
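  // Scalar 64-bit element types are split into pairs of 32-bit registers.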
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000775 if (Size == 64)
776 return MVT::i32;
777
Matt Arsenault57b59662018-09-10 11:49:23 +0000778 if (Size == 16 && Subtarget->has16BitInsts())
Matt Arsenault0395da72018-07-31 19:17:47 +0000779 return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000780 }
781
782 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
783}
784
785unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
786 CallingConv::ID CC,
787 EVT VT) const {
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000788 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
Matt Arsenault0395da72018-07-31 19:17:47 +0000789 unsigned NumElts = VT.getVectorNumElements();
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000790 EVT ScalarVT = VT.getScalarType();
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000791 unsigned Size = ScalarVT.getSizeInBits();
Matt Arsenault0395da72018-07-31 19:17:47 +0000792
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000793 if (Size == 32)
Matt Arsenault0395da72018-07-31 19:17:47 +0000794 return NumElts;
795
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000796 if (Size == 64)
797 return 2 * NumElts;
798
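  // 16-bit elements are packed two per 32-bit register, so round up when the
  // element count is odd.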
Matt Arsenault57b59662018-09-10 11:49:23 +0000799 if (Size == 16 && Subtarget->has16BitInsts())
800 return (VT.getVectorNumElements() + 1) / 2;
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000801 }
802
803 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
804}
805
806unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
807 LLVMContext &Context, CallingConv::ID CC,
808 EVT VT, EVT &IntermediateVT,
809 unsigned &NumIntermediates, MVT &RegisterVT) const {
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000810 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
Matt Arsenault0395da72018-07-31 19:17:47 +0000811 unsigned NumElts = VT.getVectorNumElements();
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000812 EVT ScalarVT = VT.getScalarType();
Matt Arsenault9ced1e02018-07-31 19:05:14 +0000813 unsigned Size = ScalarVT.getSizeInBits();
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000814 if (Size == 32) {
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000815 RegisterVT = ScalarVT.getSimpleVT();
816 IntermediateVT = RegisterVT;
Matt Arsenault0395da72018-07-31 19:17:47 +0000817 NumIntermediates = NumElts;
818 return NumIntermediates;
819 }
820
Matt Arsenaultfeedabf2018-07-31 19:29:04 +0000821 if (Size == 64) {
822 RegisterVT = MVT::i32;
823 IntermediateVT = RegisterVT;
824 NumIntermediates = 2 * NumElts;
825 return NumIntermediates;
826 }
827
Matt Arsenault0395da72018-07-31 19:17:47 +0000828 // FIXME: We should fix the ABI to be the same on targets without 16-bit
829 // support, but unless we can properly handle 3-vectors, it will still be
830 // inconsistent.
Matt Arsenault57b59662018-09-10 11:49:23 +0000831 if (Size == 16 && Subtarget->has16BitInsts()) {
Matt Arsenault0395da72018-07-31 19:17:47 +0000832 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
833 IntermediateVT = RegisterVT;
Matt Arsenault57b59662018-09-10 11:49:23 +0000834 NumIntermediates = (NumElts + 1) / 2;
Matt Arsenault8f9dde92018-07-28 14:11:34 +0000835 return NumIntermediates;
836 }
837 }
838
839 return TargetLowering::getVectorTypeBreakdownForCallingConv(
840 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
841}
842
David Stuttardf77079f2019-01-14 11:55:24 +0000843static MVT memVTFromAggregate(Type *Ty) {
844 // Only limited forms of aggregate type are currently expected.
845 assert(Ty->isStructTy() && "Expected struct type");
846
847
848 Type *ElementType = nullptr;
849 unsigned NumElts;
850 if (Ty->getContainedType(0)->isVectorTy()) {
851 VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
852 ElementType = VecComponent->getElementType();
853 NumElts = VecComponent->getNumElements();
854 } else {
855 ElementType = Ty->getContainedType(0);
856 NumElts = 1;
857 }
858
859 assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
860
861 // Calculate the size of the memVT type from the aggregate
862 unsigned Pow2Elts = 0;
863 unsigned ElementSize;
864 switch (ElementType->getTypeID()) {
865 default:
866 llvm_unreachable("Unknown type!");
867 case Type::IntegerTyID:
868 ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
869 break;
870 case Type::HalfTyID:
871 ElementSize = 16;
872 break;
873 case Type::FloatTyID:
874 ElementSize = 32;
875 break;
876 }
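  // Reserve room for the extra i32 member of the struct (two extra elements
  // when the data elements are 16-bit wide) and round the total element count
  // up to a power of two.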
877 unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
878 Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
879
880 return MVT::getVectorVT(MVT::getVT(ElementType, false),
881 Pow2Elts);
882}
883
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000884bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
885 const CallInst &CI,
Matt Arsenault7d7adf42017-12-14 22:34:10 +0000886 MachineFunction &MF,
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000887 unsigned IntrID) const {
Nicolai Haehnle5d0d3032018-04-01 17:09:07 +0000888 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
Nicolai Haehnlee741d7e2018-06-21 13:36:33 +0000889 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
Nicolai Haehnle5d0d3032018-04-01 17:09:07 +0000890 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
891 (Intrinsic::ID)IntrID);
892 if (Attr.hasFnAttribute(Attribute::ReadNone))
893 return false;
894
895 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
896
897 if (RsrcIntr->IsImage) {
898 Info.ptrVal = MFI->getImagePSV(
Tom Stellard5bfbae52018-07-11 20:59:01 +0000899 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
Nicolai Haehnle5d0d3032018-04-01 17:09:07 +0000900 CI.getArgOperand(RsrcIntr->RsrcArg));
901 Info.align = 0;
902 } else {
903 Info.ptrVal = MFI->getBufferPSV(
Tom Stellard5bfbae52018-07-11 20:59:01 +0000904 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
Nicolai Haehnle5d0d3032018-04-01 17:09:07 +0000905 CI.getArgOperand(RsrcIntr->RsrcArg));
906 }
907
908 Info.flags = MachineMemOperand::MODereferenceable;
909 if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
910 Info.opc = ISD::INTRINSIC_W_CHAIN;
David Stuttardf77079f2019-01-14 11:55:24 +0000911 Info.memVT = MVT::getVT(CI.getType(), true);
912 if (Info.memVT == MVT::Other) {
913 // Some intrinsics return an aggregate type - special case to work out
914 // the correct memVT
915 Info.memVT = memVTFromAggregate(CI.getType());
916 }
Nicolai Haehnle5d0d3032018-04-01 17:09:07 +0000917 Info.flags |= MachineMemOperand::MOLoad;
918 } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
919 Info.opc = ISD::INTRINSIC_VOID;
920 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
921 Info.flags |= MachineMemOperand::MOStore;
922 } else {
923 // Atomic
924 Info.opc = ISD::INTRINSIC_W_CHAIN;
925 Info.memVT = MVT::getVT(CI.getType());
926 Info.flags = MachineMemOperand::MOLoad |
927 MachineMemOperand::MOStore |
928 MachineMemOperand::MODereferenceable;
929
930 // XXX - Should this be volatile without known ordering?
931 Info.flags |= MachineMemOperand::MOVolatile;
932 }
933 return true;
934 }
935
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000936 switch (IntrID) {
937 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +0000938 case Intrinsic::amdgcn_atomic_dec:
Marek Olsakc5cec5e2019-01-16 15:43:53 +0000939 case Intrinsic::amdgcn_ds_ordered_add:
940 case Intrinsic::amdgcn_ds_ordered_swap:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +0000941 case Intrinsic::amdgcn_ds_fadd:
942 case Intrinsic::amdgcn_ds_fmin:
943 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000944 Info.opc = ISD::INTRINSIC_W_CHAIN;
945 Info.memVT = MVT::getVT(CI.getType());
946 Info.ptrVal = CI.getOperand(0);
947 Info.align = 0;
Matt Arsenault11171332017-12-14 21:39:51 +0000948 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
Matt Arsenault79f837c2017-03-30 22:21:40 +0000949
Matt Arsenaultcaf13162019-03-12 21:02:54 +0000950 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
951 if (!Vol->isZero())
Matt Arsenault11171332017-12-14 21:39:51 +0000952 Info.flags |= MachineMemOperand::MOVolatile;
953
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000954 return true;
Matt Arsenault79f837c2017-03-30 22:21:40 +0000955 }
Matt Arsenaultcdd191d2019-01-28 20:14:49 +0000956 case Intrinsic::amdgcn_ds_append:
957 case Intrinsic::amdgcn_ds_consume: {
958 Info.opc = ISD::INTRINSIC_W_CHAIN;
959 Info.memVT = MVT::getVT(CI.getType());
960 Info.ptrVal = CI.getOperand(0);
961 Info.align = 0;
962 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
Matt Arsenault905f3512017-12-29 17:18:14 +0000963
Matt Arsenaultcaf13162019-03-12 21:02:54 +0000964 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
965 if (!Vol->isZero())
Matt Arsenaultcdd191d2019-01-28 20:14:49 +0000966 Info.flags |= MachineMemOperand::MOVolatile;
967
968 return true;
969 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +0000970 default:
971 return false;
972 }
973}
974
Matt Arsenault7dc01c92017-03-15 23:15:12 +0000975bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
976 SmallVectorImpl<Value*> &Ops,
977 Type *&AccessTy) const {
978 switch (II->getIntrinsicID()) {
979 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +0000980 case Intrinsic::amdgcn_atomic_dec:
Marek Olsakc5cec5e2019-01-16 15:43:53 +0000981 case Intrinsic::amdgcn_ds_ordered_add:
982 case Intrinsic::amdgcn_ds_ordered_swap:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +0000983 case Intrinsic::amdgcn_ds_fadd:
984 case Intrinsic::amdgcn_ds_fmin:
985 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenault7dc01c92017-03-15 23:15:12 +0000986 Value *Ptr = II->getArgOperand(0);
987 AccessTy = II->getType();
988 Ops.push_back(Ptr);
989 return true;
990 }
991 default:
992 return false;
993 }
Matt Arsenaulte306a322014-10-21 16:25:08 +0000994}
995
Tom Stellard70580f82015-07-20 14:28:41 +0000996bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
Matt Arsenaultd9b77842017-06-12 17:06:35 +0000997 if (!Subtarget->hasFlatInstOffsets()) {
998 // Flat instructions do not have offsets, and only have the register
999 // address.
1000 return AM.BaseOffs == 0 && AM.Scale == 0;
1001 }
1002
1003 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
1004 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
1005
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00001006 // GFX10 shrank the signed offset to 12 bits. When using regular flat
1007 // instructions, the sign bit is also ignored and the offset is treated as an
1008 // 11-bit unsigned offset.
1009
1010 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
1011 return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
1012
Matt Arsenaultd9b77842017-06-12 17:06:35 +00001013 // Just r + i
1014 return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
Tom Stellard70580f82015-07-20 14:28:41 +00001015}
1016
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001017bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1018 if (Subtarget->hasFlatGlobalInsts())
1019 return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
1020
1021 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1022 // Assume that we will use FLAT for all global memory accesses
1023 // on VI.
1024 // FIXME: This assumption is currently wrong. On VI we still use
1025 // MUBUF instructions for the r + i addressing mode. As currently
1026 // implemented, the MUBUF instructions only work on buffer < 4GB.
1027 // It may be possible to support > 4GB buffers with MUBUF instructions,
1028 // by setting the stride value in the resource descriptor which would
1029 // increase the size limit to (stride * 4GB). However, this is risky,
1030 // because it has never been validated.
1031 return isLegalFlatAddressingMode(AM);
1032 }
1033
1034 return isLegalMUBUFAddressingMode(AM);
1035}
1036
Matt Arsenault711b3902015-08-07 20:18:34 +00001037bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1038 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1039 // additionally can do r + r + i with addr64. 32-bit has more addressing
1040 // mode options. Depending on the resource constant, it can also do
1041 // (i64 r0) + (i32 r1) * (i14 i).
1042 //
1043 // Private arrays end up using a scratch buffer most of the time, so also
1044 // assume those use MUBUF instructions. Scratch loads / stores are currently
1045  // implemented as MUBUF instructions with the offen bit set, so they are
1046  // slightly different from the normal addr64.
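  // Rough example of the 12-bit range checked below: a base offset of 4092
  // fits, while 4096 does not and cannot be folded into the immediate field
  // (illustrative only).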
1047 if (!isUInt<12>(AM.BaseOffs))
1048 return false;
1049
1050 // FIXME: Since we can split immediate into soffset and immediate offset,
1051 // would it make sense to allow any immediate?
1052
1053 switch (AM.Scale) {
1054 case 0: // r + i or just i, depending on HasBaseReg.
1055 return true;
1056 case 1:
1057 return true; // We have r + r or r + i.
1058 case 2:
1059 if (AM.HasBaseReg) {
1060 // Reject 2 * r + r.
1061 return false;
1062 }
1063
1064    // Allow 2 * r as r + r,
1065    // and 2 * r + i as r + r + i.
1066 return true;
1067 default: // Don't allow n * r
1068 return false;
1069 }
1070}
1071
Mehdi Amini0cdec1e2015-07-09 02:09:40 +00001072bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1073 const AddrMode &AM, Type *Ty,
Jonas Paulsson024e3192017-07-21 11:59:37 +00001074 unsigned AS, Instruction *I) const {
Matt Arsenault5015a892014-08-15 17:17:07 +00001075 // No global is ever allowed as a base.
1076 if (AM.BaseGV)
1077 return false;
1078
Matt Arsenault0da63502018-08-31 05:49:54 +00001079 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001080 return isLegalGlobalAddressingMode(AM);
Matt Arsenault5015a892014-08-15 17:17:07 +00001081
Matt Arsenault0da63502018-08-31 05:49:54 +00001082 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
Neil Henning523dab02019-03-18 14:44:28 +00001083 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1084 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001085 // If the offset isn't a multiple of 4, it probably isn't going to be
1086 // correctly aligned.
Matt Arsenault3cc1e002016-08-13 01:43:51 +00001087 // FIXME: Can we get the real alignment here?
Matt Arsenault711b3902015-08-07 20:18:34 +00001088 if (AM.BaseOffs % 4 != 0)
1089 return isLegalMUBUFAddressingMode(AM);
1090
1091 // There are no SMRD extloads, so if we have to do a small type access we
1092 // will use a MUBUF load.
1093 // FIXME?: We also need to do this if unaligned, but we don't know the
1094 // alignment here.
Stanislav Mekhanoshin57d341c2018-05-15 22:07:51 +00001095 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001096 return isLegalGlobalAddressingMode(AM);
Matt Arsenault711b3902015-08-07 20:18:34 +00001097
Tom Stellard5bfbae52018-07-11 20:59:01 +00001098 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001099 // SMRD instructions have an 8-bit, dword offset on SI.
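      // e.g. a dword-aligned byte offset up to 1020 (255 dwords) is
      // representable here (illustrative).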
1100 if (!isUInt<8>(AM.BaseOffs / 4))
1101 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001102 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001103 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1104 // in 8-bits, it can use a smaller encoding.
1105 if (!isUInt<32>(AM.BaseOffs / 4))
1106 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001107 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001108 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1109 if (!isUInt<20>(AM.BaseOffs))
1110 return false;
1111 } else
1112 llvm_unreachable("unhandled generation");
1113
1114 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1115 return true;
1116
1117 if (AM.Scale == 1 && AM.HasBaseReg)
1118 return true;
1119
1120 return false;
Matt Arsenault711b3902015-08-07 20:18:34 +00001121
Matt Arsenault0da63502018-08-31 05:49:54 +00001122 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001123 return isLegalMUBUFAddressingMode(AM);
Matt Arsenault0da63502018-08-31 05:49:54 +00001124 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1125 AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001126 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1127 // field.
1128 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1129 // an 8-bit dword offset but we don't know the alignment here.
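    // Roughly: immediate byte offsets from 0 to 65535 can be folded, so an
    // LDS access at base+4096 folds its offset while base+70000 does not
    // (illustrative).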
1130 if (!isUInt<16>(AM.BaseOffs))
Matt Arsenault5015a892014-08-15 17:17:07 +00001131 return false;
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001132
1133 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1134 return true;
1135
1136 if (AM.Scale == 1 && AM.HasBaseReg)
1137 return true;
1138
Matt Arsenault5015a892014-08-15 17:17:07 +00001139 return false;
Matt Arsenault0da63502018-08-31 05:49:54 +00001140 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1141 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
Matt Arsenault7d1b6c82016-04-29 06:25:10 +00001142    // For an unknown address space, this usually means the pointer is being
1143    // used for pure arithmetic rather than an actual addressing
1144    // computation. We don't have instructions that compute pointers with any
1145    // addressing modes, so treat them as having no offset, like flat
1146    // instructions.
Tom Stellard70580f82015-07-20 14:28:41 +00001147 return isLegalFlatAddressingMode(AM);
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00001148 } else {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001149 llvm_unreachable("unhandled address space");
1150 }
Matt Arsenault5015a892014-08-15 17:17:07 +00001151}
1152
Nirav Dave4dcad5d2017-07-10 20:25:54 +00001153bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1154 const SelectionDAG &DAG) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001155 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
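    // i.e. merged global/flat stores are capped at 128 bits, so e.g. four
    // adjacent i32 stores may become a single dwordx4 store (illustrative).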
Nirav Daved20066c2017-05-24 15:59:09 +00001156 return (MemVT.getSizeInBits() <= 4 * 32);
Matt Arsenault0da63502018-08-31 05:49:54 +00001157 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001158 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1159 return (MemVT.getSizeInBits() <= MaxPrivateBits);
Matt Arsenault0da63502018-08-31 05:49:54 +00001160 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001161 return (MemVT.getSizeInBits() <= 2 * 32);
1162 }
1163 return true;
1164}
1165
Matt Arsenaulte6986632015-01-14 01:35:22 +00001166bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001167 unsigned AddrSpace,
1168 unsigned Align,
1169 bool *IsFast) const {
Matt Arsenault1018c892014-04-24 17:08:26 +00001170 if (IsFast)
1171 *IsFast = false;
1172
Matt Arsenault1018c892014-04-24 17:08:26 +00001173 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1174 // which isn't a simple VT.
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001175 // Until MVT is extended to handle this, simply check for the size and
1176 // rely on the condition below: allow accesses if the size is a multiple of 4.
1177  if (VT == MVT::Other ||
1178      (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)) {
Tom Stellard81d871d2013-11-13 23:36:50 +00001179 return false;
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001180 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001181
Matt Arsenault0da63502018-08-31 05:49:54 +00001182 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1183 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001184 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1185 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1186 // with adjacent offsets.
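    // For example, an 8-byte load at a 4-byte aligned address can be emitted
    // as ds_read2_b32 with consecutive dword offsets (a sketch of the intent,
    // not necessarily the exact lowering).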
Sanjay Patelce74db92015-09-03 15:03:19 +00001187 bool AlignedBy4 = (Align % 4 == 0);
1188 if (IsFast)
1189 *IsFast = AlignedBy4;
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001190
Sanjay Patelce74db92015-09-03 15:03:19 +00001191 return AlignedBy4;
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001192 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001193
Tom Stellard64a9d082016-10-14 18:10:39 +00001194 // FIXME: We have to be conservative here and assume that flat operations
1195 // will access scratch. If we had access to the IR function, then we
1196 // could determine if any private memory was used in the function.
1197 if (!Subtarget->hasUnalignedScratchAccess() &&
Matt Arsenault0da63502018-08-31 05:49:54 +00001198 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1199 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
Matt Arsenaultf4320112018-09-24 13:18:15 +00001200 bool AlignedBy4 = Align >= 4;
1201 if (IsFast)
1202 *IsFast = AlignedBy4;
1203
1204 return AlignedBy4;
Tom Stellard64a9d082016-10-14 18:10:39 +00001205 }
1206
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001207 if (Subtarget->hasUnalignedBufferAccess()) {
1208    // If we have a uniform constant load, it still requires using a slow
1209 // buffer instruction if unaligned.
1210 if (IsFast) {
Matt Arsenault0da63502018-08-31 05:49:54 +00001211 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1212 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001213 (Align % 4 == 0) : true;
1214 }
1215
1216 return true;
1217 }
1218
Tom Stellard33e64c62015-02-04 20:49:52 +00001219  // Values smaller than a dword must be aligned.
Tom Stellard33e64c62015-02-04 20:49:52 +00001220 if (VT.bitsLT(MVT::i32))
1221 return false;
1222
Matt Arsenault1018c892014-04-24 17:08:26 +00001223 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1224 // byte-address are ignored, thus forcing Dword alignment.
Tom Stellarde812f2f2014-07-21 15:45:06 +00001225 // This applies to private, global, and constant memory.
Matt Arsenault1018c892014-04-24 17:08:26 +00001226 if (IsFast)
1227 *IsFast = true;
Tom Stellardc6b299c2015-02-02 18:02:28 +00001228
1229 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
Tom Stellard0125f2a2013-06-25 02:39:35 +00001230}
1231
Sjoerd Meijer180f1ae2019-04-30 08:38:12 +00001232EVT SITargetLowering::getOptimalMemOpType(
1233 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
1234 bool ZeroMemset, bool MemcpyStrSrc,
1235 const AttributeList &FuncAttributes) const {
Matt Arsenault46645fa2014-07-28 17:49:26 +00001236 // FIXME: Should account for address space here.
1237
1238 // The default fallback uses the private pointer size as a guess for a type to
1239 // use. Make sure we switch these to 64-bit accesses.
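  // Illustrative behavior of the checks below: a 16-byte memcpy with a 4-byte
  // aligned destination is lowered with v4i32 accesses, an 8-byte one with
  // v2i32, and anything smaller falls back to the generic choice.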
1240
1241 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1242 return MVT::v4i32;
1243
1244 if (Size >= 8 && DstAlign >= 4)
1245 return MVT::v2i32;
1246
1247 // Use the default.
1248 return MVT::Other;
1249}
1250
Matt Arsenault0da63502018-08-31 05:49:54 +00001251static bool isFlatGlobalAddrSpace(unsigned AS) {
1252 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1253 AS == AMDGPUAS::FLAT_ADDRESS ||
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001254 AS == AMDGPUAS::CONSTANT_ADDRESS ||
1255 AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001256}
1257
1258bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1259 unsigned DestAS) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001260 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001261}
1262
Alexander Timofeev18009562016-12-08 17:28:47 +00001263bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1264 const MemSDNode *MemNode = cast<MemSDNode>(N);
1265 const Value *Ptr = MemNode->getMemOperand()->getValue();
Matt Arsenault0a0c8712018-03-27 18:39:45 +00001266 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
Alexander Timofeev18009562016-12-08 17:28:47 +00001267 return I && I->getMetadata("amdgpu.noclobber");
1268}
1269
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001270bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1271 unsigned DestAS) const {
1272 // Flat -> private/local is a simple truncate.
1273  // Flat -> global is a no-op.
Matt Arsenault0da63502018-08-31 05:49:54 +00001274 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001275 return true;
1276
1277 return isNoopAddrSpaceCast(SrcAS, DestAS);
1278}
1279
Tom Stellarda6f24c62015-12-15 20:55:55 +00001280bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1281 const MemSDNode *MemNode = cast<MemSDNode>(N);
Tom Stellarda6f24c62015-12-15 20:55:55 +00001282
Matt Arsenaultbcf7bec2018-02-09 16:57:48 +00001283 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
Tom Stellarda6f24c62015-12-15 20:55:55 +00001284}
1285
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001286TargetLoweringBase::LegalizeTypeAction
Craig Topper0b5f8162018-11-05 23:26:13 +00001287SITargetLowering::getPreferredVectorAction(MVT VT) const {
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001288 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1289 return TypeSplitVector;
1290
1291 return TargetLoweringBase::getPreferredVectorAction(VT);
Tom Stellardd86003e2013-08-14 23:25:00 +00001292}
Tom Stellard0125f2a2013-06-25 02:39:35 +00001293
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001294bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1295 Type *Ty) const {
Matt Arsenault749035b2016-07-30 01:40:36 +00001296 // FIXME: Could be smarter if called for vector constants.
1297 return true;
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001298}
1299
Tom Stellard2e045bb2016-01-20 00:13:22 +00001300bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001301 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1302 switch (Op) {
1303 case ISD::LOAD:
1304 case ISD::STORE:
Tom Stellard2e045bb2016-01-20 00:13:22 +00001305
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001306 // These operations are done with 32-bit instructions anyway.
1307 case ISD::AND:
1308 case ISD::OR:
1309 case ISD::XOR:
1310 case ISD::SELECT:
1311 // TODO: Extensions?
1312 return true;
1313 default:
1314 return false;
1315 }
1316 }
Konstantin Zhuravlyove14df4b2016-09-28 20:05:39 +00001317
Tom Stellard2e045bb2016-01-20 00:13:22 +00001318 // SimplifySetCC uses this function to determine whether or not it should
1319 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1320 if (VT == MVT::i1 && Op == ISD::SETCC)
1321 return false;
1322
1323 return TargetLowering::isTypeDesirableForOp(Op, VT);
1324}
1325
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001326SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1327 const SDLoc &SL,
1328 SDValue Chain,
1329 uint64_t Offset) const {
Mehdi Aminia749f2a2015-07-09 02:09:52 +00001330 const DataLayout &DL = DAG.getDataLayout();
Tom Stellardec2e43c2014-09-22 15:35:29 +00001331 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001332 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1333
1334 const ArgDescriptor *InputPtrReg;
1335 const TargetRegisterClass *RC;
1336
1337 std::tie(InputPtrReg, RC)
1338 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Tom Stellard94593ee2013-06-03 17:40:18 +00001339
Matt Arsenault86033ca2014-07-28 17:31:39 +00001340 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
Matt Arsenault0da63502018-08-31 05:49:54 +00001341 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulta0269b62015-06-01 21:58:24 +00001342 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001343 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1344
Matt Arsenault2fb9ccf2018-05-29 17:42:38 +00001345 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
Jan Veselyfea814d2016-06-21 20:46:20 +00001346}
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00001347
Matt Arsenault9166ce82017-07-28 15:52:08 +00001348SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1349 const SDLoc &SL) const {
Matt Arsenault75e71922018-06-28 10:18:55 +00001350 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1351 FIRST_IMPLICIT);
Matt Arsenault9166ce82017-07-28 15:52:08 +00001352 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1353}
1354
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001355SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1356 const SDLoc &SL, SDValue Val,
1357 bool Signed,
Matt Arsenault6dca5422017-01-09 18:52:39 +00001358 const ISD::InputArg *Arg) const {
Tim Renouf361b5b22019-03-21 12:01:21 +00001359 // First, if it is a widened vector, narrow it.
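  // (e.g. a v3f32 argument that was widened to v4f32 for the in-memory type is
  // narrowed back to its original three elements here; illustrative.)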
1360 if (VT.isVector() &&
1361 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1362 EVT NarrowedVT =
1363 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1364 VT.getVectorNumElements());
1365 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1366 DAG.getConstant(0, SL, MVT::i32));
1367 }
1368
1369 // Then convert the vector elements or scalar value.
Matt Arsenault6dca5422017-01-09 18:52:39 +00001370 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1371 VT.bitsLT(MemVT)) {
1372 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1373 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1374 }
1375
Tom Stellardbc6c5232016-10-17 16:21:45 +00001376 if (MemVT.isFloatingPoint())
Matt Arsenault6dca5422017-01-09 18:52:39 +00001377 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001378 else if (Signed)
Matt Arsenault6dca5422017-01-09 18:52:39 +00001379 Val = DAG.getSExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001380 else
Matt Arsenault6dca5422017-01-09 18:52:39 +00001381 Val = DAG.getZExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001382
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001383 return Val;
1384}
1385
1386SDValue SITargetLowering::lowerKernargMemParameter(
1387 SelectionDAG &DAG, EVT VT, EVT MemVT,
1388 const SDLoc &SL, SDValue Chain,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001389 uint64_t Offset, unsigned Align, bool Signed,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001390 const ISD::InputArg *Arg) const {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001391 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00001392 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001393 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1394
Matt Arsenault90083d32018-06-07 09:54:49 +00001395 // Try to avoid using an extload by loading earlier than the argument address,
1396 // and extracting the relevant bits. The load should hopefully be merged with
1397 // the previous argument.
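  // Sketch of the idea: an i16 argument at byte offset 6 would be fetched by
  // loading the dword at offset 4 and shifting right by 16 bits before
  // truncating (illustrative).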
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001398 if (MemVT.getStoreSize() < 4 && Align < 4) {
1399 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
Matt Arsenault90083d32018-06-07 09:54:49 +00001400 int64_t AlignDownOffset = alignDown(Offset, 4);
1401 int64_t OffsetDiff = Offset - AlignDownOffset;
1402
1403 EVT IntVT = MemVT.changeTypeToInteger();
1404
1405 // TODO: If we passed in the base kernel offset we could have a better
1406 // alignment than 4, but we don't really need it.
1407 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1408 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1409 MachineMemOperand::MODereferenceable |
1410 MachineMemOperand::MOInvariant);
1411
1412 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1413 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1414
1415 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1416 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1417 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1418
1419
1420 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1421 }
1422
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001423 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1424 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001425 MachineMemOperand::MODereferenceable |
1426 MachineMemOperand::MOInvariant);
1427
1428 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
Matt Arsenault6dca5422017-01-09 18:52:39 +00001429 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
Tom Stellard94593ee2013-06-03 17:40:18 +00001430}
1431
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001432SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1433 const SDLoc &SL, SDValue Chain,
1434 const ISD::InputArg &Arg) const {
1435 MachineFunction &MF = DAG.getMachineFunction();
1436 MachineFrameInfo &MFI = MF.getFrameInfo();
1437
1438 if (Arg.Flags.isByVal()) {
1439 unsigned Size = Arg.Flags.getByValSize();
1440 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1441 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1442 }
1443
1444 unsigned ArgOffset = VA.getLocMemOffset();
1445 unsigned ArgSize = VA.getValVT().getStoreSize();
1446
1447 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1448
1449 // Create load nodes to retrieve arguments from the stack.
1450 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1451 SDValue ArgValue;
1452
1453  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1454 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1455 MVT MemVT = VA.getValVT();
1456
1457 switch (VA.getLocInfo()) {
1458 default:
1459 break;
1460 case CCValAssign::BCvt:
1461 MemVT = VA.getLocVT();
1462 break;
1463 case CCValAssign::SExt:
1464 ExtType = ISD::SEXTLOAD;
1465 break;
1466 case CCValAssign::ZExt:
1467 ExtType = ISD::ZEXTLOAD;
1468 break;
1469 case CCValAssign::AExt:
1470 ExtType = ISD::EXTLOAD;
1471 break;
1472 }
1473
1474 ArgValue = DAG.getExtLoad(
1475 ExtType, SL, VA.getLocVT(), Chain, FIN,
1476 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1477 MemVT);
1478 return ArgValue;
1479}
1480
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001481SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1482 const SIMachineFunctionInfo &MFI,
1483 EVT VT,
1484 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1485 const ArgDescriptor *Reg;
1486 const TargetRegisterClass *RC;
1487
1488 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1489 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1490}
1491
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001492static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1493 CallingConv::ID CallConv,
1494 ArrayRef<ISD::InputArg> Ins,
1495 BitVector &Skipped,
1496 FunctionType *FType,
1497 SIMachineFunctionInfo *Info) {
1498 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001499 const ISD::InputArg *Arg = &Ins[I];
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001500
Matt Arsenault55ab9212018-08-01 19:57:34 +00001501 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1502 "vector type argument should have been split");
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001503
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001504 // First check if it's a PS input addr.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001505 if (CallConv == CallingConv::AMDGPU_PS &&
1506 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001507
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001508 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1509
1510 // Inconveniently only the first part of the split is marked as isSplit,
1511 // so skip to the end. We only want to increment PSInputNum once for the
1512 // entire split argument.
1513 if (Arg->Flags.isSplit()) {
1514 while (!Arg->Flags.isSplitEnd()) {
1515 assert(!Arg->VT.isVector() &&
1516 "unexpected vector split in ps argument type");
1517 if (!SkipArg)
1518 Splits.push_back(*Arg);
1519 Arg = &Ins[++I];
1520 }
1521 }
1522
1523 if (SkipArg) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001524 // We can safely skip PS inputs.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001525 Skipped.set(Arg->getOrigArgIndex());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001526 ++PSInputNum;
1527 continue;
1528 }
1529
1530 Info->markPSInputAllocated(PSInputNum);
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001531 if (Arg->Used)
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001532 Info->markPSInputEnabled(PSInputNum);
1533
1534 ++PSInputNum;
1535 }
1536
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001537 Splits.push_back(*Arg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001538 }
1539}
1540
1541// Allocate special inputs passed in VGPRs.
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001542static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1543 MachineFunction &MF,
1544 const SIRegisterInfo &TRI,
1545 SIMachineFunctionInfo &Info) {
1546 if (Info.hasWorkItemIDX()) {
1547 unsigned Reg = AMDGPU::VGPR0;
1548 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001549
1550 CCInfo.AllocateReg(Reg);
1551 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1552 }
1553
1554 if (Info.hasWorkItemIDY()) {
1555 unsigned Reg = AMDGPU::VGPR1;
1556 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1557
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001558 CCInfo.AllocateReg(Reg);
1559 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1560 }
1561
1562 if (Info.hasWorkItemIDZ()) {
1563 unsigned Reg = AMDGPU::VGPR2;
1564 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1565
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001566 CCInfo.AllocateReg(Reg);
1567 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1568 }
1569}
1570
1571// Try to allocate a VGPR at the end of the argument list, or, if no argument
1572// VGPRs are left, allocate a stack slot.
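// (Here the pool is the first 32 argument VGPRs; once they are exhausted, a
// 4-byte stack slot is used instead.)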
1573static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1574 ArrayRef<MCPhysReg> ArgVGPRs
1575 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1576 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1577 if (RegIdx == ArgVGPRs.size()) {
1578 // Spill to stack required.
1579 int64_t Offset = CCInfo.AllocateStack(4, 4);
1580
1581 return ArgDescriptor::createStack(Offset);
1582 }
1583
1584 unsigned Reg = ArgVGPRs[RegIdx];
1585 Reg = CCInfo.AllocateReg(Reg);
1586 assert(Reg != AMDGPU::NoRegister);
1587
1588 MachineFunction &MF = CCInfo.getMachineFunction();
1589 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1590 return ArgDescriptor::createRegister(Reg);
1591}
1592
1593static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1594 const TargetRegisterClass *RC,
1595 unsigned NumArgRegs) {
1596 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1597 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1598 if (RegIdx == ArgSGPRs.size())
1599 report_fatal_error("ran out of SGPRs for arguments");
1600
1601 unsigned Reg = ArgSGPRs[RegIdx];
1602 Reg = CCInfo.AllocateReg(Reg);
1603 assert(Reg != AMDGPU::NoRegister);
1604
1605 MachineFunction &MF = CCInfo.getMachineFunction();
1606 MF.addLiveIn(Reg, RC);
1607 return ArgDescriptor::createRegister(Reg);
1608}
1609
1610static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1611 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1612}
1613
1614static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1615 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1616}
1617
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001618static void allocateSpecialInputVGPRs(CCState &CCInfo,
1619 MachineFunction &MF,
1620 const SIRegisterInfo &TRI,
1621 SIMachineFunctionInfo &Info) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001622 if (Info.hasWorkItemIDX())
1623 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001624
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001625 if (Info.hasWorkItemIDY())
1626 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001627
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001628 if (Info.hasWorkItemIDZ())
1629 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1630}
1631
1632static void allocateSpecialInputSGPRs(CCState &CCInfo,
1633 MachineFunction &MF,
1634 const SIRegisterInfo &TRI,
1635 SIMachineFunctionInfo &Info) {
1636 auto &ArgInfo = Info.getArgInfo();
1637
1638 // TODO: Unify handling with private memory pointers.
1639
1640 if (Info.hasDispatchPtr())
1641 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1642
1643 if (Info.hasQueuePtr())
1644 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1645
1646 if (Info.hasKernargSegmentPtr())
1647 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1648
1649 if (Info.hasDispatchID())
1650 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1651
1652 // flat_scratch_init is not applicable for non-kernel functions.
1653
1654 if (Info.hasWorkGroupIDX())
1655 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1656
1657 if (Info.hasWorkGroupIDY())
1658 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1659
1660 if (Info.hasWorkGroupIDZ())
1661 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
Matt Arsenault817c2532017-08-03 23:12:44 +00001662
1663 if (Info.hasImplicitArgPtr())
1664 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001665}
1666
1667// Allocate special inputs passed in user SGPRs.
1668static void allocateHSAUserSGPRs(CCState &CCInfo,
1669 MachineFunction &MF,
1670 const SIRegisterInfo &TRI,
1671 SIMachineFunctionInfo &Info) {
Matt Arsenault10fc0622017-06-26 03:01:31 +00001672 if (Info.hasImplicitBufferPtr()) {
1673 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1674 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1675 CCInfo.AllocateReg(ImplicitBufferPtrReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001676 }
1677
1678 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1679 if (Info.hasPrivateSegmentBuffer()) {
1680 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1681 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1682 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1683 }
1684
1685 if (Info.hasDispatchPtr()) {
1686 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1687 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1688 CCInfo.AllocateReg(DispatchPtrReg);
1689 }
1690
1691 if (Info.hasQueuePtr()) {
1692 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1693 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1694 CCInfo.AllocateReg(QueuePtrReg);
1695 }
1696
1697 if (Info.hasKernargSegmentPtr()) {
1698 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1699 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1700 CCInfo.AllocateReg(InputPtrReg);
1701 }
1702
1703 if (Info.hasDispatchID()) {
1704 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1705 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1706 CCInfo.AllocateReg(DispatchIDReg);
1707 }
1708
1709 if (Info.hasFlatScratchInit()) {
1710 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1711 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1712 CCInfo.AllocateReg(FlatScratchInitReg);
1713 }
1714
1715 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1716 // these from the dispatch pointer.
1717}
1718
1719// Allocate special input registers that are initialized per-wave.
1720static void allocateSystemSGPRs(CCState &CCInfo,
1721 MachineFunction &MF,
1722 SIMachineFunctionInfo &Info,
Marek Olsak584d2c02017-05-04 22:25:20 +00001723 CallingConv::ID CallConv,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001724 bool IsShader) {
1725 if (Info.hasWorkGroupIDX()) {
1726 unsigned Reg = Info.addWorkGroupIDX();
1727 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1728 CCInfo.AllocateReg(Reg);
1729 }
1730
1731 if (Info.hasWorkGroupIDY()) {
1732 unsigned Reg = Info.addWorkGroupIDY();
1733 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1734 CCInfo.AllocateReg(Reg);
1735 }
1736
1737 if (Info.hasWorkGroupIDZ()) {
1738 unsigned Reg = Info.addWorkGroupIDZ();
1739 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1740 CCInfo.AllocateReg(Reg);
1741 }
1742
1743 if (Info.hasWorkGroupInfo()) {
1744 unsigned Reg = Info.addWorkGroupInfo();
1745 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1746 CCInfo.AllocateReg(Reg);
1747 }
1748
1749 if (Info.hasPrivateSegmentWaveByteOffset()) {
1750 // Scratch wave offset passed in system SGPR.
1751 unsigned PrivateSegmentWaveByteOffsetReg;
1752
1753 if (IsShader) {
Marek Olsak584d2c02017-05-04 22:25:20 +00001754 PrivateSegmentWaveByteOffsetReg =
1755 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1756
1757 // This is true if the scratch wave byte offset doesn't have a fixed
1758 // location.
1759 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1760 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1761 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1762 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001763 } else
1764 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1765
1766 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1767 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1768 }
1769}
1770
1771static void reservePrivateMemoryRegs(const TargetMachine &TM,
1772 MachineFunction &MF,
1773 const SIRegisterInfo &TRI,
Matt Arsenault1cc47f82017-07-18 16:44:56 +00001774 SIMachineFunctionInfo &Info) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001775  // Now that we've figured out where the scratch register inputs are, see if
1776  // we should reserve the arguments and use them directly.
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001777 MachineFrameInfo &MFI = MF.getFrameInfo();
1778 bool HasStackObjects = MFI.hasStackObjects();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001779
1780 // Record that we know we have non-spill stack objects so we don't need to
1781 // check all stack objects later.
1782 if (HasStackObjects)
1783 Info.setHasNonSpillStackObjects(true);
1784
1785 // Everything live out of a block is spilled with fast regalloc, so it's
1786 // almost certain that spilling will be required.
1787 if (TM.getOptLevel() == CodeGenOpt::None)
1788 HasStackObjects = true;
1789
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001790 // For now assume stack access is needed in any callee functions, so we need
1791 // the scratch registers to pass in.
1792 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1793
Tom Stellard5bfbae52018-07-11 20:59:01 +00001794 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00001795 if (ST.isAmdHsaOrMesa(MF.getFunction())) {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001796 if (RequiresStackAccess) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001797 // If we have stack objects, we unquestionably need the private buffer
1798 // resource. For the Code Object V2 ABI, this will be the first 4 user
1799 // SGPR inputs. We can reserve those and use them directly.
1800
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001801 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1802 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001803 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1804
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001805 if (MFI.hasCalls()) {
1806 // If we have calls, we need to keep the frame register in a register
1807 // that won't be clobbered by a call, so ensure it is copied somewhere.
1808
1809 // This is not a problem for the scratch wave offset, because the same
1810 // registers are reserved in all functions.
1811
1812      // FIXME: Nothing really ensures this is a call-preserved register;
1813      // it's just selected from the end, so it happens to be one.
1814 unsigned ReservedOffsetReg
1815 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1816 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1817 } else {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001818 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1819 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001820 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1821 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001822 } else {
1823 unsigned ReservedBufferReg
1824 = TRI.reservedPrivateSegmentBufferReg(MF);
1825 unsigned ReservedOffsetReg
1826 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1827
1828 // We tentatively reserve the last registers (skipping the last two
1829 // which may contain VCC). After register allocation, we'll replace
1830 // these with the ones immediately after those which were really
1831      // allocated. In the prologue, copies will be inserted from the argument
1832 // to these reserved registers.
1833 Info.setScratchRSrcReg(ReservedBufferReg);
1834 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1835 }
1836 } else {
1837 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1838
1839 // Without HSA, relocations are used for the scratch pointer and the
1840 // buffer resource setup is always inserted in the prologue. Scratch wave
1841 // offset is still in an input SGPR.
1842 Info.setScratchRSrcReg(ReservedBufferReg);
1843
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001844 if (HasStackObjects && !MFI.hasCalls()) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001845 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1846 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001847 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1848 } else {
1849 unsigned ReservedOffsetReg
1850 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1851 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1852 }
1853 }
1854}
1855
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001856bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1857 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1858 return !Info->isEntryFunction();
1859}
1860
1861void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1862
1863}
1864
1865void SITargetLowering::insertCopiesSplitCSR(
1866 MachineBasicBlock *Entry,
1867 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1868 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1869
1870 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1871 if (!IStart)
1872 return;
1873
1874 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1875 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1876 MachineBasicBlock::iterator MBBI = Entry->begin();
1877 for (const MCPhysReg *I = IStart; *I; ++I) {
1878 const TargetRegisterClass *RC = nullptr;
1879 if (AMDGPU::SReg_64RegClass.contains(*I))
1880 RC = &AMDGPU::SGPR_64RegClass;
1881 else if (AMDGPU::SReg_32RegClass.contains(*I))
1882 RC = &AMDGPU::SGPR_32RegClass;
1883 else
1884 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1885
1886 unsigned NewVR = MRI->createVirtualRegister(RC);
1887 // Create copy from CSR to a virtual register.
1888 Entry->addLiveIn(*I);
1889 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1890 .addReg(*I);
1891
1892 // Insert the copy-back instructions right before the terminator.
1893 for (auto *Exit : Exits)
1894 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1895 TII->get(TargetOpcode::COPY), *I)
1896 .addReg(NewVR);
1897 }
1898}
1899
Christian Konig2c8f6d52013-03-07 09:03:52 +00001900SDValue SITargetLowering::LowerFormalArguments(
Eric Christopher7792e322015-01-30 23:24:40 +00001901 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001902 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1903 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001904 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001905
1906 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaultceafc552018-05-29 17:42:50 +00001907 const Function &Fn = MF.getFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00001908 FunctionType *FType = MF.getFunction().getFunctionType();
Christian Konig99ee0f42013-03-07 09:04:14 +00001909 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001910
Nicolai Haehnledf3a20c2016-04-06 19:40:20 +00001911 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001912 DiagnosticInfoUnsupported NoGraphicsHSA(
Matthias Braunf1caa282017-12-15 22:22:58 +00001913 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
Matt Arsenaultd48da142015-11-02 23:23:02 +00001914 DAG.getContext()->diagnose(NoGraphicsHSA);
Diana Picus81bc3172016-05-26 15:24:55 +00001915 return DAG.getEntryNode();
Matt Arsenaultd48da142015-11-02 23:23:02 +00001916 }
1917
Christian Konig2c8f6d52013-03-07 09:03:52 +00001918 SmallVector<ISD::InputArg, 16> Splits;
Christian Konig2c8f6d52013-03-07 09:03:52 +00001919 SmallVector<CCValAssign, 16> ArgLocs;
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001920 BitVector Skipped(Ins.size());
Eric Christopherb5217502014-08-06 18:45:26 +00001921 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1922 *DAG.getContext());
Christian Konig2c8f6d52013-03-07 09:03:52 +00001923
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001924 bool IsShader = AMDGPU::isShader(CallConv);
Matt Arsenaultefa9f4b2017-04-11 22:29:28 +00001925 bool IsKernel = AMDGPU::isKernel(CallConv);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001926 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
Christian Konig99ee0f42013-03-07 09:04:14 +00001927
Matt Arsenaultd1867c02017-08-02 00:59:51 +00001928 if (!IsEntryFunc) {
1929 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1930 // this when allocating argument fixed offsets.
1931 CCInfo.AllocateStack(4, 4);
1932 }
1933
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001934 if (IsShader) {
1935 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1936
1937 // At least one interpolation mode must be enabled or else the GPU will
1938 // hang.
1939 //
1940 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1941 // set PSInputAddr, the user wants to enable some bits after the compilation
1942 // based on run-time states. Since we can't know what the final PSInputEna
1943    // will look like, we shouldn't do anything here and the user should take
1944 // responsibility for the correct programming.
1945 //
1946 // Otherwise, the following restrictions apply:
1947 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1948 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1949 // enabled too.
Tim Renoufc8ffffe2017-10-12 16:16:41 +00001950 if (CallConv == CallingConv::AMDGPU_PS) {
1951 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1952 ((Info->getPSInputAddr() & 0xF) == 0 &&
1953 Info->isPSInputAllocated(11))) {
1954 CCInfo.AllocateReg(AMDGPU::VGPR0);
1955 CCInfo.AllocateReg(AMDGPU::VGPR1);
1956 Info->markPSInputAllocated(0);
1957 Info->markPSInputEnabled(0);
1958 }
1959 if (Subtarget->isAmdPalOS()) {
1960 // For isAmdPalOS, the user does not enable some bits after compilation
1961 // based on run-time states; the register values being generated here are
1962 // the final ones set in hardware. Therefore we need to apply the
1963 // workaround to PSInputAddr and PSInputEnable together. (The case where
1964 // a bit is set in PSInputAddr but not PSInputEnable is where the
1965 // frontend set up an input arg for a particular interpolation mode, but
1966 // nothing uses that input arg. Really we should have an earlier pass
1967 // that removes such an arg.)
1968 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1969 if ((PsInputBits & 0x7F) == 0 ||
1970 ((PsInputBits & 0xF) == 0 &&
1971 (PsInputBits >> 11 & 1)))
1972 Info->markPSInputEnabled(
1973 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1974 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001975 }
1976
Tom Stellard2f3f9852017-01-25 01:25:13 +00001977 assert(!Info->hasDispatchPtr() &&
Tom Stellardf110f8f2016-04-14 16:27:03 +00001978 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1979 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1980 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1981 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1982 !Info->hasWorkItemIDZ());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001983 } else if (IsKernel) {
1984 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001985 } else {
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001986 Splits.append(Ins.begin(), Ins.end());
Tom Stellardaf775432013-10-23 00:44:32 +00001987 }
1988
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001989 if (IsEntryFunc) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001990 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001991 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
Tom Stellard2f3f9852017-01-25 01:25:13 +00001992 }
1993
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001994 if (IsKernel) {
Tom Stellardbbeb45a2016-09-16 21:53:00 +00001995 analyzeFormalArgumentsCompute(CCInfo, Ins);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001996 } else {
1997 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1998 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1999 }
Christian Konig2c8f6d52013-03-07 09:03:52 +00002000
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002001 SmallVector<SDValue, 16> Chains;
2002
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002003 // FIXME: This is the minimum kernel argument alignment. We should improve
2004 // this to the maximum alignment of the arguments.
2005 //
2006  // FIXME: Alignment of explicit arguments is totally broken with a non-0
2007  // explicit kernarg offset.
2008 const unsigned KernelArgBaseAlign = 16;
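  // e.g. an i32 argument at kernarg offset 36 gets MinAlign(16, 36) == 4 below
  // (illustrative).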
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002009
2010 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002011 const ISD::InputArg &Arg = Ins[i];
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00002012 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002013 InVals.push_back(DAG.getUNDEF(Arg.VT));
Christian Konig99ee0f42013-03-07 09:04:14 +00002014 continue;
2015 }
2016
Christian Konig2c8f6d52013-03-07 09:03:52 +00002017 CCValAssign &VA = ArgLocs[ArgIdx++];
Craig Topper7f416c82014-11-16 21:17:18 +00002018 MVT VT = VA.getLocVT();
Tom Stellarded882c22013-06-03 17:40:11 +00002019
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002020 if (IsEntryFunc && VA.isMemLoc()) {
Tom Stellardaf775432013-10-23 00:44:32 +00002021 VT = Ins[i].VT;
Tom Stellardbbeb45a2016-09-16 21:53:00 +00002022 EVT MemVT = VA.getLocVT();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002023
Matt Arsenault4bec7d42018-07-20 09:05:08 +00002024 const uint64_t Offset = VA.getLocMemOffset();
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002025 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002026
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002027 SDValue Arg = lowerKernargMemParameter(
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002028 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002029 Chains.push_back(Arg.getValue(1));
Tom Stellardca7ecf32014-08-22 18:49:31 +00002030
Craig Toppere3dcce92015-08-01 22:20:21 +00002031 auto *ParamTy =
Andrew Trick05938a52015-02-16 18:10:47 +00002032 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
Tom Stellard5bfbae52018-07-11 20:59:01 +00002033 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenaultcdd191d2019-01-28 20:14:49 +00002034 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2035 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
Tom Stellardca7ecf32014-08-22 18:49:31 +00002036      // On SI, local pointers are just offsets into LDS, so they always fit in
2037      // 16 bits. On CI and newer they could potentially be
2038 // real pointers, so we can't guarantee their size.
2039 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2040 DAG.getValueType(MVT::i16));
2041 }
2042
Tom Stellarded882c22013-06-03 17:40:11 +00002043 InVals.push_back(Arg);
2044 continue;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002045 } else if (!IsEntryFunc && VA.isMemLoc()) {
2046 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2047 InVals.push_back(Val);
2048 if (!Arg.Flags.isByVal())
2049 Chains.push_back(Val.getValue(1));
2050 continue;
Tom Stellarded882c22013-06-03 17:40:11 +00002051 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002052
Christian Konig2c8f6d52013-03-07 09:03:52 +00002053 assert(VA.isRegLoc() && "Parameter must be in a register!");
2054
2055 unsigned Reg = VA.getLocReg();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002056 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
Matt Arsenaultb3463552017-07-15 05:52:59 +00002057 EVT ValVT = VA.getValVT();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002058
2059 Reg = MF.addLiveIn(Reg, RC);
2060 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2061
Matt Arsenault45b98182017-11-15 00:45:43 +00002062 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2063 // The return object should be reasonably addressable.
2064
2065      // FIXME: This helps when the return is a real sret. If it is an
2066      // automatically inserted sret (i.e. CanLowerReturn returns false), an
2067 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2068 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2069 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2070 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2071 }
2072
Matt Arsenaultb3463552017-07-15 05:52:59 +00002073 // If this is an 8 or 16-bit value, it is really passed promoted
2074 // to 32 bits. Insert an assert[sz]ext to capture this, then
2075 // truncate to the right size.
2076 switch (VA.getLocInfo()) {
2077 case CCValAssign::Full:
2078 break;
2079 case CCValAssign::BCvt:
2080 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2081 break;
2082 case CCValAssign::SExt:
2083 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2084 DAG.getValueType(ValVT));
2085 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2086 break;
2087 case CCValAssign::ZExt:
2088 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2089 DAG.getValueType(ValVT));
2090 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2091 break;
2092 case CCValAssign::AExt:
2093 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2094 break;
2095 default:
2096 llvm_unreachable("Unknown loc info!");
2097 }
2098
Christian Konig2c8f6d52013-03-07 09:03:52 +00002099 InVals.push_back(Val);
2100 }
Tom Stellarde99fb652015-01-20 19:33:04 +00002101
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002102 if (!IsEntryFunc) {
2103 // Special inputs come after user arguments.
2104 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2105 }
2106
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002107 // Start adding system SGPRs.
2108 if (IsEntryFunc) {
2109 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002110 } else {
2111 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2112 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2113 CCInfo.AllocateReg(Info->getFrameOffsetReg());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002114 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002115 }
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002116
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002117 auto &ArgUsageInfo =
2118 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
Matt Arsenaultceafc552018-05-29 17:42:50 +00002119 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002120
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002121 unsigned StackArgSize = CCInfo.getNextStackOffset();
2122 Info->setBytesInStackArgArea(StackArgSize);
2123
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002124 return Chains.empty() ? Chain :
2125 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Christian Konig2c8f6d52013-03-07 09:03:52 +00002126}
2127
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002128// TODO: If return values can't fit in registers, we should return as many as
2129// possible in registers before passing the rest on the stack.
2130bool SITargetLowering::CanLowerReturn(
2131 CallingConv::ID CallConv,
2132 MachineFunction &MF, bool IsVarArg,
2133 const SmallVectorImpl<ISD::OutputArg> &Outs,
2134 LLVMContext &Context) const {
2135 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2136 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2137 // for shaders. Vector types should be explicitly handled by CC.
2138 if (AMDGPU::isEntryFunctionCC(CallConv))
2139 return true;
2140
2141 SmallVector<CCValAssign, 16> RVLocs;
2142 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2143 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2144}
2145
Benjamin Kramerbdc49562016-06-12 15:39:02 +00002146SDValue
2147SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2148 bool isVarArg,
2149 const SmallVectorImpl<ISD::OutputArg> &Outs,
2150 const SmallVectorImpl<SDValue> &OutVals,
2151 const SDLoc &DL, SelectionDAG &DAG) const {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002152 MachineFunction &MF = DAG.getMachineFunction();
2153 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2154
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002155 if (AMDGPU::isKernel(CallConv)) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002156 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2157 OutVals, DL, DAG);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002158 }
2159
2160 bool IsShader = AMDGPU::isShader(CallConv);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002161
Matt Arsenault55ab9212018-08-01 19:57:34 +00002162 Info->setIfReturnsVoid(Outs.empty());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002163 bool IsWaveEnd = Info->returnsVoid() && IsShader;
Marek Olsak8e9cc632016-01-13 17:23:09 +00002164
Marek Olsak8a0f3352016-01-13 17:23:04 +00002165 // CCValAssign - represent the assignment of the return value to a location.
2166 SmallVector<CCValAssign, 48> RVLocs;
Matt Arsenault55ab9212018-08-01 19:57:34 +00002167 SmallVector<ISD::OutputArg, 48> Splits;
Marek Olsak8a0f3352016-01-13 17:23:04 +00002168
2169 // CCState - Info about the registers and stack slots.
2170 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2171 *DAG.getContext());
2172
2173 // Analyze outgoing return values.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002174 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
Marek Olsak8a0f3352016-01-13 17:23:04 +00002175
2176 SDValue Flag;
2177 SmallVector<SDValue, 48> RetOps;
2178 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2179
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002180 // Add return address for callable functions.
2181 if (!Info->isEntryFunction()) {
2182 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2183 SDValue ReturnAddrReg = CreateLiveInRegister(
2184 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2185
2186 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2187    // from being allocated to a CSR.
2188
2189 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2190 MVT::i64);
2191
2192 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2193 Flag = Chain.getValue(1);
2194
2195 RetOps.push_back(PhysReturnAddrReg);
2196 }
2197
Marek Olsak8a0f3352016-01-13 17:23:04 +00002198 // Copy the result values into the output registers.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002199 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2200 ++I, ++RealRVLocIdx) {
2201 CCValAssign &VA = RVLocs[I];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002202 assert(VA.isRegLoc() && "Can only return in registers!");
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002203 // TODO: Partially return in registers if return values don't fit.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002204 SDValue Arg = OutVals[RealRVLocIdx];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002205
2206 // Copied from other backends.
2207 switch (VA.getLocInfo()) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002208 case CCValAssign::Full:
2209 break;
2210 case CCValAssign::BCvt:
2211 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2212 break;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002213 case CCValAssign::SExt:
2214 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2215 break;
2216 case CCValAssign::ZExt:
2217 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2218 break;
2219 case CCValAssign::AExt:
2220 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2221 break;
2222 default:
2223 llvm_unreachable("Unknown loc info!");
Marek Olsak8a0f3352016-01-13 17:23:04 +00002224 }
2225
2226 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2227 Flag = Chain.getValue(1);
2228 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2229 }
2230
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002231 // FIXME: Does sret work properly?
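  // If any callee-saved registers are lowered via copies, add them as extra
  // return operands so they are kept live across the return.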
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002232 if (!Info->isEntryFunction()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00002233 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002234 const MCPhysReg *I =
2235 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2236 if (I) {
2237 for (; *I; ++I) {
2238 if (AMDGPU::SReg_64RegClass.contains(*I))
2239 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2240 else if (AMDGPU::SReg_32RegClass.contains(*I))
2241 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2242 else
2243 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2244 }
2245 }
2246 }
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002247
Marek Olsak8a0f3352016-01-13 17:23:04 +00002248 // Update chain and glue.
2249 RetOps[0] = Chain;
2250 if (Flag.getNode())
2251 RetOps.push_back(Flag);
2252
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002253 unsigned Opc = AMDGPUISD::ENDPGM;
2254 if (!IsWaveEnd)
2255 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
Matt Arsenault9babdf42016-06-22 20:15:28 +00002256 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002257}
2258
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002259SDValue SITargetLowering::LowerCallResult(
2260 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2261 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2262 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2263 SDValue ThisVal) const {
2264 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2265
2266 // Assign locations to each value returned by this call.
2267 SmallVector<CCValAssign, 16> RVLocs;
2268 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2269 *DAG.getContext());
2270 CCInfo.AnalyzeCallResult(Ins, RetCC);
2271
2272 // Copy all of the result registers out of their specified physreg.
2273 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2274 CCValAssign VA = RVLocs[i];
2275 SDValue Val;
2276
2277 if (VA.isRegLoc()) {
2278 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2279 Chain = Val.getValue(1);
2280 InFlag = Val.getValue(2);
2281 } else if (VA.isMemLoc()) {
2282 report_fatal_error("TODO: return values in memory");
2283 } else
2284 llvm_unreachable("unknown argument location type");
2285
2286 switch (VA.getLocInfo()) {
2287 case CCValAssign::Full:
2288 break;
2289 case CCValAssign::BCvt:
2290 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2291 break;
2292 case CCValAssign::ZExt:
2293 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2294 DAG.getValueType(VA.getValVT()));
2295 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2296 break;
2297 case CCValAssign::SExt:
2298 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2299 DAG.getValueType(VA.getValVT()));
2300 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2301 break;
2302 case CCValAssign::AExt:
2303 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2304 break;
2305 default:
2306 llvm_unreachable("Unknown loc info!");
2307 }
2308
2309 InVals.push_back(Val);
2310 }
2311
2312 return Chain;
2313}
2314
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002315// Add code to pass the special inputs required by the features in use, separate
2316// from the explicit user arguments present in the IR.
2317void SITargetLowering::passSpecialInputs(
2318 CallLoweringInfo &CLI,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002319 CCState &CCInfo,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002320 const SIMachineFunctionInfo &Info,
2321 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2322 SmallVectorImpl<SDValue> &MemOpChains,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002323 SDValue Chain) const {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002324 // If we don't have a call site, this was a call inserted by
2325 // legalization. These can never use special inputs.
2326 if (!CLI.CS)
2327 return;
2328
2329 const Function *CalleeFunc = CLI.CS.getCalledFunction();
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002330 assert(CalleeFunc);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002331
2332 SelectionDAG &DAG = CLI.DAG;
2333 const SDLoc &DL = CLI.DL;
2334
Tom Stellardc5a154d2018-06-28 23:47:12 +00002335 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002336
2337 auto &ArgUsageInfo =
2338 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2339 const AMDGPUFunctionArgInfo &CalleeArgInfo
2340 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2341
2342 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2343
2344 // TODO: Unify with private memory register handling. This is complicated by
2345 // the fact that at least in kernels, the input argument is not necessarily
2346 // in the same location as the input.
2347 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2348 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2349 AMDGPUFunctionArgInfo::QUEUE_PTR,
2350 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2351 AMDGPUFunctionArgInfo::DISPATCH_ID,
2352 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2353 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2354 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2355 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2356 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
Matt Arsenault817c2532017-08-03 23:12:44 +00002357 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2358 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002359 };
2360
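  // For each special input the callee uses, forward the caller's value either
  // in the callee's expected input register or, if no register was assigned,
  // through a store to the reserved stack slot.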
2361 for (auto InputID : InputRegs) {
2362 const ArgDescriptor *OutgoingArg;
2363 const TargetRegisterClass *ArgRC;
2364
2365 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2366 if (!OutgoingArg)
2367 continue;
2368
2369 const ArgDescriptor *IncomingArg;
2370 const TargetRegisterClass *IncomingArgRC;
2371 std::tie(IncomingArg, IncomingArgRC)
2372 = CallerArgInfo.getPreloadedValue(InputID);
2373 assert(IncomingArgRC == ArgRC);
2374
2375 // All special arguments are ints for now.
2376 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
Matt Arsenault817c2532017-08-03 23:12:44 +00002377 SDValue InputReg;
2378
2379 if (IncomingArg) {
2380 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2381 } else {
2382 // The implicit arg ptr is special because it doesn't have a corresponding
2383 // input for kernels, and is computed from the kernarg segment pointer.
2384 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2385 InputReg = getImplicitArgPtr(DAG, DL);
2386 }
2387
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002388 if (OutgoingArg->isRegister()) {
2389 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2390 } else {
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002391 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2392 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2393 SpecialArgOffset);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002394 MemOpChains.push_back(ArgStore);
2395 }
2396 }
2397}
2398
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002399static bool canGuaranteeTCO(CallingConv::ID CC) {
2400 return CC == CallingConv::Fast;
2401}
2402
2403/// Return true if we might ever do TCO for calls with this calling convention.
2404static bool mayTailCallThisCC(CallingConv::ID CC) {
2405 switch (CC) {
2406 case CallingConv::C:
2407 return true;
2408 default:
2409 return canGuaranteeTCO(CC);
2410 }
2411}
2412
2413bool SITargetLowering::isEligibleForTailCallOptimization(
2414 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2415 const SmallVectorImpl<ISD::OutputArg> &Outs,
2416 const SmallVectorImpl<SDValue> &OutVals,
2417 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2418 if (!mayTailCallThisCC(CalleeCC))
2419 return false;
2420
2421 MachineFunction &MF = DAG.getMachineFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00002422 const Function &CallerF = MF.getFunction();
2423 CallingConv::ID CallerCC = CallerF.getCallingConv();
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002424 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2425 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2426
2427 // Kernels aren't callable and don't have a live-in return address, so it
2428 // doesn't make sense to do a tail call with entry functions.
2429 if (!CallerPreserved)
2430 return false;
2431
2432 bool CCMatch = CallerCC == CalleeCC;
2433
2434 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2435 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2436 return true;
2437 return false;
2438 }
2439
2440 // TODO: Can we handle var args?
2441 if (IsVarArg)
2442 return false;
2443
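  // Don't tail call if any of the caller's arguments are passed byval.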
Matthias Braunf1caa282017-12-15 22:22:58 +00002444 for (const Argument &Arg : CallerF.args()) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002445 if (Arg.hasByValAttr())
2446 return false;
2447 }
2448
2449 LLVMContext &Ctx = *DAG.getContext();
2450
2451 // Check that the call results are passed in the same way.
2452 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2453 CCAssignFnForCall(CalleeCC, IsVarArg),
2454 CCAssignFnForCall(CallerCC, IsVarArg)))
2455 return false;
2456
2457 // The callee has to preserve all registers the caller needs to preserve.
2458 if (!CCMatch) {
2459 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2460 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2461 return false;
2462 }
2463
2464 // Nothing more to check if the callee is taking no arguments.
2465 if (Outs.empty())
2466 return true;
2467
2468 SmallVector<CCValAssign, 16> ArgLocs;
2469 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2470
2471 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2472
2473 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2474 // If the stack arguments for this call do not fit into our own save area then
2475 // the call cannot be made tail.
2476 // TODO: Is this really necessary?
2477 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2478 return false;
2479
2480 const MachineRegisterInfo &MRI = MF.getRegInfo();
2481 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2482}
2483
2484bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2485 if (!CI->isTailCall())
2486 return false;
2487
2488 const Function *ParentFn = CI->getParent()->getParent();
2489 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2490 return false;
2491
2492 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2493 return (Attr.getValueAsString() != "true");
2494}
2495
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002496// The wave scratch offset register is used as the global base pointer.
2497SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2498 SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002499 SelectionDAG &DAG = CLI.DAG;
2500 const SDLoc &DL = CLI.DL;
2501 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2502 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2503 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2504 SDValue Chain = CLI.Chain;
2505 SDValue Callee = CLI.Callee;
2506 bool &IsTailCall = CLI.IsTailCall;
2507 CallingConv::ID CallConv = CLI.CallConv;
2508 bool IsVarArg = CLI.IsVarArg;
2509 bool IsSibCall = false;
2510 bool IsThisReturn = false;
2511 MachineFunction &MF = DAG.getMachineFunction();
2512
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002513 if (IsVarArg) {
2514 return lowerUnhandledCall(CLI, InVals,
2515 "unsupported call to variadic function ");
2516 }
2517
Matt Arsenault935f3b72018-08-08 16:58:39 +00002518 if (!CLI.CS.getInstruction())
2519 report_fatal_error("unsupported libcall legalization");
2520
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002521 if (!CLI.CS.getCalledFunction()) {
2522 return lowerUnhandledCall(CLI, InVals,
2523 "unsupported indirect call to function ");
2524 }
2525
2526 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2527 return lowerUnhandledCall(CLI, InVals,
2528 "unsupported required tail call to function ");
2529 }
2530
Matt Arsenault1fb90132018-06-28 10:18:36 +00002531 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2532 // Note the issue is with the CC of the calling function, not of the call
2533 // itself.
2534 return lowerUnhandledCall(CLI, InVals,
2535 "unsupported call from graphics shader of function ");
2536 }
2537
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002539 if (IsTailCall) {
2540 IsTailCall = isEligibleForTailCallOptimization(
2541 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2542 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2543 report_fatal_error("failed to perform tail call elimination on a call "
2544 "site marked musttail");
2545 }
2546
2547 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2548
2549 // A sibling call is one where we're under the usual C ABI and not planning
2550 // to change that but can still do a tail call:
2551 if (!TailCallOpt && IsTailCall)
2552 IsSibCall = true;
2553
2554 if (IsTailCall)
2555 ++NumTailCalls;
2556 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002557
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002558 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2559
2560 // Analyze operands of the call, assigning locations to each operand.
2561 SmallVector<CCValAssign, 16> ArgLocs;
2562 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2563 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002564
2565 // The first 4 bytes are reserved for the callee's emergency stack slot.
2566 CCInfo.AllocateStack(4, 4);
2567
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002568 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2569
2570 // Get a count of how many bytes are to be pushed on the stack.
2571 unsigned NumBytes = CCInfo.getNextStackOffset();
2572
2573 if (IsSibCall) {
2574 // Since we're not changing the ABI to make this a tail call, the memory
2575 // operands are already available in the caller's incoming argument space.
2576 NumBytes = 0;
2577 }
2578
2579 // FPDiff is the byte offset of the call's argument area from the callee's.
2580 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2581 // by this amount for a tail call. In a sibling call it must be 0 because the
2582 // caller will deallocate the entire stack and the callee still expects its
2583 // arguments to begin at SP+0. Completely unused for non-tail calls.
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002584 int32_t FPDiff = 0;
2585 MachineFrameInfo &MFI = MF.getFrameInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002586 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2587
Matt Arsenault6efd0822017-09-14 17:14:57 +00002588 SDValue CallerSavedFP;
2589
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002590 // Adjust the stack pointer for the new arguments...
2591 // These operations are automatically eliminated by the prolog/epilog pass
2592 if (!IsSibCall) {
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002593 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002594
2595 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2596
2597 // In the HSA case, this should be an identity copy.
2598 SDValue ScratchRSrcReg
2599 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2600 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2601
2602 // TODO: Don't hardcode these registers; get them from the callee function.
2603 SDValue ScratchWaveOffsetReg
2604 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2605 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
Matt Arsenault6efd0822017-09-14 17:14:57 +00002606
2607 if (!Info->isEntryFunction()) {
2608 // Avoid clobbering this function's FP value. In the current convention the
2609 // callee will overwrite it, so save and restore it around the call site.
2610 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2611 Info->getFrameOffsetReg(), MVT::i32);
2612 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002613 }
2614
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002615 SmallVector<SDValue, 8> MemOpChains;
2616 MVT PtrVT = MVT::i32;
2617
2618 // Walk the register/memloc assignments, inserting copies/loads.
2619 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2620 ++i, ++realArgIdx) {
2621 CCValAssign &VA = ArgLocs[i];
2622 SDValue Arg = OutVals[realArgIdx];
2623
2624 // Promote the value if needed.
2625 switch (VA.getLocInfo()) {
2626 case CCValAssign::Full:
2627 break;
2628 case CCValAssign::BCvt:
2629 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2630 break;
2631 case CCValAssign::ZExt:
2632 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2633 break;
2634 case CCValAssign::SExt:
2635 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2636 break;
2637 case CCValAssign::AExt:
2638 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2639 break;
2640 case CCValAssign::FPExt:
2641 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2642 break;
2643 default:
2644 llvm_unreachable("Unknown loc info!");
2645 }
2646
2647 if (VA.isRegLoc()) {
2648 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2649 } else {
2650 assert(VA.isMemLoc());
2651
2652 SDValue DstAddr;
2653 MachinePointerInfo DstInfo;
2654
2655 unsigned LocMemOffset = VA.getLocMemOffset();
2656 int32_t Offset = LocMemOffset;
Matt Arsenaultb655fa92017-11-29 01:25:12 +00002657
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002658 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002659 unsigned Align = 0;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002660
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002661 if (IsTailCall) {
2662 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2663 unsigned OpSize = Flags.isByVal() ?
2664 Flags.getByValSize() : VA.getValVT().getStoreSize();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002665
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002666 // FIXME: We can have better than the minimum required byval alignment.
2667 Align = Flags.isByVal() ? Flags.getByValAlign() :
2668 MinAlign(Subtarget->getStackAlignment(), Offset);
2669
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002670 Offset = Offset + FPDiff;
2671 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2672
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002673 DstAddr = DAG.getFrameIndex(FI, PtrVT);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002674 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2675
2676 // Make sure any stack arguments overlapping with where we're storing
2677 // are loaded before this eventual operation. Otherwise they'll be
2678 // clobbered.
2679
2680 // FIXME: Why is this really necessary? This seems to just result in a
2681 // lot of code to copy the stack and write them back to the same
2682 // locations, which are supposed to be immutable?
2683 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2684 } else {
2685 DstAddr = PtrOff;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002686 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002687 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002688 }
2689
2690 if (Outs[i].Flags.isByVal()) {
2691 SDValue SizeNode =
2692 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2693 SDValue Cpy = DAG.getMemcpy(
2694 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2695 /*isVol = */ false, /*AlwaysInline = */ true,
Yaxun Liuc5962262017-11-22 16:13:35 +00002696 /*isTailCall = */ false, DstInfo,
2697 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
Matt Arsenault0da63502018-08-31 05:49:54 +00002698 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002699
2700 MemOpChains.push_back(Cpy);
2701 } else {
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002702 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002703 MemOpChains.push_back(Store);
2704 }
2705 }
2706 }
2707
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002708 // Copy special input registers after user input arguments.
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002709 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002710
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002711 if (!MemOpChains.empty())
2712 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2713
2714 // Build a sequence of copy-to-reg nodes chained together with token chain
2715 // and flag operands which copy the outgoing args into the appropriate regs.
2716 SDValue InFlag;
2717 for (auto &RegToPass : RegsToPass) {
2718 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2719 RegToPass.second, InFlag);
2720 InFlag = Chain.getValue(1);
2721 }
2722
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002723
2724 SDValue PhysReturnAddrReg;
2725 if (IsTailCall) {
2726 // Since the return is being combined with the call, we need to pass on the
2727 // return address.
2728
2729 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2730 SDValue ReturnAddrReg = CreateLiveInRegister(
2731 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2732
2733 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2734 MVT::i64);
2735 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2736 InFlag = Chain.getValue(1);
2737 }
2738
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002739 // We don't usually want to end the call-sequence here because we would tidy
2740 // the frame up *after* the call. However, in the ABI-changing tail-call case
2741 // we've carefully laid out the parameters so that when sp is reset they'll be
2742 // in the correct location.
2743 if (IsTailCall && !IsSibCall) {
2744 Chain = DAG.getCALLSEQ_END(Chain,
2745 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2746 DAG.getTargetConstant(0, DL, MVT::i32),
2747 InFlag, DL);
2748 InFlag = Chain.getValue(1);
2749 }
2750
2751 std::vector<SDValue> Ops;
2752 Ops.push_back(Chain);
2753 Ops.push_back(Callee);
Scott Linderd19d1972019-02-04 20:00:07 +00002754 // Add a redundant copy of the callee global which will not be legalized, as
2755 // we need direct access to the callee later.
2756 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2757 const GlobalValue *GV = GSD->getGlobal();
2758 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002759
2760 if (IsTailCall) {
2761 // Each tail call may have to adjust the stack by a different amount, so
2762 // this information must travel along with the operation for eventual
2763 // consumption by emitEpilogue.
2764 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002765
2766 Ops.push_back(PhysReturnAddrReg);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002767 }
2768
2769 // Add argument registers to the end of the list so that they are known live
2770 // into the call.
2771 for (auto &RegToPass : RegsToPass) {
2772 Ops.push_back(DAG.getRegister(RegToPass.first,
2773 RegToPass.second.getValueType()));
2774 }
2775
2776 // Add a register mask operand representing the call-preserved registers.
2777
Tom Stellardc5a154d2018-06-28 23:47:12 +00002778 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002779 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2780 assert(Mask && "Missing call preserved mask for calling convention");
2781 Ops.push_back(DAG.getRegisterMask(Mask));
2782
2783 if (InFlag.getNode())
2784 Ops.push_back(InFlag);
2785
2786 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2787
2788 // If we're doing a tail call, use a TC_RETURN here rather than an
2789 // actual call instruction.
2790 if (IsTailCall) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002791 MFI.setHasTailCall();
2792 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002793 }
2794
2795 // Returns a chain and a flag for retval copy to use.
2796 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2797 Chain = Call.getValue(0);
2798 InFlag = Call.getValue(1);
2799
Matt Arsenault6efd0822017-09-14 17:14:57 +00002800 if (CallerSavedFP) {
2801 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2802 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2803 InFlag = Chain.getValue(1);
2804 }
2805
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002806 uint64_t CalleePopBytes = NumBytes;
2807 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002808 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2809 InFlag, DL);
2810 if (!Ins.empty())
2811 InFlag = Chain.getValue(1);
2812
2813 // Handle result values, copying them out of physregs into vregs that we
2814 // return.
2815 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2816 InVals, IsThisReturn,
2817 IsThisReturn ? OutVals[0] : SDValue());
2818}
2819
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002820unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2821 SelectionDAG &DAG) const {
2822 unsigned Reg = StringSwitch<unsigned>(RegName)
2823 .Case("m0", AMDGPU::M0)
2824 .Case("exec", AMDGPU::EXEC)
2825 .Case("exec_lo", AMDGPU::EXEC_LO)
2826 .Case("exec_hi", AMDGPU::EXEC_HI)
2827 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2828 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2829 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2830 .Default(AMDGPU::NoRegister);
2831
2832 if (Reg == AMDGPU::NoRegister) {
2833 report_fatal_error(Twine("invalid register name \""
2834 + StringRef(RegName) + "\"."));
2835
2836 }
2837
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00002838 if ((Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||
2839 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) &&
2840 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002841 report_fatal_error(Twine("invalid register \""
2842 + StringRef(RegName) + "\" for subtarget."));
2843 }
2844
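  // Check that the width of the requested type matches the named register:
  // 32-bit names require a 32-bit type, exec/flat_scratch require 64 bits.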
2845 switch (Reg) {
2846 case AMDGPU::M0:
2847 case AMDGPU::EXEC_LO:
2848 case AMDGPU::EXEC_HI:
2849 case AMDGPU::FLAT_SCR_LO:
2850 case AMDGPU::FLAT_SCR_HI:
2851 if (VT.getSizeInBits() == 32)
2852 return Reg;
2853 break;
2854 case AMDGPU::EXEC:
2855 case AMDGPU::FLAT_SCR:
2856 if (VT.getSizeInBits() == 64)
2857 return Reg;
2858 break;
2859 default:
2860 llvm_unreachable("missing register type checking");
2861 }
2862
2863 report_fatal_error(Twine("invalid type for register \""
2864 + StringRef(RegName) + "\"."));
2865}
2866
Matt Arsenault786724a2016-07-12 21:41:32 +00002867// If kill is not the last instruction, split the block so kill is always a
2868// proper terminator.
2869MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2870 MachineBasicBlock *BB) const {
2871 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2872
2873 MachineBasicBlock::iterator SplitPoint(&MI);
2874 ++SplitPoint;
2875
2876 if (SplitPoint == BB->end()) {
2877 // Don't bother with a new block.
Marek Olsakce76ea02017-10-24 10:27:13 +00002878 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002879 return BB;
2880 }
2881
2882 MachineFunction *MF = BB->getParent();
2883 MachineBasicBlock *SplitBB
2884 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2885
Matt Arsenault786724a2016-07-12 21:41:32 +00002886 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2887 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2888
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002889 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
Matt Arsenault786724a2016-07-12 21:41:32 +00002890 BB->addSuccessor(SplitBB);
2891
Marek Olsakce76ea02017-10-24 10:27:13 +00002892 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002893 return SplitBB;
2894}
2895
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002896// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2897// wavefront. If the value is uniform and just happens to be in a VGPR, this
2898// will only do one iteration. In the worst case, this will loop 64 times.
2899//
2900// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002901static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2902 const SIInstrInfo *TII,
2903 MachineRegisterInfo &MRI,
2904 MachineBasicBlock &OrigBB,
2905 MachineBasicBlock &LoopBB,
2906 const DebugLoc &DL,
2907 const MachineOperand &IdxReg,
2908 unsigned InitReg,
2909 unsigned ResultReg,
2910 unsigned PhiReg,
2911 unsigned InitSaveExecReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002912 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002913 bool UseGPRIdxMode,
2914 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002915 MachineBasicBlock::iterator I = LoopBB.begin();
2916
2917 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2918 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2919 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2920 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2921
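  // Loop-carried PHIs for the in-progress result and the exec mask captured by
  // the previous iteration.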
2922 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2923 .addReg(InitReg)
2924 .addMBB(&OrigBB)
2925 .addReg(ResultReg)
2926 .addMBB(&LoopBB);
2927
2928 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2929 .addReg(InitSaveExecReg)
2930 .addMBB(&OrigBB)
2931 .addReg(NewExec)
2932 .addMBB(&LoopBB);
2933
2934 // Read the next variant <- also loop target.
2935 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2936 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2937
2938 // Compare the just read M0 value to all possible Idx values.
2939 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2940 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00002941 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002942
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002943 // Update EXEC, saving the original EXEC value in NewExec.
2944 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2945 .addReg(CondReg, RegState::Kill);
2946
2947 MRI.setSimpleHint(NewExec, CondReg);
2948
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002949 if (UseGPRIdxMode) {
2950 unsigned IdxReg;
2951 if (Offset == 0) {
2952 IdxReg = CurrentIdxReg;
2953 } else {
2954 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2955 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2956 .addReg(CurrentIdxReg, RegState::Kill)
2957 .addImm(Offset);
2958 }
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002959 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00002960 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002961 MachineInstr *SetOn =
2962 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2963 .addReg(IdxReg, RegState::Kill)
2964 .addImm(IdxMode);
2965 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002966 } else {
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002967 // Move index from VCC into M0
2968 if (Offset == 0) {
2969 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2970 .addReg(CurrentIdxReg, RegState::Kill);
2971 } else {
2972 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2973 .addReg(CurrentIdxReg, RegState::Kill)
2974 .addImm(Offset);
2975 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002976 }
2977
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002978 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002979 MachineInstr *InsertPt =
Scott Lindere2c58472019-02-05 19:50:32 +00002980 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002981 .addReg(AMDGPU::EXEC)
2982 .addReg(NewExec);
2983
2984 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2985 // s_cbranch_scc0?
2986
2987 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2988 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2989 .addMBB(&LoopBB);
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002990
2991 return InsertPt->getIterator();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002992}
2993
2994// This has slightly sub-optimal regalloc when the source vector is killed by
2995// the read. The register allocator does not understand that the kill is
2996// per-workitem, so the vector is kept alive for the whole loop and we end up
2997// not re-using a subregister from it, using one more VGPR than necessary. The
2998// extra register was avoided when this was expanded after register allocation.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002999static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
3000 MachineBasicBlock &MBB,
3001 MachineInstr &MI,
3002 unsigned InitResultReg,
3003 unsigned PhiReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003004 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003005 bool UseGPRIdxMode,
3006 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003007 MachineFunction *MF = MBB.getParent();
3008 MachineRegisterInfo &MRI = MF->getRegInfo();
3009 const DebugLoc &DL = MI.getDebugLoc();
3010 MachineBasicBlock::iterator I(&MI);
3011
3012 unsigned DstReg = MI.getOperand(0).getReg();
Matt Arsenault301162c2017-11-15 21:51:43 +00003013 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3014 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003015
3016 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3017
3018 // Save the EXEC mask
3019 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
3020 .addReg(AMDGPU::EXEC);
3021
3022 // To insert the loop we need to split the block. Move everything after this
3023 // point to a new block, and insert a new empty block between the two.
3024 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3025 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3026 MachineFunction::iterator MBBI(MBB);
3027 ++MBBI;
3028
3029 MF->insert(MBBI, LoopBB);
3030 MF->insert(MBBI, RemainderBB);
3031
3032 LoopBB->addSuccessor(LoopBB);
3033 LoopBB->addSuccessor(RemainderBB);
3034
3035 // Move the rest of the block into a new block.
Matt Arsenaultd40ded62016-07-22 17:01:15 +00003036 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003037 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3038
3039 MBB.addSuccessor(LoopBB);
3040
3041 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3042
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003043 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3044 InitResultReg, DstReg, PhiReg, TmpExec,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003045 Offset, UseGPRIdxMode, IsIndirectSrc);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003046
3047 MachineBasicBlock::iterator First = RemainderBB->begin();
3048 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3049 .addReg(SaveExec);
3050
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003051 return InsPt;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003052}
3053
3054// Returns subreg index, offset
3055static std::pair<unsigned, int>
3056computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3057 const TargetRegisterClass *SuperRC,
3058 unsigned VecReg,
3059 int Offset) {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003060 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003061
3062 // Skip out of bounds offsets, or else we would end up using an undefined
3063 // register.
3064 if (Offset >= NumElts || Offset < 0)
3065 return std::make_pair(AMDGPU::sub0, Offset);
3066
3067 return std::make_pair(AMDGPU::sub0 + Offset, 0);
3068}
3069
3070// Return true if the index is an SGPR and was set.
3071static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3072 MachineRegisterInfo &MRI,
3073 MachineInstr &MI,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003074 int Offset,
3075 bool UseGPRIdxMode,
3076 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003077 MachineBasicBlock *MBB = MI.getParent();
3078 const DebugLoc &DL = MI.getDebugLoc();
3079 MachineBasicBlock::iterator I(&MI);
3080
3081 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3082 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3083
3084 assert(Idx->getReg() != AMDGPU::NoRegister);
3085
3086 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3087 return false;
3088
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003089 if (UseGPRIdxMode) {
3090 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003091 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003092 if (Offset == 0) {
3093 MachineInstr *SetOn =
Diana Picus116bbab2017-01-13 09:58:52 +00003094 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3095 .add(*Idx)
3096 .addImm(IdxMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003097
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003098 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003099 } else {
3100 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3101 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
Diana Picus116bbab2017-01-13 09:58:52 +00003102 .add(*Idx)
3103 .addImm(Offset);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003104 MachineInstr *SetOn =
3105 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3106 .addReg(Tmp, RegState::Kill)
3107 .addImm(IdxMode);
3108
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003109 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003110 }
3111
3112 return true;
3113 }
3114
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003115 if (Offset == 0) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003116 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3117 .add(*Idx);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003118 } else {
3119 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003120 .add(*Idx)
3121 .addImm(Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003122 }
3123
3124 return true;
3125}
3126
3127// Control flow needs to be inserted if indexing with a VGPR.
3128static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3129 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003130 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003131 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003132 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3133 MachineFunction *MF = MBB.getParent();
3134 MachineRegisterInfo &MRI = MF->getRegInfo();
3135
3136 unsigned Dst = MI.getOperand(0).getReg();
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003137 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003138 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3139
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003140 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003141
3142 unsigned SubReg;
3143 std::tie(SubReg, Offset)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003144 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003145
Marek Olsake22fdb92017-03-21 17:00:32 +00003146 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003147
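  // A uniform index held in an SGPR only needs a single indexed move; a
  // divergent (VGPR) index requires the waterfall loop built below.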
3148 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003149 MachineBasicBlock::iterator I(&MI);
3150 const DebugLoc &DL = MI.getDebugLoc();
3151
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003152 if (UseGPRIdxMode) {
3153 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3154 // to avoid interfering with other uses, so probably requires a new
3155 // optimization pass.
3156 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003157 .addReg(SrcReg, RegState::Undef, SubReg)
3158 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003159 .addReg(AMDGPU::M0, RegState::Implicit);
3160 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3161 } else {
3162 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003163 .addReg(SrcReg, RegState::Undef, SubReg)
3164 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003165 }
3166
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003167 MI.eraseFromParent();
3168
3169 return &MBB;
3170 }
3171
3172 const DebugLoc &DL = MI.getDebugLoc();
3173 MachineBasicBlock::iterator I(&MI);
3174
3175 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3176 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3177
3178 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3179
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003180 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3181 Offset, UseGPRIdxMode, true);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003182 MachineBasicBlock *LoopBB = InsPt->getParent();
3183
3184 if (UseGPRIdxMode) {
3185 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003186 .addReg(SrcReg, RegState::Undef, SubReg)
3187 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003188 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003189 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003190 } else {
3191 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003192 .addReg(SrcReg, RegState::Undef, SubReg)
3193 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003194 }
3195
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003196 MI.eraseFromParent();
3197
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003198 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003199}
3200
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003201static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3202 const TargetRegisterClass *VecRC) {
3203 switch (TRI.getRegSizeInBits(*VecRC)) {
3204 case 32: // 4 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003205 return AMDGPU::V_MOVRELD_B32_V1;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003206 case 64: // 8 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003207 return AMDGPU::V_MOVRELD_B32_V2;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003208 case 128: // 16 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003209 return AMDGPU::V_MOVRELD_B32_V4;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003210 case 256: // 32 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003211 return AMDGPU::V_MOVRELD_B32_V8;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003212 case 512: // 64 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003213 return AMDGPU::V_MOVRELD_B32_V16;
3214 default:
3215 llvm_unreachable("unsupported size for MOVRELD pseudos");
3216 }
3217}
3218
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003219static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3220 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003221 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003222 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003223 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3224 MachineFunction *MF = MBB.getParent();
3225 MachineRegisterInfo &MRI = MF->getRegInfo();
3226
3227 unsigned Dst = MI.getOperand(0).getReg();
3228 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3229 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3230 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3231 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3232 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3233
3234 // This can be an immediate, but will be folded later.
3235 assert(Val->getReg());
3236
3237 unsigned SubReg;
3238 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3239 SrcVec->getReg(),
3240 Offset);
Marek Olsake22fdb92017-03-21 17:00:32 +00003241 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003242
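  // With no index register this is a plain INSERT_SUBREG at a constant lane;
  // an SGPR index needs only a single indexed write, while a VGPR index
  // requires the waterfall loop built below.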
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003243 if (Idx->getReg() == AMDGPU::NoRegister) {
3244 MachineBasicBlock::iterator I(&MI);
3245 const DebugLoc &DL = MI.getDebugLoc();
3246
3247 assert(Offset == 0);
3248
3249 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
Diana Picus116bbab2017-01-13 09:58:52 +00003250 .add(*SrcVec)
3251 .add(*Val)
3252 .addImm(SubReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003253
3254 MI.eraseFromParent();
3255 return &MBB;
3256 }
3257
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003258 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003259 MachineBasicBlock::iterator I(&MI);
3260 const DebugLoc &DL = MI.getDebugLoc();
3261
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003262 if (UseGPRIdxMode) {
3263 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003264 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3265 .add(*Val)
3266 .addReg(Dst, RegState::ImplicitDefine)
3267 .addReg(SrcVec->getReg(), RegState::Implicit)
3268 .addReg(AMDGPU::M0, RegState::Implicit);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003269
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003270 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3271 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003272 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003273
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003274 BuildMI(MBB, I, DL, MovRelDesc)
3275 .addReg(Dst, RegState::Define)
3276 .addReg(SrcVec->getReg())
Diana Picus116bbab2017-01-13 09:58:52 +00003277 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003278 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003279 }
3280
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003281 MI.eraseFromParent();
3282 return &MBB;
3283 }
3284
3285 if (Val->isReg())
3286 MRI.clearKillFlags(Val->getReg());
3287
3288 const DebugLoc &DL = MI.getDebugLoc();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003289
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003290 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3291
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003292 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003293 Offset, UseGPRIdxMode, false);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003294 MachineBasicBlock *LoopBB = InsPt->getParent();
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003295
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003296 if (UseGPRIdxMode) {
3297 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003298 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3299 .add(*Val) // src0
3300 .addReg(Dst, RegState::ImplicitDefine)
3301 .addReg(PhiReg, RegState::Implicit)
3302 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003303 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003304 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003305 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003306
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003307 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3308 .addReg(Dst, RegState::Define)
3309 .addReg(PhiReg)
Diana Picus116bbab2017-01-13 09:58:52 +00003310 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003311 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003312 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003313
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003314 MI.eraseFromParent();
3315
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003316 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003317}
3318
Matt Arsenault786724a2016-07-12 21:41:32 +00003319MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3320 MachineInstr &MI, MachineBasicBlock *BB) const {
Tom Stellard244891d2016-12-20 15:52:17 +00003321
3322 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3323 MachineFunction *MF = BB->getParent();
3324 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3325
3326 if (TII->isMIMG(MI)) {
Matt Arsenault905f3512017-12-29 17:18:14 +00003327 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3328 report_fatal_error("missing mem operand from MIMG instruction");
3329 }
Tom Stellard244891d2016-12-20 15:52:17 +00003330 // Add a memoperand for mimg instructions so that they aren't assumed to
3331 // be ordered memory instructions.
3332
Tom Stellard244891d2016-12-20 15:52:17 +00003333 return BB;
3334 }
3335
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003336 switch (MI.getOpcode()) {
Matt Arsenault301162c2017-11-15 21:51:43 +00003337 case AMDGPU::S_ADD_U64_PSEUDO:
3338 case AMDGPU::S_SUB_U64_PSEUDO: {
3339 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3340 const DebugLoc &DL = MI.getDebugLoc();
3341
3342 MachineOperand &Dest = MI.getOperand(0);
3343 MachineOperand &Src0 = MI.getOperand(1);
3344 MachineOperand &Src1 = MI.getOperand(2);
3345
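    // Split the 64-bit scalar add/sub into 32-bit halves: the low half uses
    // S_ADD_U32/S_SUB_U32, the high half consumes the carry with
    // S_ADDC_U32/S_SUBB_U32, and the halves are recombined with REG_SEQUENCE.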
3346 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3347 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3348
3349 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3350 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3351 &AMDGPU::SReg_32_XM0RegClass);
3352 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3353 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3354 &AMDGPU::SReg_32_XM0RegClass);
3355
3356 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3357 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3358 &AMDGPU::SReg_32_XM0RegClass);
3359 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3360 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3361 &AMDGPU::SReg_32_XM0RegClass);
3362
3363 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3364
3365 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3366 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3367 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3368 .add(Src0Sub0)
3369 .add(Src1Sub0);
3370 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3371 .add(Src0Sub1)
3372 .add(Src1Sub1);
3373 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3374 .addReg(DestSub0)
3375 .addImm(AMDGPU::sub0)
3376 .addReg(DestSub1)
3377 .addImm(AMDGPU::sub1);
3378 MI.eraseFromParent();
3379 return BB;
3380 }
3381 case AMDGPU::SI_INIT_M0: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003382 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00003383 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Diana Picus116bbab2017-01-13 09:58:52 +00003384 .add(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003385 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00003386 return BB;
Matt Arsenault301162c2017-11-15 21:51:43 +00003387 }
Marek Olsak2d825902017-04-28 20:21:58 +00003388 case AMDGPU::SI_INIT_EXEC:
3389 // This should be before all vector instructions.
3390 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3391 AMDGPU::EXEC)
3392 .addImm(MI.getOperand(0).getImm());
3393 MI.eraseFromParent();
3394 return BB;
3395
3396 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3397 // Extract the thread count from an SGPR input and set EXEC accordingly.
3398 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3399 //
3400 // S_BFE_U32 count, input, {shift, 7}
3401 // S_BFM_B64 exec, count, 0
3402 // S_CMP_EQ_U32 count, 64
3403 // S_CMOV_B64 exec, -1
3404 MachineInstr *FirstMI = &*BB->begin();
3405 MachineRegisterInfo &MRI = MF->getRegInfo();
3406 unsigned InputReg = MI.getOperand(0).getReg();
3407 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3408 bool Found = false;
3409
3410 // Move the COPY of the input reg to the beginning, so that we can use it.
3411 for (auto I = BB->begin(); I != &MI; I++) {
3412 if (I->getOpcode() != TargetOpcode::COPY ||
3413 I->getOperand(0).getReg() != InputReg)
3414 continue;
3415
3416 if (I == FirstMI) {
3417 FirstMI = &*++BB->begin();
3418 } else {
3419 I->removeFromParent();
3420 BB->insert(FirstMI, &*I);
3421 }
3422 Found = true;
3423 break;
3424 }
3425 assert(Found);
Davide Italiano0dcc0152017-05-11 19:58:52 +00003426 (void)Found;
Marek Olsak2d825902017-04-28 20:21:58 +00003427
3428 // This should be before all vector instructions.
3429 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3430 .addReg(InputReg)
3431 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3432 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3433 AMDGPU::EXEC)
3434 .addReg(CountReg)
3435 .addImm(0);
3436 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3437 .addReg(CountReg, RegState::Kill)
3438 .addImm(64);
3439 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3440 AMDGPU::EXEC)
3441 .addImm(-1);
3442 MI.eraseFromParent();
3443 return BB;
3444 }
3445
Changpeng Fang01f60622016-03-15 17:28:44 +00003446 case AMDGPU::GET_GROUPSTATICSIZE: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003447 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00003448 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
Diana Picus116bbab2017-01-13 09:58:52 +00003449 .add(MI.getOperand(0))
3450 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003451 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00003452 return BB;
3453 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003454 case AMDGPU::SI_INDIRECT_SRC_V1:
3455 case AMDGPU::SI_INDIRECT_SRC_V2:
3456 case AMDGPU::SI_INDIRECT_SRC_V4:
3457 case AMDGPU::SI_INDIRECT_SRC_V8:
3458 case AMDGPU::SI_INDIRECT_SRC_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003459 return emitIndirectSrc(MI, *BB, *getSubtarget());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003460 case AMDGPU::SI_INDIRECT_DST_V1:
3461 case AMDGPU::SI_INDIRECT_DST_V2:
3462 case AMDGPU::SI_INDIRECT_DST_V4:
3463 case AMDGPU::SI_INDIRECT_DST_V8:
3464 case AMDGPU::SI_INDIRECT_DST_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003465 return emitIndirectDst(MI, *BB, *getSubtarget());
Marek Olsakce76ea02017-10-24 10:27:13 +00003466 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3467 case AMDGPU::SI_KILL_I1_PSEUDO:
Matt Arsenault786724a2016-07-12 21:41:32 +00003468 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00003469 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
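    // Expand the 64-bit conditional move into two 32-bit V_CNDMASK_B32_e64 ops
    // on the sub0/sub1 halves and recombine the results with a REG_SEQUENCE.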
3470 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Matt Arsenault22e41792016-08-27 01:00:37 +00003471
3472 unsigned Dst = MI.getOperand(0).getReg();
3473 unsigned Src0 = MI.getOperand(1).getReg();
3474 unsigned Src1 = MI.getOperand(2).getReg();
3475 const DebugLoc &DL = MI.getDebugLoc();
3476 unsigned SrcCond = MI.getOperand(3).getReg();
3477
3478 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3479 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003480 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenault22e41792016-08-27 01:00:37 +00003481
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003482 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3483 .addReg(SrcCond);
Matt Arsenault22e41792016-08-27 01:00:37 +00003484 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003485 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003486 .addReg(Src0, 0, AMDGPU::sub0)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003487 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003488 .addReg(Src1, 0, AMDGPU::sub0)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003489 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003490 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003491 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003492 .addReg(Src0, 0, AMDGPU::sub1)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003493 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003494 .addReg(Src1, 0, AMDGPU::sub1)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003495 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003496
3497 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3498 .addReg(DstLo)
3499 .addImm(AMDGPU::sub0)
3500 .addReg(DstHi)
3501 .addImm(AMDGPU::sub1);
3502 MI.eraseFromParent();
3503 return BB;
3504 }
Matt Arsenault327188a2016-12-15 21:57:11 +00003505 case AMDGPU::SI_BR_UNDEF: {
3506 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3507 const DebugLoc &DL = MI.getDebugLoc();
3508 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
Diana Picus116bbab2017-01-13 09:58:52 +00003509 .add(MI.getOperand(0));
Matt Arsenault327188a2016-12-15 21:57:11 +00003510 Br->getOperand(1).setIsUndef(true); // read undef SCC
3511 MI.eraseFromParent();
3512 return BB;
3513 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003514 case AMDGPU::ADJCALLSTACKUP:
3515 case AMDGPU::ADJCALLSTACKDOWN: {
3516 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3517 MachineInstrBuilder MIB(*MF, &MI);
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003518
3519 // Add an implicit use of the frame offset reg to prevent the restore copy
3520 // inserted after the call from being reordered after stack operations in
3521 // the caller's frame.
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003522 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003523 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3524 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003525 return BB;
3526 }
Scott Linderd19d1972019-02-04 20:00:07 +00003527 case AMDGPU::SI_CALL_ISEL: {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003528 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3529 const DebugLoc &DL = MI.getDebugLoc();
Scott Linderd19d1972019-02-04 20:00:07 +00003530
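    // Rewrite the pseudo call into SI_CALL, which explicitly defines the return
    // address register and keeps the original operands and memory operands.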
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003531 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003532
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003533 MachineInstrBuilder MIB;
Scott Linderd19d1972019-02-04 20:00:07 +00003534 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003535
Scott Linderd19d1972019-02-04 20:00:07 +00003536 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003537 MIB.add(MI.getOperand(I));
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003538
Chandler Carruthc73c0302018-08-16 21:30:05 +00003539 MIB.cloneMemRefs(MI);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003540 MI.eraseFromParent();
3541 return BB;
3542 }
Stanislav Mekhanoshin64399da2019-05-02 04:26:35 +00003543 case AMDGPU::V_ADD_I32_e32:
3544 case AMDGPU::V_SUB_I32_e32:
3545 case AMDGPU::V_SUBREV_I32_e32: {
3546 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3547 const DebugLoc &DL = MI.getDebugLoc();
3548 unsigned Opc = MI.getOpcode();
3549
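    // If the e32 form has no valid encoding on this subtarget, switch to the
    // VOP3 (e64) form, which needs an explicit VCC def and a clamp operand.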
3550 bool NeedClampOperand = false;
3551 if (TII->pseudoToMCOpcode(Opc) == -1) {
3552 Opc = AMDGPU::getVOPe64(Opc);
3553 NeedClampOperand = true;
3554 }
3555
3556 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
3557 if (TII->isVOP3(*I)) {
3558 I.addReg(AMDGPU::VCC, RegState::Define);
3559 }
3560 I.add(MI.getOperand(1))
3561 .add(MI.getOperand(2));
3562 if (NeedClampOperand)
3563 I.addImm(0); // clamp bit for e64 encoding
3564
3565 TII->legalizeOperands(*I);
3566
3567 MI.eraseFromParent();
3568 return BB;
3569 }
Changpeng Fang01f60622016-03-15 17:28:44 +00003570 default:
3571 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00003572 }
Tom Stellard75aadc22012-12-11 21:25:42 +00003573}
3574
Matt Arsenaulte11d8ac2017-10-13 21:10:22 +00003575bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3576 return isTypeLegal(VT.getScalarType());
3577}
3578
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003579bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3580 // This currently forces unfolding various combinations of fsub into fma with
3581 // free fneg'd operands. As long as we have fast FMA (controlled by
3582 // isFMAFasterThanFMulAndFAdd), we should perform these.
3583
3584 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3585 // most of these combines appear to be cycle neutral but save on instruction
3586 // count / code size.
3587 return true;
3588}
3589
Mehdi Amini44ede332015-07-09 02:09:04 +00003590EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3591 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00003592 if (!VT.isVector()) {
3593 return MVT::i1;
3594 }
Matt Arsenault8596f712014-11-28 22:51:38 +00003595 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00003596}
3597
Matt Arsenault94163282016-12-22 16:36:25 +00003598MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3599 // TODO: Should i16 be used always if legal? For now it would force VALU
3600 // shifts.
3601 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
Christian Konig082a14a2013-03-18 11:34:05 +00003602}
3603
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003604// Answering this is somewhat tricky and depends on the specific device, as
3605// different devices have different rates for fma or for all f64 operations.
3606//
3607// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3608// regardless of which device (although the number of cycles differs between
3609// devices), so it is always profitable for f64.
3610//
3611// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3612// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3613// which we can always do even without fused FP ops since it returns the same
3614// result as the separate operations and since it is always full
3615// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3616// however does not support denormals, so we do report fma as faster if we have
3617// a fast fma device and require denormals.
3618//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003619bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3620 VT = VT.getScalarType();
3621
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003622 switch (VT.getSimpleVT().SimpleTy) {
Matt Arsenault0084adc2018-04-30 19:08:16 +00003623 case MVT::f32: {
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003624 // This is as fast on some subtargets. However, we always have full rate f32
3625 // mad available, which returns the same result as the separate operations
Matt Arsenault8d630032015-02-20 22:10:41 +00003626 // and which we should prefer over fma. We can't use mad if we want to support
3627 // denormals, so only report fma as faster in those cases.
Matt Arsenault0084adc2018-04-30 19:08:16 +00003628 if (Subtarget->hasFP32Denormals())
3629 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3630
3631 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3632 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3633 }
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003634 case MVT::f64:
3635 return true;
Matt Arsenault9e22bc22016-12-22 03:21:48 +00003636 case MVT::f16:
3637 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003638 default:
3639 break;
3640 }
3641
3642 return false;
3643}
3644
Tom Stellard75aadc22012-12-11 21:25:42 +00003645//===----------------------------------------------------------------------===//
3646// Custom DAG Lowering Operations
3647//===----------------------------------------------------------------------===//
3648
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003649// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3650// wider vector type is legal.
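// For example, a v4f16 operation is split into two v2f16 halves which are then
// concatenated, instead of being scalarized into four f16 operations.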
3651SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3652 SelectionDAG &DAG) const {
3653 unsigned Opc = Op.getOpcode();
3654 EVT VT = Op.getValueType();
3655 assert(VT == MVT::v4f16);
3656
3657 SDValue Lo, Hi;
3658 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3659
3660 SDLoc SL(Op);
3661 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3662 Op->getFlags());
3663 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3664 Op->getFlags());
3665
3666 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3667}
3668
3669// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3670// wider vector type is legal.
3671SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3672 SelectionDAG &DAG) const {
3673 unsigned Opc = Op.getOpcode();
3674 EVT VT = Op.getValueType();
3675 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3676
3677 SDValue Lo0, Hi0;
3678 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3679 SDValue Lo1, Hi1;
3680 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3681
3682 SDLoc SL(Op);
3683
3684 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3685 Op->getFlags());
3686 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3687 Op->getFlags());
3688
3689 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3690}
3691
Tom Stellard75aadc22012-12-11 21:25:42 +00003692SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3693 switch (Op.getOpcode()) {
3694 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00003695 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00003696 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00003697 SDValue Result = LowerLOAD(Op, DAG);
3698 assert((!Result.getNode() ||
3699 Result.getNode()->getNumValues() == 2) &&
3700 "Load should return a value and a chain");
3701 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00003702 }
Tom Stellardaf775432013-10-23 00:44:32 +00003703
Matt Arsenaultad14ce82014-07-19 18:44:39 +00003704 case ISD::FSIN:
3705 case ISD::FCOS:
3706 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00003707 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00003708 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00003709 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00003710 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003711 case ISD::GlobalAddress: {
3712 MachineFunction &MF = DAG.getMachineFunction();
3713 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3714 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00003715 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003716 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003717 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003718 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00003719 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault3aef8092017-01-23 23:09:58 +00003720 case ISD::INSERT_VECTOR_ELT:
3721 return lowerINSERT_VECTOR_ELT(Op, DAG);
3722 case ISD::EXTRACT_VECTOR_ELT:
3723 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
Matt Arsenault67a98152018-05-16 11:47:30 +00003724 case ISD::BUILD_VECTOR:
3725 return lowerBUILD_VECTOR(Op, DAG);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00003726 case ISD::FP_ROUND:
3727 return lowerFP_ROUND(Op, DAG);
Matt Arsenault3e025382017-04-24 17:49:13 +00003728 case ISD::TRAP:
Matt Arsenault3e025382017-04-24 17:49:13 +00003729 return lowerTRAP(Op, DAG);
Tony Tye43259df2018-05-16 16:19:34 +00003730 case ISD::DEBUGTRAP:
3731 return lowerDEBUGTRAP(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003732 case ISD::FABS:
3733 case ISD::FNEG:
Matt Arsenault36cdcfa2018-08-02 13:43:42 +00003734 case ISD::FCANONICALIZE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003735 return splitUnaryVectorOp(Op, DAG);
Matt Arsenault687ec752018-10-22 16:27:27 +00003736 case ISD::FMINNUM:
3737 case ISD::FMAXNUM:
3738 return lowerFMINNUM_FMAXNUM(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003739 case ISD::SHL:
3740 case ISD::SRA:
3741 case ISD::SRL:
3742 case ISD::ADD:
3743 case ISD::SUB:
3744 case ISD::MUL:
3745 case ISD::SMIN:
3746 case ISD::SMAX:
3747 case ISD::UMIN:
3748 case ISD::UMAX:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003749 case ISD::FADD:
3750 case ISD::FMUL:
Matt Arsenault687ec752018-10-22 16:27:27 +00003751 case ISD::FMINNUM_IEEE:
3752 case ISD::FMAXNUM_IEEE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003753 return splitBinaryVectorOp(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00003754 }
3755 return SDValue();
3756}
3757
Matt Arsenault1349a042018-05-22 06:32:10 +00003758static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3759 const SDLoc &DL,
3760 SelectionDAG &DAG, bool Unpacked) {
3761 if (!LoadVT.isVector())
3762 return Result;
3763
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003764 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3765 // Truncate to v2i16/v4i16.
3766 EVT IntLoadVT = LoadVT.changeTypeToInteger();
Matt Arsenault1349a042018-05-22 06:32:10 +00003767
3768 // Work around the legalizer not scalarizing truncate after vector op
3769 // legalization by not creating an intermediate vector trunc.
3770 SmallVector<SDValue, 4> Elts;
3771 DAG.ExtractVectorElements(Result, Elts);
3772 for (SDValue &Elt : Elts)
3773 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3774
3775 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3776
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003777 // Bitcast to original type (v2f16/v4f16).
Matt Arsenault1349a042018-05-22 06:32:10 +00003778 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003779 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003780
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003781 // Cast back to the original packed type.
3782 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3783}
3784
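// Adjust a d16 load so it matches the target's memory instructions: subtargets
// with unpacked d16 return one f16 result per 32-bit element, so load as
// v2i32/v4i32 and convert back to the packed v2f16/v4f16 type afterwards.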
Matt Arsenault1349a042018-05-22 06:32:10 +00003785SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3786 MemSDNode *M,
3787 SelectionDAG &DAG,
Tim Renouf366a49d2018-08-02 23:33:01 +00003788 ArrayRef<SDValue> Ops,
Matt Arsenault1349a042018-05-22 06:32:10 +00003789 bool IsIntrinsic) const {
3790 SDLoc DL(M);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003791
3792 bool Unpacked = Subtarget->hasUnpackedD16VMem();
Matt Arsenault1349a042018-05-22 06:32:10 +00003793 EVT LoadVT = M->getValueType(0);
3794
Matt Arsenault1349a042018-05-22 06:32:10 +00003795 EVT EquivLoadVT = LoadVT;
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003796 if (Unpacked && LoadVT.isVector()) {
3797 EquivLoadVT = LoadVT.isVector() ?
3798 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3799 LoadVT.getVectorNumElements()) : LoadVT;
Matt Arsenault1349a042018-05-22 06:32:10 +00003800 }
3801
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003802 // Change from v4f16/v2f16 to EquivLoadVT.
3803 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3804
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003805 SDValue Load
3806 = DAG.getMemIntrinsicNode(
3807 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3808 VTList, Ops, M->getMemoryVT(),
3809 M->getMemOperand());
3810 if (!Unpacked) // Just adjusted the opcode.
3811 return Load;
Changpeng Fang4737e892018-01-18 22:08:53 +00003812
Matt Arsenault1349a042018-05-22 06:32:10 +00003813 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
Changpeng Fang4737e892018-01-18 22:08:53 +00003814
Matt Arsenault1349a042018-05-22 06:32:10 +00003815 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003816}
3817
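// Lower the amdgcn icmp intrinsic: operand 3 carries an ICmpInst predicate.
// Predicates outside the valid range fold to undef, and i16 operands are
// promoted to i32 when i16 is not legal, before emitting AMDGPUISD::SETCC.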
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003818static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3819 SDNode *N, SelectionDAG &DAG) {
3820 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003821 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003822 int CondCode = CD->getSExtValue();
3823 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3824 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3825 return DAG.getUNDEF(VT);
3826
3827 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3828
3830 SDValue LHS = N->getOperand(1);
3831 SDValue RHS = N->getOperand(2);
3832
3833 SDLoc DL(N);
3834
3835 EVT CmpVT = LHS.getValueType();
3836 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3837 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3838 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3839 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3840 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3841 }
3842
3843 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3844
3845 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3846 DAG.getCondCode(CCOpcode));
3847}
3848
3849static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3850 SDNode *N, SelectionDAG &DAG) {
3851 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003852 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003853
3854 int CondCode = CD->getSExtValue();
3855 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3856 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3857 return DAG.getUNDEF(VT);
3858 }
3859
3860 SDValue Src0 = N->getOperand(1);
3861 SDValue Src1 = N->getOperand(2);
3862 EVT CmpVT = Src0.getValueType();
3863 SDLoc SL(N);
3864
3865 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3866 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3867 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3868 }
3869
3870 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3871 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3872 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3873 Src1, DAG.getCondCode(CCOpcode));
3874}
3875
Matt Arsenault3aef8092017-01-23 23:09:58 +00003876void SITargetLowering::ReplaceNodeResults(SDNode *N,
3877 SmallVectorImpl<SDValue> &Results,
3878 SelectionDAG &DAG) const {
3879 switch (N->getOpcode()) {
3880 case ISD::INSERT_VECTOR_ELT: {
3881 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3882 Results.push_back(Res);
3883 return;
3884 }
3885 case ISD::EXTRACT_VECTOR_ELT: {
3886 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3887 Results.push_back(Res);
3888 return;
3889 }
Matt Arsenault1f17c662017-02-22 00:27:34 +00003890 case ISD::INTRINSIC_WO_CHAIN: {
3891 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Marek Olsak13e47412018-01-31 20:18:04 +00003892 switch (IID) {
3893 case Intrinsic::amdgcn_cvt_pkrtz: {
Matt Arsenault1f17c662017-02-22 00:27:34 +00003894 SDValue Src0 = N->getOperand(1);
3895 SDValue Src1 = N->getOperand(2);
3896 SDLoc SL(N);
3897 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3898 Src0, Src1);
Matt Arsenault1f17c662017-02-22 00:27:34 +00003899 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3900 return;
3901 }
Marek Olsak13e47412018-01-31 20:18:04 +00003902 case Intrinsic::amdgcn_cvt_pknorm_i16:
3903 case Intrinsic::amdgcn_cvt_pknorm_u16:
3904 case Intrinsic::amdgcn_cvt_pk_i16:
3905 case Intrinsic::amdgcn_cvt_pk_u16: {
3906 SDValue Src0 = N->getOperand(1);
3907 SDValue Src1 = N->getOperand(2);
3908 SDLoc SL(N);
3909 unsigned Opcode;
3910
3911 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3912 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3913 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3914 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3915 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3916 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3917 else
3918 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3919
Matt Arsenault709374d2018-08-01 20:13:58 +00003920 EVT VT = N->getValueType(0);
3921 if (isTypeLegal(VT))
3922 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3923 else {
3924 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3925 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3926 }
Marek Olsak13e47412018-01-31 20:18:04 +00003927 return;
3928 }
3929 }
Simon Pilgrimd362d272017-07-08 19:50:03 +00003930 break;
Matt Arsenault1f17c662017-02-22 00:27:34 +00003931 }
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003932 case ISD::INTRINSIC_W_CHAIN: {
Matt Arsenault1349a042018-05-22 06:32:10 +00003933 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003934 Results.push_back(Res);
Matt Arsenault1349a042018-05-22 06:32:10 +00003935 Results.push_back(Res.getValue(1));
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003936 return;
3937 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003938
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003939 break;
3940 }
Matt Arsenault4a486232017-04-19 20:53:07 +00003941 case ISD::SELECT: {
3942 SDLoc SL(N);
3943 EVT VT = N->getValueType(0);
3944 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3945 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3946 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3947
3948 EVT SelectVT = NewVT;
3949 if (NewVT.bitsLT(MVT::i32)) {
3950 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3951 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3952 SelectVT = MVT::i32;
3953 }
3954
3955 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3956 N->getOperand(0), LHS, RHS);
3957
3958 if (NewVT != SelectVT)
3959 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3960 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3961 return;
3962 }
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003963 case ISD::FNEG: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003964 if (N->getValueType(0) != MVT::v2f16)
3965 break;
3966
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003967 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003968 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3969
3970 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3971 BC,
3972 DAG.getConstant(0x80008000, SL, MVT::i32));
3973 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3974 return;
3975 }
3976 case ISD::FABS: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003977 if (N->getValueType(0) != MVT::v2f16)
3978 break;
3979
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003980 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003981 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3982
3983 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3984 BC,
3985 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3986 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3987 return;
3988 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00003989 default:
3990 break;
3991 }
3992}
3993
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00003994/// Helper function for LowerBRCOND
Tom Stellardf8794352012-12-19 22:10:31 +00003995static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00003996
Tom Stellardf8794352012-12-19 22:10:31 +00003997 SDNode *Parent = Value.getNode();
3998 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3999 I != E; ++I) {
4000
4001 if (I.getUse().get() != Value)
4002 continue;
4003
4004 if (I->getOpcode() == Opcode)
4005 return *I;
4006 }
Craig Topper062a2ba2014-04-25 05:30:21 +00004007 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004008}
4009
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004010unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault6408c912016-09-16 22:11:18 +00004011 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
4012 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004013 case Intrinsic::amdgcn_if:
4014 return AMDGPUISD::IF;
4015 case Intrinsic::amdgcn_else:
4016 return AMDGPUISD::ELSE;
4017 case Intrinsic::amdgcn_loop:
4018 return AMDGPUISD::LOOP;
4019 case Intrinsic::amdgcn_end_cf:
4020 llvm_unreachable("should not occur");
Matt Arsenault6408c912016-09-16 22:11:18 +00004021 default:
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004022 return 0;
Matt Arsenault6408c912016-09-16 22:11:18 +00004023 }
Tom Stellardbc4497b2016-02-12 23:45:29 +00004024 }
Matt Arsenault6408c912016-09-16 22:11:18 +00004025
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004026 // break, if_break, else_break are all only used as inputs to loop, not
4027 // directly as branch conditions.
4028 return 0;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004029}
4030
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004031bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
4032 const Triple &TT = getTargetMachine().getTargetTriple();
Matt Arsenault0da63502018-08-31 05:49:54 +00004033 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4034 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004035 AMDGPU::shouldEmitConstantsToTextSection(TT);
4036}
4037
4038bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
Scott Linderd19d1972019-02-04 20:00:07 +00004039 // FIXME: Either avoid relying on address space here or change the default
4040 // address space for functions to avoid the explicit check.
4041 return (GV->getValueType()->isFunctionTy() ||
4042 GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004043 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4044 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004045 !shouldEmitFixup(GV) &&
4046 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4047}
4048
4049bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4050 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4051}
4052
Tom Stellardf8794352012-12-19 22:10:31 +00004053/// This transforms the control flow intrinsics to get the branch destination as
4054/// the last parameter, and also switches the branch target with BR when needed.
4055SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4056 SelectionDAG &DAG) const {
Andrew Trickef9de2a2013-05-25 02:42:55 +00004057 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00004058
4059 SDNode *Intr = BRCOND.getOperand(1).getNode();
4060 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00004061 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004062 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004063
4064 if (Intr->getOpcode() == ISD::SETCC) {
4065 // As long as we negate the condition, everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00004066 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00004067 Intr = SetCC->getOperand(0).getNode();
4068
4069 } else {
4070 // Get the target from BR if we don't negate the condition
4071 BR = findUser(BRCOND, ISD::BR);
4072 Target = BR->getOperand(1);
4073 }
4074
Matt Arsenault6408c912016-09-16 22:11:18 +00004075 // FIXME: This changes the types of the intrinsics instead of introducing new
4076 // nodes with the correct types.
4077 // e.g. llvm.amdgcn.loop
4078
4079 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4080 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4081
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004082 unsigned CFNode = isCFIntrinsic(Intr);
4083 if (CFNode == 0) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00004084 // This is a uniform branch so we don't need to legalize.
4085 return BRCOND;
4086 }
4087
Matt Arsenault6408c912016-09-16 22:11:18 +00004088 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4089 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4090
Tom Stellardbc4497b2016-02-12 23:45:29 +00004091 assert(!SetCC ||
4092 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00004093 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4094 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00004095
Tom Stellardf8794352012-12-19 22:10:31 +00004096 // operands of the new intrinsic call
4097 SmallVector<SDValue, 4> Ops;
Matt Arsenault6408c912016-09-16 22:11:18 +00004098 if (HaveChain)
4099 Ops.push_back(BRCOND.getOperand(0));
4100
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004101 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00004102 Ops.push_back(Target);
4103
Matt Arsenault6408c912016-09-16 22:11:18 +00004104 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4105
Tom Stellardf8794352012-12-19 22:10:31 +00004106 // build the new intrinsic call
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004107 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004108
Matt Arsenault6408c912016-09-16 22:11:18 +00004109 if (!HaveChain) {
4110 SDValue Ops[] = {
4111 SDValue(Result, 0),
4112 BRCOND.getOperand(0)
4113 };
4114
4115 Result = DAG.getMergeValues(Ops, DL).getNode();
4116 }
4117
Tom Stellardf8794352012-12-19 22:10:31 +00004118 if (BR) {
4119 // Give the branch instruction our target
4120 SDValue Ops[] = {
4121 BR->getOperand(0),
4122 BRCOND.getOperand(2)
4123 };
Chandler Carruth356665a2014-08-01 22:09:43 +00004124 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4125 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4126 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004127 }
4128
4129 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4130
4131 // Copy the intrinsic results to registers
4132 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4133 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4134 if (!CopyToReg)
4135 continue;
4136
4137 Chain = DAG.getCopyToReg(
4138 Chain, DL,
4139 CopyToReg->getOperand(1),
4140 SDValue(Result, i - 1),
4141 SDValue());
4142
4143 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4144 }
4145
4146 // Remove the old intrinsic from the chain
4147 DAG.ReplaceAllUsesOfValueWith(
4148 SDValue(Intr, Intr->getNumValues() - 1),
4149 Intr->getOperand(0));
4150
4151 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00004152}
4153
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00004154SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4155 SDValue Op,
4156 const SDLoc &DL,
4157 EVT VT) const {
4158 return Op.getValueType().bitsLE(VT) ?
4159 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4160 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4161}
4162
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004163SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004164 assert(Op.getValueType() == MVT::f16 &&
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004165 "Do not know how to custom lower FP_ROUND for non-f16 type");
4166
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004167 SDValue Src = Op.getOperand(0);
4168 EVT SrcVT = Src.getValueType();
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004169 if (SrcVT != MVT::f64)
4170 return Op;
4171
4172 SDLoc DL(Op);
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004173
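  // f64 -> f16 is lowered via FP_TO_FP16, which returns the f16 bits in an i32;
  // truncate that to i16 and bitcast it to f16.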
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004174 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4175 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
Mandeep Singh Grang5e1697e2017-06-06 05:08:36 +00004176 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004177}
4178
Matt Arsenault687ec752018-10-22 16:27:27 +00004179SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4180 SelectionDAG &DAG) const {
4181 EVT VT = Op.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00004182 const MachineFunction &MF = DAG.getMachineFunction();
4183 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4184 bool IsIEEEMode = Info->getMode().IEEE;
Matt Arsenault687ec752018-10-22 16:27:27 +00004185
4186 // FIXME: Assert during selection that this is only selected for
4187 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4188 // mode functions, but this happens to be OK since it's only done in cases
4189 // where it is known there are no sNaNs.
4190 if (IsIEEEMode)
4191 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4192
4193 if (VT == MVT::v4f16)
4194 return splitBinaryVectorOp(Op, DAG);
4195 return Op;
4196}
4197
Matt Arsenault3e025382017-04-24 17:49:13 +00004198SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4199 SDLoc SL(Op);
Matt Arsenault3e025382017-04-24 17:49:13 +00004200 SDValue Chain = Op.getOperand(0);
4201
Tom Stellard5bfbae52018-07-11 20:59:01 +00004202 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004203 !Subtarget->isTrapHandlerEnabled())
Matt Arsenault3e025382017-04-24 17:49:13 +00004204 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
Tony Tye43259df2018-05-16 16:19:34 +00004205
4206 MachineFunction &MF = DAG.getMachineFunction();
4207 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4208 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4209 assert(UserSGPR != AMDGPU::NoRegister);
4210 SDValue QueuePtr = CreateLiveInRegister(
4211 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
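  // The HSA trap handler expects the queue pointer in SGPR0/SGPR1, so copy it
  // there and pass it along with the LLVM trap ID.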
4212 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4213 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4214 QueuePtr, SDValue());
4215 SDValue Ops[] = {
4216 ToReg,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004217 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
Tony Tye43259df2018-05-16 16:19:34 +00004218 SGPR01,
4219 ToReg.getValue(1)
4220 };
4221 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4222}
4223
4224SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4225 SDLoc SL(Op);
4226 SDValue Chain = Op.getOperand(0);
4227 MachineFunction &MF = DAG.getMachineFunction();
4228
Tom Stellard5bfbae52018-07-11 20:59:01 +00004229 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004230 !Subtarget->isTrapHandlerEnabled()) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004231 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
Matt Arsenault3e025382017-04-24 17:49:13 +00004232 "debugtrap handler not supported",
4233 Op.getDebugLoc(),
4234 DS_Warning);
Matthias Braunf1caa282017-12-15 22:22:58 +00004235 LLVMContext &Ctx = MF.getFunction().getContext();
Matt Arsenault3e025382017-04-24 17:49:13 +00004236 Ctx.diagnose(NoTrap);
4237 return Chain;
4238 }
Matt Arsenault3e025382017-04-24 17:49:13 +00004239
Tony Tye43259df2018-05-16 16:19:34 +00004240 SDValue Ops[] = {
4241 Chain,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004242 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
Tony Tye43259df2018-05-16 16:19:34 +00004243 };
4244 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
Matt Arsenault3e025382017-04-24 17:49:13 +00004245}
4246
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004247SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
Matt Arsenault99c14522016-04-25 19:27:24 +00004248 SelectionDAG &DAG) const {
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004249 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4250 if (Subtarget->hasApertureRegs()) {
Matt Arsenault0da63502018-08-31 05:49:54 +00004251 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004252 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4253 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
Matt Arsenault0da63502018-08-31 05:49:54 +00004254 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004255 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4256 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
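    // Pack the s_getreg_b32 immediate: hwreg id, bit offset, and width - 1.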
4257 unsigned Encoding =
4258 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4259 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4260 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Matt Arsenaulte823d922017-02-18 18:29:53 +00004261
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004262 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4263 SDValue ApertureReg = SDValue(
4264 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4265 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4266 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
Matt Arsenaulte823d922017-02-18 18:29:53 +00004267 }
4268
Matt Arsenault99c14522016-04-25 19:27:24 +00004269 MachineFunction &MF = DAG.getMachineFunction();
4270 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004271 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4272 assert(UserSGPR != AMDGPU::NoRegister);
4273
Matt Arsenault99c14522016-04-25 19:27:24 +00004274 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004275 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00004276
4277 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4278 // private_segment_aperture_base_hi.
Matt Arsenault0da63502018-08-31 05:49:54 +00004279 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
Matt Arsenault99c14522016-04-25 19:27:24 +00004280
Matt Arsenaultb655fa92017-11-29 01:25:12 +00004281 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
Matt Arsenault99c14522016-04-25 19:27:24 +00004282
4283 // TODO: Use custom target PseudoSourceValue.
4284 // TODO: We should use the value from the IR intrinsic call, but it might not
4285 // be available, and it is unclear how we would get it here.
4286 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
Matt Arsenault0da63502018-08-31 05:49:54 +00004287 AMDGPUAS::CONSTANT_ADDRESS));
Matt Arsenault99c14522016-04-25 19:27:24 +00004288
4289 MachinePointerInfo PtrInfo(V, StructOffset);
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004290 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
Justin Lebar9c375812016-07-15 18:27:10 +00004291 MinAlign(64, StructOffset),
Justin Lebaradbf09e2016-09-11 01:38:58 +00004292 MachineMemOperand::MODereferenceable |
4293 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00004294}
4295
4296SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4297 SelectionDAG &DAG) const {
4298 SDLoc SL(Op);
4299 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4300
4301 SDValue Src = ASC->getOperand(0);
Matt Arsenault99c14522016-04-25 19:27:24 +00004302 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4303
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004304 const AMDGPUTargetMachine &TM =
4305 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4306
Matt Arsenault99c14522016-04-25 19:27:24 +00004307 // flat -> local/private
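  // The segment pointer is the low 32 bits of the flat pointer; only the null
  // value differs, so select the segment null when the flat pointer is null.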
Matt Arsenault0da63502018-08-31 05:49:54 +00004308 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004309 unsigned DestAS = ASC->getDestAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004310
Matt Arsenault0da63502018-08-31 05:49:54 +00004311 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4312 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004313 unsigned NullVal = TM.getNullPointerValue(DestAS);
4314 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault99c14522016-04-25 19:27:24 +00004315 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4316 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4317
4318 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4319 NonNull, Ptr, SegmentNullPtr);
4320 }
4321 }
4322
4323 // local/private -> flat
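  // Build the 64-bit flat pointer by pairing the 32-bit segment offset with the
  // aperture base in the high half, again special-casing the segment null.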
Matt Arsenault0da63502018-08-31 05:49:54 +00004324 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004325 unsigned SrcAS = ASC->getSrcAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004326
Matt Arsenault0da63502018-08-31 05:49:54 +00004327 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4328 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004329 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4330 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault971c85e2017-03-13 19:47:31 +00004331
Matt Arsenault99c14522016-04-25 19:27:24 +00004332 SDValue NonNull
4333 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4334
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004335 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00004336 SDValue CvtPtr
4337 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4338
4339 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4340 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4341 FlatNullPtr);
4342 }
4343 }
4344
4345 // global <-> flat are no-ops and never emitted.
4346
4347 const MachineFunction &MF = DAG.getMachineFunction();
4348 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Matthias Braunf1caa282017-12-15 22:22:58 +00004349 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
Matt Arsenault99c14522016-04-25 19:27:24 +00004350 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4351
4352 return DAG.getUNDEF(ASC->getValueType(0));
4353}
4354
Matt Arsenault3aef8092017-01-23 23:09:58 +00004355SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4356 SelectionDAG &DAG) const {
Matt Arsenault67a98152018-05-16 11:47:30 +00004357 SDValue Vec = Op.getOperand(0);
4358 SDValue InsVal = Op.getOperand(1);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004359 SDValue Idx = Op.getOperand(2);
Matt Arsenault67a98152018-05-16 11:47:30 +00004360 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004361 EVT EltVT = VecVT.getVectorElementType();
4362 unsigned VecSize = VecVT.getSizeInBits();
4363 unsigned EltSize = EltVT.getSizeInBits();
Matt Arsenault67a98152018-05-16 11:47:30 +00004364
Matt Arsenault9224c002018-06-05 19:52:46 +00004365
4366 assert(VecSize <= 64);
Matt Arsenault67a98152018-05-16 11:47:30 +00004367
4368 unsigned NumElts = VecVT.getVectorNumElements();
4369 SDLoc SL(Op);
4370 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4371
Matt Arsenault9224c002018-06-05 19:52:46 +00004372 if (NumElts == 4 && EltSize == 16 && KIdx) {
Matt Arsenault67a98152018-05-16 11:47:30 +00004373 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4374
4375 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4376 DAG.getConstant(0, SL, MVT::i32));
4377 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4378 DAG.getConstant(1, SL, MVT::i32));
4379
4380 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4381 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4382
4383 unsigned Idx = KIdx->getZExtValue();
4384 bool InsertLo = Idx < 2;
4385 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4386 InsertLo ? LoVec : HiVec,
4387 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4388 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4389
4390 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4391
4392 SDValue Concat = InsertLo ?
4393 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4394 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4395
4396 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4397 }
4398
Matt Arsenault3aef8092017-01-23 23:09:58 +00004399 if (isa<ConstantSDNode>(Idx))
4400 return SDValue();
4401
Matt Arsenault9224c002018-06-05 19:52:46 +00004402 MVT IntVT = MVT::getIntegerVT(VecSize);
Matt Arsenault67a98152018-05-16 11:47:30 +00004403
Matt Arsenault3aef8092017-01-23 23:09:58 +00004404 // Avoid stack access for dynamic indexing.
Matt Arsenault3aef8092017-01-23 23:09:58 +00004405 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
Tim Corringhamfa3e4e52019-02-01 16:51:09 +00004406
4407 // Create a congruent vector with the target value in each element so that
4408 // the required element can be masked and ORed into the target vector.
4409 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4410 DAG.getSplatBuildVector(VecVT, SL, InsVal));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004411
Matt Arsenault9224c002018-06-05 19:52:46 +00004412 assert(isPowerOf2_32(EltSize));
4413 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4414
Matt Arsenault3aef8092017-01-23 23:09:58 +00004415 // Convert vector index to bit-index.
Matt Arsenault9224c002018-06-05 19:52:46 +00004416 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004417
Matt Arsenault67a98152018-05-16 11:47:30 +00004418 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4419 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4420 DAG.getConstant(0xffff, SL, IntVT),
Matt Arsenault3aef8092017-01-23 23:09:58 +00004421 ScaledIdx);
4422
Matt Arsenault67a98152018-05-16 11:47:30 +00004423 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4424 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4425 DAG.getNOT(SL, BFM, IntVT), BCVec);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004426
Matt Arsenault67a98152018-05-16 11:47:30 +00004427 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4428 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004429}
4430
4431SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4432 SelectionDAG &DAG) const {
4433 SDLoc SL(Op);
4434
4435 EVT ResultVT = Op.getValueType();
4436 SDValue Vec = Op.getOperand(0);
4437 SDValue Idx = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004438 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004439 unsigned VecSize = VecVT.getSizeInBits();
4440 EVT EltVT = VecVT.getVectorElementType();
4441 assert(VecSize <= 64);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004442
Matt Arsenault98f29462017-05-17 20:30:58 +00004443 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4444
Hiroshi Inoue372ffa12018-04-13 11:37:06 +00004445 // Make sure we do any optimizations that will make it easier to fold
Matt Arsenault98f29462017-05-17 20:30:58 +00004446 // source modifiers before obscuring the value with bit operations.
4447
4448 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4449 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4450 return Combined;
4451
Matt Arsenault9224c002018-06-05 19:52:46 +00004452 unsigned EltSize = EltVT.getSizeInBits();
4453 assert(isPowerOf2_32(EltSize));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004454
Matt Arsenault9224c002018-06-05 19:52:46 +00004455 MVT IntVT = MVT::getIntegerVT(VecSize);
4456 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4457
4458 // Convert vector index to bit-index (* EltSize)
4459 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004460
Matt Arsenault67a98152018-05-16 11:47:30 +00004461 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4462 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004463
Matt Arsenault67a98152018-05-16 11:47:30 +00004464 if (ResultVT == MVT::f16) {
4465 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4466 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4467 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004468
Matt Arsenault67a98152018-05-16 11:47:30 +00004469 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4470}
4471
4472SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4473 SelectionDAG &DAG) const {
4474 SDLoc SL(Op);
4475 EVT VT = Op.getValueType();
Matt Arsenault67a98152018-05-16 11:47:30 +00004476
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004477 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4478 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4479
4480 // Turn into pair of packed build_vectors.
4481 // TODO: Special case for constants that can be materialized with s_mov_b64.
4482 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4483 { Op.getOperand(0), Op.getOperand(1) });
4484 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4485 { Op.getOperand(2), Op.getOperand(3) });
4486
4487 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4488 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4489
4490 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4491 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4492 }
4493
Matt Arsenault1349a042018-05-22 06:32:10 +00004494 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004495 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
Matt Arsenault67a98152018-05-16 11:47:30 +00004496
Matt Arsenault1349a042018-05-22 06:32:10 +00004497 SDValue Lo = Op.getOperand(0);
4498 SDValue Hi = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004499
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004500 // Avoid adding defined bits with the zero_extend.
4501 if (Hi.isUndef()) {
4502 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4503 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4504 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4505 }
Matt Arsenault67a98152018-05-16 11:47:30 +00004506
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004507 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004508 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4509
4510 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4511 DAG.getConstant(16, SL, MVT::i32));
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004512 if (Lo.isUndef())
4513 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4514
4515 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4516 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Matt Arsenault1349a042018-05-22 06:32:10 +00004517
4518 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004519 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004520}
4521
Tom Stellard418beb72016-07-13 14:23:33 +00004522bool
4523SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4524 // We can fold offsets for anything that doesn't require a GOT relocation.
Matt Arsenault0da63502018-08-31 05:49:54 +00004525 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4526 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4527 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004528 !shouldEmitGOTReloc(GA->getGlobal());
Tom Stellard418beb72016-07-13 14:23:33 +00004529}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004530
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004531static SDValue
4532buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4533 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4534 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004535 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4536 // lowered to the following code sequence:
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004537 //
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004538 // For constant address space:
4539 // s_getpc_b64 s[0:1]
4540 // s_add_u32 s0, s0, $symbol
4541 // s_addc_u32 s1, s1, 0
4542 //
4543 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4544 // a fixup or relocation is emitted to replace $symbol with a literal
4545 // constant, which is a pc-relative offset from the encoding of the $symbol
4546 // operand to the global variable.
4547 //
4548 // For global address space:
4549 // s_getpc_b64 s[0:1]
4550 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4551 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4552 //
4553 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4554 // fixups or relocations are emitted to replace $symbol@*@lo and
4555 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4556 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4557 // operand to the global variable.
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004558 //
4559 // What we want here is an offset from the value returned by s_getpc
4560 // (which is the address of the s_add_u32 instruction) to the global
4561 // variable, but since the encoding of $symbol starts 4 bytes after the start
4562 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4563 // small. This requires us to add 4 to the global variable offset in order to
4564 // compute the correct address.
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004565 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4566 GAFlags);
4567 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4568 GAFlags == SIInstrInfo::MO_NONE ?
4569 GAFlags : GAFlags + 1);
4570 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004571}
4572
Tom Stellard418beb72016-07-13 14:23:33 +00004573SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4574 SDValue Op,
4575 SelectionDAG &DAG) const {
4576 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00004577 const GlobalValue *GV = GSD->getGlobal();
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004578 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4579 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4580 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
Tom Stellard418beb72016-07-13 14:23:33 +00004581 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4582
4583 SDLoc DL(GSD);
Tom Stellard418beb72016-07-13 14:23:33 +00004584 EVT PtrVT = Op.getValueType();
4585
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004586 // FIXME: Should not make address space based decisions here.
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004587 if (shouldEmitFixup(GV))
Tom Stellard418beb72016-07-13 14:23:33 +00004588 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004589 else if (shouldEmitPCReloc(GV))
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004590 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4591 SIInstrInfo::MO_REL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004592
4593 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004594 SIInstrInfo::MO_GOTPCREL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004595
4596 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00004597 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Tom Stellard418beb72016-07-13 14:23:33 +00004598 const DataLayout &DataLayout = DAG.getDataLayout();
4599 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
Matt Arsenaultd77fcc22018-09-10 02:23:39 +00004600 MachinePointerInfo PtrInfo
4601 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
Tom Stellard418beb72016-07-13 14:23:33 +00004602
Justin Lebar9c375812016-07-15 18:27:10 +00004603 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
Justin Lebaradbf09e2016-09-11 01:38:58 +00004604 MachineMemOperand::MODereferenceable |
4605 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00004606}
4607
Benjamin Kramerbdc49562016-06-12 15:39:02 +00004608SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4609 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004610 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4611 // the destination register.
4612 //
Tom Stellardfc92e772015-05-12 14:18:14 +00004613 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4614 // so we will end up with redundant moves to m0.
4615 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004616 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4617
4618 // A Null SDValue creates a glue result.
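  // Callers typically thread that glue (value #1 of the returned node) into the
  // M0-consuming instruction, as the interp lowerings below do with
  // copyToM0(...).getValue(1).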
4619 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4620 V, Chain);
4621 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00004622}
4623
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004624SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4625 SDValue Op,
4626 MVT VT,
4627 unsigned Offset) const {
4628 SDLoc SL(Op);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00004629 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00004630 DAG.getEntryNode(), Offset, 4, false);
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004631 // The local size values will have the high 16 bits as zero.
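  // For example (illustrative): local_size_x is a 16-bit kernel argument that is
  // loaded as an i32 here; the AssertZext below records that the upper bits are
  // zero so later zero-extensions or masks of the value can be folded away.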
4632 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4633 DAG.getValueType(VT));
4634}
4635
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004636static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4637 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004638 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004639 "non-hsa intrinsic with hsa target",
4640 DL.getDebugLoc());
4641 DAG.getContext()->diagnose(BadIntrin);
4642 return DAG.getUNDEF(VT);
4643}
4644
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004645static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4646 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004647 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004648 "intrinsic not supported on subtarget",
4649 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00004650 DAG.getContext()->diagnose(BadIntrin);
4651 return DAG.getUNDEF(VT);
4652}
4653
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004654static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4655 ArrayRef<SDValue> Elts) {
4656 assert(!Elts.empty());
4657 MVT Type;
4658 unsigned NumElts;
4659
4660 if (Elts.size() == 1) {
4661 Type = MVT::f32;
4662 NumElts = 1;
4663 } else if (Elts.size() == 2) {
4664 Type = MVT::v2f32;
4665 NumElts = 2;
4666 } else if (Elts.size() <= 4) {
4667 Type = MVT::v4f32;
4668 NumElts = 4;
4669 } else if (Elts.size() <= 8) {
4670 Type = MVT::v8f32;
4671 NumElts = 8;
4672 } else {
4673 assert(Elts.size() <= 16);
4674 Type = MVT::v16f32;
4675 NumElts = 16;
4676 }
4677
4678 SmallVector<SDValue, 16> VecElts(NumElts);
4679 for (unsigned i = 0; i < Elts.size(); ++i) {
4680 SDValue Elt = Elts[i];
4681 if (Elt.getValueType() != MVT::f32)
4682 Elt = DAG.getBitcast(MVT::f32, Elt);
4683 VecElts[i] = Elt;
4684 }
4685 for (unsigned i = Elts.size(); i < NumElts; ++i)
4686 VecElts[i] = DAG.getUNDEF(MVT::f32);
4687
4688 if (NumElts == 1)
4689 return VecElts[0];
4690 return DAG.getBuildVector(Type, DL, VecElts);
4691}
4692
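// Decode the immediate cachepolicy operand into its component flags: glc is
// bit 0, slc is bit 1 and (when requested) dlc is bit 2. Returns false if any
// unknown bits remain set so the caller can bail out.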
4693static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004694 SDValue *GLC, SDValue *SLC, SDValue *DLC) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004695 auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004696
4697 uint64_t Value = CachePolicyConst->getZExtValue();
4698 SDLoc DL(CachePolicy);
4699 if (GLC) {
4700 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4701 Value &= ~(uint64_t)0x1;
4702 }
4703 if (SLC) {
4704 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4705 Value &= ~(uint64_t)0x2;
4706 }
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004707 if (DLC) {
4708 *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
4709 Value &= ~(uint64_t)0x4;
4710 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004711
4712 return Value == 0;
4713}
4714
David Stuttardf77079f2019-01-14 11:55:24 +00004715// Reconstruct the required return value for an image load intrinsic.
4716// This is more complicated due to the optional use of TexFailCtrl, which means
4717// the required return type is an aggregate.
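// For example (illustrative): a dmask-0x3 load returning {v2f32, i32} with TFE
// enabled is selected with a v3f32 result; the first two elements form the data
// part and the remaining dword is returned separately as the i32 texfail value,
// followed by the chain.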
4718static SDValue constructRetValue(SelectionDAG &DAG,
4719 MachineSDNode *Result,
4720 ArrayRef<EVT> ResultTypes,
4721 bool IsTexFail, bool Unpacked, bool IsD16,
4722 int DMaskPop, int NumVDataDwords,
4723 const SDLoc &DL, LLVMContext &Context) {
4724 // Determine the required return type. This is the same regardless of the IsTexFail flag.
4725 EVT ReqRetVT = ResultTypes[0];
4726 EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4727 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4728 EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4729 EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4730 : AdjEltVT
4731 : ReqRetVT;
4732
4733 // Extract the data part of the result.
4734 // Bitcast the result to the same type as the required return type.
4735 int NumElts;
4736 if (IsD16 && !Unpacked)
4737 NumElts = NumVDataDwords << 1;
4738 else
4739 NumElts = NumVDataDwords;
4740
4741 EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4742 : AdjEltVT;
4743
Tim Renouf6f0191a2019-03-22 15:21:11 +00004744 // Special case for v6f16. Rather than add support for this, use v3i32 to
David Stuttardf77079f2019-01-14 11:55:24 +00004745 // extract the data elements
Tim Renouf6f0191a2019-03-22 15:21:11 +00004746 bool V6F16Special = false;
4747 if (NumElts == 6) {
4748 CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
David Stuttardf77079f2019-01-14 11:55:24 +00004749 DMaskPop >>= 1;
4750 ReqRetNumElts >>= 1;
Tim Renouf6f0191a2019-03-22 15:21:11 +00004751 V6F16Special = true;
David Stuttardf77079f2019-01-14 11:55:24 +00004752 AdjVT = MVT::v2i32;
4753 }
4754
4755 SDValue N = SDValue(Result, 0);
4756 SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4757
4758 // Iterate over the result
4759 SmallVector<SDValue, 4> BVElts;
4760
4761 if (CastVT.isVector()) {
4762 DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4763 } else {
4764 BVElts.push_back(CastRes);
4765 }
4766 int ExtraElts = ReqRetNumElts - DMaskPop;
4767 while(ExtraElts--)
4768 BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4769
4770 SDValue PreTFCRes;
4771 if (ReqRetNumElts > 1) {
4772 SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4773 if (IsD16 && Unpacked)
4774 PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4775 else
4776 PreTFCRes = NewVec;
4777 } else {
4778 PreTFCRes = BVElts[0];
4779 }
4780
Tim Renouf6f0191a2019-03-22 15:21:11 +00004781 if (V6F16Special)
David Stuttardf77079f2019-01-14 11:55:24 +00004782 PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4783
4784 if (!IsTexFail) {
4785 if (Result->getNumValues() > 1)
4786 return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4787 else
4788 return PreTFCRes;
4789 }
4790
4791 // Extract the TexFail result and insert it into the aggregate return.
4792 SmallVector<SDValue, 1> TFCElt;
4793 DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4794 SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4795 return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4796}
4797
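// Decode the immediate texfailctrl operand: tfe is bit 0 and lwe is bit 1, and
// IsTexFail is set if any bit is set. Returns false if unknown bits remain so
// the caller can bail out.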
4798static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4799 SDValue *LWE, bool &IsTexFail) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004800 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
David Stuttardf77079f2019-01-14 11:55:24 +00004801
4802 uint64_t Value = TexFailCtrlConst->getZExtValue();
4803 if (Value) {
4804 IsTexFail = true;
4805 }
4806
4807 SDLoc DL(TexFailCtrlConst);
4808 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4809 Value &= ~(uint64_t)0x1;
4810 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4811 Value &= ~(uint64_t)0x2;
4812
4813 return Value == 0;
4814}
4815
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004816SDValue SITargetLowering::lowerImage(SDValue Op,
4817 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4818 SelectionDAG &DAG) const {
4819 SDLoc DL(Op);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004820 MachineFunction &MF = DAG.getMachineFunction();
4821 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004822 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4823 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4824 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004825 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4826 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4827 unsigned IntrOpcode = Intr->BaseOpcode;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004828 bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004829
David Stuttardf77079f2019-01-14 11:55:24 +00004830 SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4831 SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004832 bool IsD16 = false;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004833 bool IsA16 = false;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004834 SDValue VData;
4835 int NumVDataDwords;
David Stuttardf77079f2019-01-14 11:55:24 +00004836 bool AdjustRetType = false;
4837
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004838 unsigned AddrIdx; // Index of first address argument
4839 unsigned DMask;
David Stuttardf77079f2019-01-14 11:55:24 +00004840 unsigned DMaskLanes = 0;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004841
4842 if (BaseOpcode->Atomic) {
4843 VData = Op.getOperand(2);
4844
4845 bool Is64Bit = VData.getValueType() == MVT::i64;
4846 if (BaseOpcode->AtomicX2) {
4847 SDValue VData2 = Op.getOperand(3);
4848 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4849 {VData, VData2});
4850 if (Is64Bit)
4851 VData = DAG.getBitcast(MVT::v4i32, VData);
4852
4853 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4854 DMask = Is64Bit ? 0xf : 0x3;
4855 NumVDataDwords = Is64Bit ? 4 : 2;
4856 AddrIdx = 4;
4857 } else {
4858 DMask = Is64Bit ? 0x3 : 0x1;
4859 NumVDataDwords = Is64Bit ? 2 : 1;
4860 AddrIdx = 3;
4861 }
4862 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004863 unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004864 auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
David Stuttardf77079f2019-01-14 11:55:24 +00004865 DMask = DMaskConst->getZExtValue();
4866 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004867
4868 if (BaseOpcode->Store) {
4869 VData = Op.getOperand(2);
4870
4871 MVT StoreVT = VData.getSimpleValueType();
4872 if (StoreVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004873 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004874 !BaseOpcode->HasD16)
4875 return Op; // D16 is unsupported for this instruction
4876
4877 IsD16 = true;
4878 VData = handleD16VData(VData, DAG);
4879 }
4880
4881 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004882 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004883 // Work out the number of dwords based on the dmask popcount, the underlying
4884 // type and whether packing is supported.
4885 MVT LoadVT = ResultTypes[0].getSimpleVT();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004886 if (LoadVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004887 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004888 !BaseOpcode->HasD16)
4889 return Op; // D16 is unsupported for this instruction
4890
4891 IsD16 = true;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004892 }
4893
David Stuttardf77079f2019-01-14 11:55:24 +00004894 // Confirm that the return type is large enough for the dmask specified
4895 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4896 (!LoadVT.isVector() && DMaskLanes > 1))
4897 return Op;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004898
David Stuttardf77079f2019-01-14 11:55:24 +00004899 if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4900 NumVDataDwords = (DMaskLanes + 1) / 2;
4901 else
4902 NumVDataDwords = DMaskLanes;
4903
4904 AdjustRetType = true;
4905 }
David Stuttardc6603862018-11-29 20:14:17 +00004906
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004907 AddrIdx = DMaskIdx + 1;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004908 }
4909
Ryan Taylor1f334d02018-08-28 15:07:30 +00004910 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4911 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4912 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4913 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4914 NumCoords + NumLCM;
4915 unsigned NumMIVAddrs = NumVAddrs;
4916
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004917 SmallVector<SDValue, 4> VAddrs;
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004918
4919 // Optimize _L to _LZ when _L is zero
4920 if (LZMappingInfo) {
4921 if (auto ConstantLod =
Ryan Taylor1f334d02018-08-28 15:07:30 +00004922 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004923 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4924 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
Ryan Taylor1f334d02018-08-28 15:07:30 +00004925 NumMIVAddrs--; // remove 'lod'
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004926 }
4927 }
4928 }
4929
Ryan Taylor1f334d02018-08-28 15:07:30 +00004930 // Check for 16-bit addresses and pack them into pairs if so.
4931 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4932 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
Neil Henning63718b22018-10-31 10:34:48 +00004933 const MVT VAddrScalarVT = VAddrVT.getScalarType();
4934 if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
Ryan Taylor1f334d02018-08-28 15:07:30 +00004935 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4936 IsA16 = true;
Neil Henning63718b22018-10-31 10:34:48 +00004937 const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004938 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4939 SDValue AddrLo, AddrHi;
4940 // Push back extra arguments.
4941 if (i < DimIdx) {
4942 AddrLo = Op.getOperand(i);
4943 } else {
4944 AddrLo = Op.getOperand(i);
4945 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4946 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
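        // For example (illustrative): a 2D sample with f16 coords (s, t) packs
        // them into one dword as <s, t>, while a 3D sample packs <s, t> and
        // <r, undef>.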
4947 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004948 ((NumGradients / 2) % 2 == 1 &&
4949 (i == DimIdx + (NumGradients / 2) - 1 ||
Ryan Taylor1f334d02018-08-28 15:07:30 +00004950 i == DimIdx + NumGradients - 1))) {
4951 AddrHi = DAG.getUNDEF(MVT::f16);
4952 } else {
4953 AddrHi = Op.getOperand(i + 1);
4954 i++;
4955 }
Neil Henning63718b22018-10-31 10:34:48 +00004956 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
Ryan Taylor1f334d02018-08-28 15:07:30 +00004957 {AddrLo, AddrHi});
4958 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4959 }
4960 VAddrs.push_back(AddrLo);
4961 }
4962 } else {
4963 for (unsigned i = 0; i < NumMIVAddrs; ++i)
4964 VAddrs.push_back(Op.getOperand(AddrIdx + i));
4965 }
4966
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004967 // If the register allocator cannot place the address registers contiguously
4968 // without introducing moves, then using the non-sequential address encoding
4969 // is always preferable, since it saves VALU instructions and is usually a
4970 // wash in terms of code size or even better.
4971 //
4972 // However, we currently have no way of hinting to the register allocator that
4973 // MIMG addresses should be placed contiguously when it is possible to do so,
4974 // so force non-NSA for the common 2-address case as a heuristic.
4975 //
4976 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4977 // allocation when possible.
4978 bool UseNSA =
4979 ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
4980 SDValue VAddr;
4981 if (!UseNSA)
4982 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004983
4984 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4985 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4986 unsigned CtrlIdx; // Index of texfailctrl argument
4987 SDValue Unorm;
4988 if (!BaseOpcode->Sampler) {
4989 Unorm = True;
4990 CtrlIdx = AddrIdx + NumVAddrs + 1;
4991 } else {
4992 auto UnormConst =
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004993 cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004994
4995 Unorm = UnormConst->getZExtValue() ? True : False;
4996 CtrlIdx = AddrIdx + NumVAddrs + 3;
4997 }
4998
David Stuttardf77079f2019-01-14 11:55:24 +00004999 SDValue TFE;
5000 SDValue LWE;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005001 SDValue TexFail = Op.getOperand(CtrlIdx);
David Stuttardf77079f2019-01-14 11:55:24 +00005002 bool IsTexFail = false;
5003 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005004 return Op;
5005
David Stuttardf77079f2019-01-14 11:55:24 +00005006 if (IsTexFail) {
5007 if (!DMaskLanes) {
5008 // We expect to get an error flag since TFC is on and dmask is 0.
5009 // Force dmask to be at least 1, otherwise the instruction will fail.
5010 DMask = 0x1;
5011 DMaskLanes = 1;
5012 NumVDataDwords = 1;
5013 }
5014 NumVDataDwords += 1;
5015 AdjustRetType = true;
5016 }
5017
5018 // Something earlier tagged the return type as needing adjustment.
5019 // This happens if the instruction is a load or has TexFailCtrl flags set.
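  // For example (illustrative): a v4f16 load with dmask 0xf on a packed-D16
  // target only needs NumVDataDwords = 2, so the node is rebuilt below with a
  // v2f32 result and constructRetValue converts it back to v4f16.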
5020 if (AdjustRetType) {
5021 // NumVDataDwords reflects the true number of dwords required in the return type
5022 if (DMaskLanes == 0 && !BaseOpcode->Store) {
5023 // This is a no-op load; it can be eliminated.
5024 SDValue Undef = DAG.getUNDEF(Op.getValueType());
5025 if (isa<MemSDNode>(Op))
5026 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
5027 return Undef;
5028 }
5029
David Stuttardf77079f2019-01-14 11:55:24 +00005030 EVT NewVT = NumVDataDwords > 1 ?
5031 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
5032 : MVT::f32;
5033
5034 ResultTypes[0] = NewVT;
5035 if (ResultTypes.size() == 3) {
5036 // The original result was an aggregate type used for the TexFailCtrl result.
5037 // The actual instruction returns the vector type which has now been
5038 // created, so remove the aggregate result.
5039 ResultTypes.erase(&ResultTypes[1]);
5040 }
5041 }
5042
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005043 SDValue GLC;
5044 SDValue SLC;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005045 SDValue DLC;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005046 if (BaseOpcode->Atomic) {
5047 GLC = True; // TODO no-return optimization
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005048 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
5049 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005050 return Op;
5051 } else {
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005052 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
5053 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005054 return Op;
5055 }
5056
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005057 SmallVector<SDValue, 26> Ops;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005058 if (BaseOpcode->Store || BaseOpcode->Atomic)
5059 Ops.push_back(VData); // vdata
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005060 if (UseNSA) {
5061 for (const SDValue &Addr : VAddrs)
5062 Ops.push_back(Addr);
5063 } else {
5064 Ops.push_back(VAddr);
5065 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005066 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
5067 if (BaseOpcode->Sampler)
5068 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
5069 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005070 if (IsGFX10)
5071 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005072 Ops.push_back(Unorm);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005073 if (IsGFX10)
5074 Ops.push_back(DLC);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005075 Ops.push_back(GLC);
5076 Ops.push_back(SLC);
Ryan Taylor1f334d02018-08-28 15:07:30 +00005077 Ops.push_back(IsA16 && // a16 or r128
5078 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
David Stuttardf77079f2019-01-14 11:55:24 +00005079 Ops.push_back(TFE); // tfe
5080 Ops.push_back(LWE); // lwe
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005081 if (!IsGFX10)
5082 Ops.push_back(DimInfo->DA ? True : False);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005083 if (BaseOpcode->HasD16)
5084 Ops.push_back(IsD16 ? True : False);
5085 if (isa<MemSDNode>(Op))
5086 Ops.push_back(Op.getOperand(0)); // chain
5087
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005088 int NumVAddrDwords =
5089 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005090 int Opcode = -1;
5091
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005092 if (IsGFX10) {
5093 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
5094 UseNSA ? AMDGPU::MIMGEncGfx10NSA
5095 : AMDGPU::MIMGEncGfx10Default,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005096 NumVDataDwords, NumVAddrDwords);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005097 } else {
5098 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5099 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
5100 NumVDataDwords, NumVAddrDwords);
5101 if (Opcode == -1)
5102 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
5103 NumVDataDwords, NumVAddrDwords);
5104 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005105 assert(Opcode != -1);
5106
5107 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
5108 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
Chandler Carruth66654b72018-08-14 23:30:32 +00005109 MachineMemOperand *MemRef = MemOp->getMemOperand();
5110 DAG.setNodeMemRefs(NewNode, {MemRef});
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005111 }
5112
5113 if (BaseOpcode->AtomicX2) {
5114 SmallVector<SDValue, 1> Elt;
5115 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5116 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
David Stuttardf77079f2019-01-14 11:55:24 +00005117 } else if (!BaseOpcode->Store) {
5118 return constructRetValue(DAG, NewNode,
5119 OrigResultTypes, IsTexFail,
5120 Subtarget->hasUnpackedD16VMem(), IsD16,
5121 DMaskLanes, NumVDataDwords, DL,
5122 *DAG.getContext());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005123 }
5124
5125 return SDValue(NewNode, 0);
5126}
5127
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005128SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5129 SDValue Offset, SDValue GLC,
5130 SelectionDAG &DAG) const {
5131 MachineFunction &MF = DAG.getMachineFunction();
5132 MachineMemOperand *MMO = MF.getMachineMemOperand(
5133 MachinePointerInfo(),
5134 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5135 MachineMemOperand::MOInvariant,
5136 VT.getStoreSize(), VT.getStoreSize());
5137
5138 if (!Offset->isDivergent()) {
5139 SDValue Ops[] = {
5140 Rsrc,
5141 Offset, // Offset
5142 GLC // glc
5143 };
5144 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5145 DAG.getVTList(VT), Ops, VT, MMO);
5146 }
5147
5148 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5149 // assume that the buffer is unswizzled.
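  // For example (illustrative): a v8i32 result is fetched as NumLoads = 2
  // four-dword buffer loads at offset and offset + 16, then rejoined with
  // CONCAT_VECTORS below.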
5150 SmallVector<SDValue, 4> Loads;
5151 unsigned NumLoads = 1;
5152 MVT LoadVT = VT.getSimpleVT();
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005153 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
Simon Pilgrim44dfd812018-12-07 21:44:25 +00005154 assert((LoadVT.getScalarType() == MVT::i32 ||
5155 LoadVT.getScalarType() == MVT::f32) &&
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005156 isPowerOf2_32(NumElts));
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005157
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005158 if (NumElts == 8 || NumElts == 16) {
5159 NumLoads = NumElts == 16 ? 4 : 2;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005160 LoadVT = MVT::v4i32;
5161 }
5162
5163 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5164 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5165 SDValue Ops[] = {
5166 DAG.getEntryNode(), // Chain
5167 Rsrc, // rsrc
5168 DAG.getConstant(0, DL, MVT::i32), // vindex
5169 {}, // voffset
5170 {}, // soffset
5171 {}, // offset
5172 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5173 DAG.getConstant(0, DL, MVT::i1), // idxen
5174 };
5175
5176 // Use the alignment to ensure that the required offsets will fit into the
5177 // immediate offsets.
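  // For example (illustrative): with NumLoads = 4 the offset is aligned to
  // 16 * 4 = 64 bytes so that InstOffset, InstOffset + 16, + 32 and + 48 all
  // remain encodable as immediate offsets.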
5178 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5179
5180 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5181 for (unsigned i = 0; i < NumLoads; ++i) {
5182 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5183 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5184 Ops, LoadVT, MMO));
5185 }
5186
5187 if (VT == MVT::v8i32 || VT == MVT::v16i32)
5188 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5189
5190 return Loads[0];
5191}
5192
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005193SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5194 SelectionDAG &DAG) const {
5195 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00005196 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005197
5198 EVT VT = Op.getValueType();
5199 SDLoc DL(Op);
5200 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5201
Sanjay Patela2607012015-09-16 16:31:21 +00005202 // TODO: Should this propagate fast-math-flags?
5203
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005204 switch (IntrinsicID) {
Tom Stellard2f3f9852017-01-25 01:25:13 +00005205 case Intrinsic::amdgcn_implicit_buffer_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005206 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
Matt Arsenault10fc0622017-06-26 03:01:31 +00005207 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005208 return getPreloadedValue(DAG, *MFI, VT,
5209 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
Tom Stellard2f3f9852017-01-25 01:25:13 +00005210 }
Tom Stellard48f29f22015-11-26 00:43:29 +00005211 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00005212 case Intrinsic::amdgcn_queue_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005213 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005214 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005215 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005216 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00005217 DAG.getContext()->diagnose(BadIntrin);
5218 return DAG.getUNDEF(VT);
5219 }
5220
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005221 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5222 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5223 return getPreloadedValue(DAG, *MFI, VT, RegID);
Matt Arsenault48ab5262016-04-25 19:27:18 +00005224 }
Jan Veselyfea814d2016-06-21 20:46:20 +00005225 case Intrinsic::amdgcn_implicitarg_ptr: {
Matt Arsenault9166ce82017-07-28 15:52:08 +00005226 if (MFI->isEntryFunction())
5227 return getImplicitArgPtr(DAG, DL);
Matt Arsenault817c2532017-08-03 23:12:44 +00005228 return getPreloadedValue(DAG, *MFI, VT,
5229 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
Jan Veselyfea814d2016-06-21 20:46:20 +00005230 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005231 case Intrinsic::amdgcn_kernarg_segment_ptr: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005232 return getPreloadedValue(DAG, *MFI, VT,
5233 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005234 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005235 case Intrinsic::amdgcn_dispatch_id: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005236 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005237 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005238 case Intrinsic::amdgcn_rcp:
5239 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5240 case Intrinsic::amdgcn_rsq:
5241 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005242 case Intrinsic::amdgcn_rsq_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005243 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005244 return emitRemovedIntrinsicError(DAG, DL, VT);
5245
5246 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005247 case Intrinsic::amdgcn_rcp_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005248 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault32fc5272016-07-26 16:45:45 +00005249 return emitRemovedIntrinsicError(DAG, DL, VT);
5250 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00005251 case Intrinsic::amdgcn_rsq_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005252 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00005253 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00005254
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005255 Type *Type = VT.getTypeForEVT(*DAG.getContext());
5256 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5257 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5258
5259 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5260 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5261 DAG.getConstantFP(Max, DL, VT));
5262 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5263 DAG.getConstantFP(Min, DL, VT));
5264 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005265 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005266 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005267 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005268
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005269 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005270 SI::KernelInputOffsets::NGROUPS_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005271 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005272 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005273 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005274
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005275 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005276 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005277 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005278 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005279 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005280
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005281 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005282 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005283 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005284 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005285 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005286
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005287 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005288 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005289 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005290 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005291 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005292
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005293 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005294 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005295 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005296 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005297 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005298
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005299 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005300 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005301 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005302 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005303 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005304
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005305 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5306 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005307 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005308 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005309 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005310
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005311 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5312 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005313 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005314 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005315 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005316
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005317 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5318 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00005319 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005320 case Intrinsic::r600_read_tgid_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005321 return getPreloadedValue(DAG, *MFI, VT,
5322 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Matt Arsenault43976df2016-01-30 04:25:19 +00005323 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005324 case Intrinsic::r600_read_tgid_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005325 return getPreloadedValue(DAG, *MFI, VT,
5326 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
Matt Arsenault43976df2016-01-30 04:25:19 +00005327 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005328 case Intrinsic::r600_read_tgid_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005329 return getPreloadedValue(DAG, *MFI, VT,
5330 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Reid Kleckner4dc0b1a2018-11-01 19:54:45 +00005331 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005332 case Intrinsic::r600_read_tidig_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005333 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5334 SDLoc(DAG.getEntryNode()),
5335 MFI->getArgInfo().WorkItemIDX);
Matt Arsenault43976df2016-01-30 04:25:19 +00005336 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005337 case Intrinsic::r600_read_tidig_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005338 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5339 SDLoc(DAG.getEntryNode()),
5340 MFI->getArgInfo().WorkItemIDY);
Matt Arsenault43976df2016-01-30 04:25:19 +00005341 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005342 case Intrinsic::r600_read_tidig_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005343 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5344 SDLoc(DAG.getEntryNode()),
5345 MFI->getArgInfo().WorkItemIDZ);
Tim Renouf904343f2018-08-25 14:53:17 +00005346 case Intrinsic::amdgcn_s_buffer_load: {
5347 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005348 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5349 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005350 }
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00005351 case Intrinsic::amdgcn_fdiv_fast:
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00005352 return lowerFDIV_FAST(Op, DAG);
Tom Stellard2187bb82016-12-06 23:52:13 +00005353 case Intrinsic::amdgcn_interp_mov: {
5354 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5355 SDValue Glue = M0.getValue(1);
5356 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5357 Op.getOperand(2), Op.getOperand(3), Glue);
5358 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00005359 case Intrinsic::amdgcn_interp_p1: {
5360 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5361 SDValue Glue = M0.getValue(1);
5362 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5363 Op.getOperand(2), Op.getOperand(3), Glue);
5364 }
5365 case Intrinsic::amdgcn_interp_p2: {
5366 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5367 SDValue Glue = SDValue(M0.getNode(), 1);
5368 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5369 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5370 Glue);
5371 }
Tim Corringham824ca3f2019-01-28 13:48:59 +00005372 case Intrinsic::amdgcn_interp_p1_f16: {
5373 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5374 SDValue Glue = M0.getValue(1);
5375 if (getSubtarget()->getLDSBankCount() == 16) {
5376 // 16 bank LDS
5377 SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5378 DAG.getConstant(2, DL, MVT::i32), // P0
5379 Op.getOperand(2), // Attrchan
5380 Op.getOperand(3), // Attr
5381 Glue);
5382 SDValue Ops[] = {
5383 Op.getOperand(1), // Src0
5384 Op.getOperand(2), // Attrchan
5385 Op.getOperand(3), // Attr
5386 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5387 S, // Src2 - holds two f16 values selected by high
5388 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5389 Op.getOperand(4), // high
5390 DAG.getConstant(0, DL, MVT::i1), // $clamp
5391 DAG.getConstant(0, DL, MVT::i32) // $omod
5392 };
5393 return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5394 } else {
5395 // 32 bank LDS
5396 SDValue Ops[] = {
5397 Op.getOperand(1), // Src0
5398 Op.getOperand(2), // Attrchan
5399 Op.getOperand(3), // Attr
5400 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5401 Op.getOperand(4), // high
5402 DAG.getConstant(0, DL, MVT::i1), // $clamp
5403 DAG.getConstant(0, DL, MVT::i32), // $omod
5404 Glue
5405 };
5406 return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5407 }
5408 }
5409 case Intrinsic::amdgcn_interp_p2_f16: {
5410 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5411 SDValue Glue = SDValue(M0.getNode(), 1);
5412 SDValue Ops[] = {
5413 Op.getOperand(2), // Src0
5414 Op.getOperand(3), // Attrchan
5415 Op.getOperand(4), // Attr
5416 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5417 Op.getOperand(1), // Src2
5418 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5419 Op.getOperand(5), // high
5420 DAG.getConstant(0, DL, MVT::i1), // $clamp
5421 Glue
5422 };
5423 return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5424 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005425 case Intrinsic::amdgcn_sin:
5426 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5427
5428 case Intrinsic::amdgcn_cos:
5429 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5430
5431 case Intrinsic::amdgcn_log_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005432 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005433 return SDValue();
5434
5435 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005436 MF.getFunction(), "intrinsic not supported on subtarget",
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005437 DL.getDebugLoc());
5438 DAG.getContext()->diagnose(BadIntrin);
5439 return DAG.getUNDEF(VT);
5440 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005441 case Intrinsic::amdgcn_ldexp:
5442 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5443 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00005444
5445 case Intrinsic::amdgcn_fract:
5446 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5447
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005448 case Intrinsic::amdgcn_class:
5449 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5450 Op.getOperand(1), Op.getOperand(2));
5451 case Intrinsic::amdgcn_div_fmas:
5452 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5453 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5454 Op.getOperand(4));
5455
5456 case Intrinsic::amdgcn_div_fixup:
5457 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5458 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5459
5460 case Intrinsic::amdgcn_trig_preop:
5461 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5462 Op.getOperand(1), Op.getOperand(2));
5463 case Intrinsic::amdgcn_div_scale: {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005464 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005465
5466 // Translate to the operands expected by the machine instruction. The
5467 // first parameter must be the same as the first instruction.
5468 SDValue Numerator = Op.getOperand(1);
5469 SDValue Denominator = Op.getOperand(2);
5470
5471 // Note this order is the opposite of the machine instruction's operand order,
5472 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5473 // intrinsic has the numerator as the first operand to match a normal
5474 // division operation.
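    // For example (illustrative): llvm.amdgcn.div.scale(a, b, true) selects the
    // numerator, so Src0 below is 'a'; a false third operand selects the
    // denominator instead.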
5475
5476 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5477
5478 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5479 Denominator, Numerator);
5480 }
Wei Ding07e03712016-07-28 16:42:13 +00005481 case Intrinsic::amdgcn_icmp: {
Marek Olsak33eb4d92019-01-15 02:13:18 +00005482 // There is a Pat that handles this variant, so return it as-is.
5483 if (Op.getOperand(1).getValueType() == MVT::i1 &&
5484 Op.getConstantOperandVal(2) == 0 &&
5485 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5486 return Op;
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005487 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005488 }
5489 case Intrinsic::amdgcn_fcmp: {
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005490 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005491 }
Matt Arsenaultf84e5d92017-01-31 03:07:46 +00005492 case Intrinsic::amdgcn_fmed3:
5493 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5494 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Farhana Aleenc370d7b2018-07-16 18:19:59 +00005495 case Intrinsic::amdgcn_fdot2:
5496 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00005497 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5498 Op.getOperand(4));
Matt Arsenault32fc5272016-07-26 16:45:45 +00005499 case Intrinsic::amdgcn_fmul_legacy:
5500 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5501 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005502 case Intrinsic::amdgcn_sffbh:
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005503 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaultf5262252017-02-22 23:04:58 +00005504 case Intrinsic::amdgcn_sbfe:
5505 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5506 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5507 case Intrinsic::amdgcn_ubfe:
5508 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5509 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Marek Olsak13e47412018-01-31 20:18:04 +00005510 case Intrinsic::amdgcn_cvt_pkrtz:
5511 case Intrinsic::amdgcn_cvt_pknorm_i16:
5512 case Intrinsic::amdgcn_cvt_pknorm_u16:
5513 case Intrinsic::amdgcn_cvt_pk_i16:
5514 case Intrinsic::amdgcn_cvt_pk_u16: {
5515 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
Matt Arsenault1f17c662017-02-22 00:27:34 +00005516 EVT VT = Op.getValueType();
Marek Olsak13e47412018-01-31 20:18:04 +00005517 unsigned Opcode;
5518
5519 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5520 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5521 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5522 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5523 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5524 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5525 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5526 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5527 else
5528 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5529
Matt Arsenault709374d2018-08-01 20:13:58 +00005530 if (isTypeLegal(VT))
5531 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5532
Marek Olsak13e47412018-01-31 20:18:04 +00005533 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Matt Arsenault1f17c662017-02-22 00:27:34 +00005534 Op.getOperand(1), Op.getOperand(2));
5535 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5536 }
Connor Abbott8c217d02017-08-04 18:36:49 +00005537 case Intrinsic::amdgcn_wqm: {
5538 SDValue Src = Op.getOperand(1);
5539 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5540 0);
5541 }
Connor Abbott92638ab2017-08-04 18:36:52 +00005542 case Intrinsic::amdgcn_wwm: {
5543 SDValue Src = Op.getOperand(1);
5544 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5545 0);
5546 }
Stanislav Mekhanoshindacda792018-06-26 20:04:19 +00005547 case Intrinsic::amdgcn_fmad_ftz:
5548 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5549 Op.getOperand(2), Op.getOperand(3));
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005550 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005551 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5552 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5553 return lowerImage(Op, ImageDimIntr, DAG);
5554
Matt Arsenault754dd3e2017-04-03 18:08:08 +00005555 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005556 }
5557}
5558
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005559SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5560 SelectionDAG &DAG) const {
5561 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Tom Stellard6f9ef142016-12-20 17:19:44 +00005562 SDLoc DL(Op);
David Stuttard70e8bc12017-06-22 16:29:22 +00005563
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005564 switch (IntrID) {
Marek Olsakc5cec5e2019-01-16 15:43:53 +00005565 case Intrinsic::amdgcn_ds_ordered_add:
5566 case Intrinsic::amdgcn_ds_ordered_swap: {
5567 MemSDNode *M = cast<MemSDNode>(Op);
5568 SDValue Chain = M->getOperand(0);
5569 SDValue M0 = M->getOperand(2);
5570 SDValue Value = M->getOperand(3);
5571 unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5572 unsigned WaveRelease = M->getConstantOperandVal(8);
5573 unsigned WaveDone = M->getConstantOperandVal(9);
5574 unsigned ShaderType;
5575 unsigned Instruction;
5576
5577 switch (IntrID) {
5578 case Intrinsic::amdgcn_ds_ordered_add:
5579 Instruction = 0;
5580 break;
5581 case Intrinsic::amdgcn_ds_ordered_swap:
5582 Instruction = 1;
5583 break;
5584 }
5585
5586 if (WaveDone && !WaveRelease)
5587 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5588
5589 switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5590 case CallingConv::AMDGPU_CS:
5591 case CallingConv::AMDGPU_KERNEL:
5592 ShaderType = 0;
5593 break;
5594 case CallingConv::AMDGPU_PS:
5595 ShaderType = 1;
5596 break;
5597 case CallingConv::AMDGPU_VS:
5598 ShaderType = 2;
5599 break;
5600 case CallingConv::AMDGPU_GS:
5601 ShaderType = 3;
5602 break;
5603 default:
5604 report_fatal_error("ds_ordered_count unsupported for this calling conv");
5605 }
5606
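    // The 16-bit offset immediate packs the dword index (index * 4) into bits
    // 0..7 and wave_release, wave_done, the shader-type id and the instruction id
    // into bits 8..15, as assembled below.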
5607 unsigned Offset0 = OrderedCountIndex << 2;
5608 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5609 (Instruction << 4);
5610 unsigned Offset = Offset0 | (Offset1 << 8);
5611
5612 SDValue Ops[] = {
5613 Chain,
5614 Value,
5615 DAG.getTargetConstant(Offset, DL, MVT::i16),
5616 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5617 };
5618 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5619 M->getVTList(), Ops, M->getMemoryVT(),
5620 M->getMemOperand());
5621 }
Matt Arsenaulta5840c32019-01-22 18:36:06 +00005622 case Intrinsic::amdgcn_ds_fadd: {
5623 MemSDNode *M = cast<MemSDNode>(Op);
5624 unsigned Opc;
5625 switch (IntrID) {
5626 case Intrinsic::amdgcn_ds_fadd:
5627 Opc = ISD::ATOMIC_LOAD_FADD;
5628 break;
5629 }
5630
5631 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5632 M->getOperand(0), M->getOperand(2), M->getOperand(3),
5633 M->getMemOperand());
5634 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005635 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005636 case Intrinsic::amdgcn_atomic_dec:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005637 case Intrinsic::amdgcn_ds_fmin:
5638 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005639 MemSDNode *M = cast<MemSDNode>(Op);
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005640 unsigned Opc;
5641 switch (IntrID) {
5642 case Intrinsic::amdgcn_atomic_inc:
5643 Opc = AMDGPUISD::ATOMIC_INC;
5644 break;
5645 case Intrinsic::amdgcn_atomic_dec:
5646 Opc = AMDGPUISD::ATOMIC_DEC;
5647 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005648 case Intrinsic::amdgcn_ds_fmin:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005649 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5650 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005651 case Intrinsic::amdgcn_ds_fmax:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005652 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5653 break;
5654 default:
5655 llvm_unreachable("Unknown intrinsic!");
5656 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005657 SDValue Ops[] = {
5658 M->getOperand(0), // Chain
5659 M->getOperand(2), // Ptr
5660 M->getOperand(3) // Value
5661 };
5662
5663 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5664 M->getMemoryVT(), M->getMemOperand());
5665 }
Tom Stellard6f9ef142016-12-20 17:19:44 +00005666 case Intrinsic::amdgcn_buffer_load:
5667 case Intrinsic::amdgcn_buffer_load_format: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005668 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5669 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5670 unsigned IdxEn = 1;
5671 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5672 IdxEn = Idx->getZExtValue() != 0;
Tom Stellard6f9ef142016-12-20 17:19:44 +00005673 SDValue Ops[] = {
5674 Op.getOperand(0), // Chain
5675 Op.getOperand(2), // rsrc
5676 Op.getOperand(3), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005677 SDValue(), // voffset -- will be set by setBufferOffsets
5678 SDValue(), // soffset -- will be set by setBufferOffsets
5679 SDValue(), // offset -- will be set by setBufferOffsets
5680 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5681 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Tom Stellard6f9ef142016-12-20 17:19:44 +00005682 };
Tom Stellard6f9ef142016-12-20 17:19:44 +00005683
Tim Renouf4f703f52018-08-21 11:07:10 +00005684 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005685 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5686 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
Tim Renouf4f703f52018-08-21 11:07:10 +00005687
5688 EVT VT = Op.getValueType();
5689 EVT IntVT = VT.changeTypeToInteger();
5690 auto *M = cast<MemSDNode>(Op);
5691 EVT LoadVT = Op.getValueType();
5692
5693 if (LoadVT.getScalarType() == MVT::f16)
5694 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5695 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005696
5697 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5698 if (LoadVT.getScalarType() == MVT::i8 ||
5699 LoadVT.getScalarType() == MVT::i16)
5700 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5701
Tim Renouf677387d2019-03-22 14:58:02 +00005702 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5703 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005704 }
5705 case Intrinsic::amdgcn_raw_buffer_load:
5706 case Intrinsic::amdgcn_raw_buffer_load_format: {
5707 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5708 SDValue Ops[] = {
5709 Op.getOperand(0), // Chain
5710 Op.getOperand(2), // rsrc
5711 DAG.getConstant(0, DL, MVT::i32), // vindex
5712 Offsets.first, // voffset
5713 Op.getOperand(4), // soffset
5714 Offsets.second, // offset
5715 Op.getOperand(5), // cachepolicy
5716 DAG.getConstant(0, DL, MVT::i1), // idxen
5717 };
5718
5719 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5720 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5721
5722 EVT VT = Op.getValueType();
5723 EVT IntVT = VT.changeTypeToInteger();
5724 auto *M = cast<MemSDNode>(Op);
5725 EVT LoadVT = Op.getValueType();
5726
5727 if (LoadVT.getScalarType() == MVT::f16)
5728 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5729 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005730
5731 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5732 if (LoadVT.getScalarType() == MVT::i8 ||
5733 LoadVT.getScalarType() == MVT::i16)
5734 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5735
Tim Renouf677387d2019-03-22 14:58:02 +00005736 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5737 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005738 }
5739 case Intrinsic::amdgcn_struct_buffer_load:
5740 case Intrinsic::amdgcn_struct_buffer_load_format: {
5741 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5742 SDValue Ops[] = {
5743 Op.getOperand(0), // Chain
5744 Op.getOperand(2), // rsrc
5745 Op.getOperand(3), // vindex
5746 Offsets.first, // voffset
5747 Op.getOperand(5), // soffset
5748 Offsets.second, // offset
5749 Op.getOperand(6), // cachepolicy
5750 DAG.getConstant(1, DL, MVT::i1), // idxen
5751 };
5752
5753 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5754 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5755
Tom Stellard6f9ef142016-12-20 17:19:44 +00005756 EVT VT = Op.getValueType();
5757 EVT IntVT = VT.changeTypeToInteger();
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005758 auto *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005759 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005760
Tim Renouf366a49d2018-08-02 23:33:01 +00005761 if (LoadVT.getScalarType() == MVT::f16)
5762 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5763 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005764
5765 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5766 if (LoadVT.getScalarType() == MVT::i8 ||
5767 LoadVT.getScalarType() == MVT::i16)
5768 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5769
Tim Renouf677387d2019-03-22 14:58:02 +00005770 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5771 M->getMemOperand(), DAG);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005772 }
David Stuttard70e8bc12017-06-22 16:29:22 +00005773 case Intrinsic::amdgcn_tbuffer_load: {
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005774 MemSDNode *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005775 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005776
Tim Renouf35484c92018-08-21 11:06:05 +00005777 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5778 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5779 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5780 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5781 unsigned IdxEn = 1;
5782 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5783 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00005784 SDValue Ops[] = {
5785 Op.getOperand(0), // Chain
5786 Op.getOperand(2), // rsrc
5787 Op.getOperand(3), // vindex
5788 Op.getOperand(4), // voffset
5789 Op.getOperand(5), // soffset
5790 Op.getOperand(6), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00005791 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5792 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5793 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5794 };
5795
5796 if (LoadVT.getScalarType() == MVT::f16)
5797 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5798 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005799 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5800 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5801 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00005802 }
5803 case Intrinsic::amdgcn_raw_tbuffer_load: {
5804 MemSDNode *M = cast<MemSDNode>(Op);
5805 EVT LoadVT = Op.getValueType();
5806 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5807
5808 SDValue Ops[] = {
5809 Op.getOperand(0), // Chain
5810 Op.getOperand(2), // rsrc
5811 DAG.getConstant(0, DL, MVT::i32), // vindex
5812 Offsets.first, // voffset
5813 Op.getOperand(4), // soffset
5814 Offsets.second, // offset
5815 Op.getOperand(5), // format
5816 Op.getOperand(6), // cachepolicy
5817 DAG.getConstant(0, DL, MVT::i1), // idxen
5818 };
5819
5820 if (LoadVT.getScalarType() == MVT::f16)
5821 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5822 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005823 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5824 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5825 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00005826 }
5827 case Intrinsic::amdgcn_struct_tbuffer_load: {
5828 MemSDNode *M = cast<MemSDNode>(Op);
5829 EVT LoadVT = Op.getValueType();
5830 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5831
5832 SDValue Ops[] = {
5833 Op.getOperand(0), // Chain
5834 Op.getOperand(2), // rsrc
5835 Op.getOperand(3), // vindex
5836 Offsets.first, // voffset
5837 Op.getOperand(5), // soffset
5838 Offsets.second, // offset
5839 Op.getOperand(6), // format
5840 Op.getOperand(7), // cachepolicy
5841 DAG.getConstant(1, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00005842 };
5843
Tim Renouf366a49d2018-08-02 23:33:01 +00005844 if (LoadVT.getScalarType() == MVT::f16)
5845 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5846 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005847 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5848 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5849 DAG);
David Stuttard70e8bc12017-06-22 16:29:22 +00005850 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005851 case Intrinsic::amdgcn_buffer_atomic_swap:
5852 case Intrinsic::amdgcn_buffer_atomic_add:
5853 case Intrinsic::amdgcn_buffer_atomic_sub:
5854 case Intrinsic::amdgcn_buffer_atomic_smin:
5855 case Intrinsic::amdgcn_buffer_atomic_umin:
5856 case Intrinsic::amdgcn_buffer_atomic_smax:
5857 case Intrinsic::amdgcn_buffer_atomic_umax:
5858 case Intrinsic::amdgcn_buffer_atomic_and:
5859 case Intrinsic::amdgcn_buffer_atomic_or:
5860 case Intrinsic::amdgcn_buffer_atomic_xor: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005861 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5862 unsigned IdxEn = 1;
5863 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5864 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005865 SDValue Ops[] = {
5866 Op.getOperand(0), // Chain
5867 Op.getOperand(2), // vdata
5868 Op.getOperand(3), // rsrc
5869 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005870 SDValue(), // voffset -- will be set by setBufferOffsets
5871 SDValue(), // soffset -- will be set by setBufferOffsets
5872 SDValue(), // offset -- will be set by setBufferOffsets
5873 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5874 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005875 };
Tim Renouf4f703f52018-08-21 11:07:10 +00005876 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005877 EVT VT = Op.getValueType();
5878
5879 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005880 unsigned Opcode = 0;
5881
5882 switch (IntrID) {
5883 case Intrinsic::amdgcn_buffer_atomic_swap:
5884 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5885 break;
5886 case Intrinsic::amdgcn_buffer_atomic_add:
5887 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5888 break;
5889 case Intrinsic::amdgcn_buffer_atomic_sub:
5890 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5891 break;
5892 case Intrinsic::amdgcn_buffer_atomic_smin:
5893 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5894 break;
5895 case Intrinsic::amdgcn_buffer_atomic_umin:
5896 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5897 break;
5898 case Intrinsic::amdgcn_buffer_atomic_smax:
5899 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5900 break;
5901 case Intrinsic::amdgcn_buffer_atomic_umax:
5902 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5903 break;
5904 case Intrinsic::amdgcn_buffer_atomic_and:
5905 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5906 break;
5907 case Intrinsic::amdgcn_buffer_atomic_or:
5908 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5909 break;
5910 case Intrinsic::amdgcn_buffer_atomic_xor:
5911 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5912 break;
5913 default:
5914 llvm_unreachable("unhandled atomic opcode");
5915 }
5916
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005917 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5918 M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005919 }
Tim Renouf4f703f52018-08-21 11:07:10 +00005920 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5921 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5922 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5923 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5924 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5925 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5926 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5927 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5928 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5929 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5930 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5931 SDValue Ops[] = {
5932 Op.getOperand(0), // Chain
5933 Op.getOperand(2), // vdata
5934 Op.getOperand(3), // rsrc
5935 DAG.getConstant(0, DL, MVT::i32), // vindex
5936 Offsets.first, // voffset
5937 Op.getOperand(5), // soffset
5938 Offsets.second, // offset
5939 Op.getOperand(6), // cachepolicy
5940 DAG.getConstant(0, DL, MVT::i1), // idxen
5941 };
5942 EVT VT = Op.getValueType();
Marek Olsak5cec6412017-11-09 01:52:48 +00005943
Tim Renouf4f703f52018-08-21 11:07:10 +00005944 auto *M = cast<MemSDNode>(Op);
5945 unsigned Opcode = 0;
5946
5947 switch (IntrID) {
5948 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5949 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5950 break;
5951 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5952 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5953 break;
5954 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5955 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5956 break;
5957 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5958 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5959 break;
5960 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5961 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5962 break;
5963 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5964 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5965 break;
5966 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5967 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5968 break;
5969 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5970 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5971 break;
5972 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5973 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5974 break;
5975 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5976 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5977 break;
5978 default:
5979 llvm_unreachable("unhandled atomic opcode");
5980 }
5981
5982 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5983 M->getMemOperand());
5984 }
5985 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5986 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5987 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5988 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5989 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5990 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5991 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5992 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5993 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5994 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5995 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5996 SDValue Ops[] = {
5997 Op.getOperand(0), // Chain
5998 Op.getOperand(2), // vdata
5999 Op.getOperand(3), // rsrc
6000 Op.getOperand(4), // vindex
6001 Offsets.first, // voffset
6002 Op.getOperand(6), // soffset
6003 Offsets.second, // offset
6004 Op.getOperand(7), // cachepolicy
6005 DAG.getConstant(1, DL, MVT::i1), // idxen
6006 };
6007 EVT VT = Op.getValueType();
6008
6009 auto *M = cast<MemSDNode>(Op);
6010 unsigned Opcode = 0;
6011
6012 switch (IntrID) {
6013 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6014 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6015 break;
6016 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6017 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6018 break;
6019 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6020 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6021 break;
6022 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6023 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6024 break;
6025 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6026 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6027 break;
6028 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6029 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6030 break;
6031 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6032 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6033 break;
6034 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6035 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6036 break;
6037 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6038 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6039 break;
6040 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6041 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6042 break;
6043 default:
6044 llvm_unreachable("unhandled atomic opcode");
6045 }
6046
6047 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6048 M->getMemOperand());
6049 }
Marek Olsak5cec6412017-11-09 01:52:48 +00006050 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
Tim Renouf4f703f52018-08-21 11:07:10 +00006051 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6052 unsigned IdxEn = 1;
6053 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
6054 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006055 SDValue Ops[] = {
6056 Op.getOperand(0), // Chain
6057 Op.getOperand(2), // src
6058 Op.getOperand(3), // cmp
6059 Op.getOperand(4), // rsrc
6060 Op.getOperand(5), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006061 SDValue(), // voffset -- will be set by setBufferOffsets
6062 SDValue(), // soffset -- will be set by setBufferOffsets
6063 SDValue(), // offset -- will be set by setBufferOffsets
6064 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6065 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6066 };
6067 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
6068 EVT VT = Op.getValueType();
6069 auto *M = cast<MemSDNode>(Op);
6070
6071 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6072 Op->getVTList(), Ops, VT, M->getMemOperand());
6073 }
6074 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
6075 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6076 SDValue Ops[] = {
6077 Op.getOperand(0), // Chain
6078 Op.getOperand(2), // src
6079 Op.getOperand(3), // cmp
6080 Op.getOperand(4), // rsrc
6081 DAG.getConstant(0, DL, MVT::i32), // vindex
6082 Offsets.first, // voffset
6083 Op.getOperand(6), // soffset
6084 Offsets.second, // offset
6085 Op.getOperand(7), // cachepolicy
6086 DAG.getConstant(0, DL, MVT::i1), // idxen
6087 };
6088 EVT VT = Op.getValueType();
6089 auto *M = cast<MemSDNode>(Op);
6090
6091 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6092 Op->getVTList(), Ops, VT, M->getMemOperand());
6093 }
6094 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
6095 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
6096 SDValue Ops[] = {
6097 Op.getOperand(0), // Chain
6098 Op.getOperand(2), // src
6099 Op.getOperand(3), // cmp
6100 Op.getOperand(4), // rsrc
6101 Op.getOperand(5), // vindex
6102 Offsets.first, // voffset
6103 Op.getOperand(7), // soffset
6104 Offsets.second, // offset
6105 Op.getOperand(8), // cachepolicy
6106 DAG.getConstant(1, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006107 };
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006108 EVT VT = Op.getValueType();
6109 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00006110
6111 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006112 Op->getVTList(), Ops, VT, M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006113 }
6114
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006115 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006116 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6117 AMDGPU::getImageDimIntrinsicInfo(IntrID))
6118 return lowerImage(Op, ImageDimIntr, DAG);
Matt Arsenault1349a042018-05-22 06:32:10 +00006119
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006120 return SDValue();
6121 }
6122}
6123
Tim Renouf677387d2019-03-22 14:58:02 +00006124// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
6125// dwordx4 if on SI.
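// For example, a v3f32 buffer load on such a subtarget is emitted as a v4f32
// load with a 16-byte memory operand, and the original three elements are
// recovered afterwards with an EXTRACT_SUBVECTOR at index 0.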
6126SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
6127 SDVTList VTList,
6128 ArrayRef<SDValue> Ops, EVT MemVT,
6129 MachineMemOperand *MMO,
6130 SelectionDAG &DAG) const {
6131 EVT VT = VTList.VTs[0];
6132 EVT WidenedVT = VT;
6133 EVT WidenedMemVT = MemVT;
6134 if (!Subtarget->hasDwordx3LoadStores() &&
6135 (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
6136 WidenedVT = EVT::getVectorVT(*DAG.getContext(),
6137 WidenedVT.getVectorElementType(), 4);
6138 WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
6139 WidenedMemVT.getVectorElementType(), 4);
6140 MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
6141 }
6142
6143 assert(VTList.NumVTs == 2);
6144 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
6145
6146 auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
6147 WidenedMemVT, MMO);
6148 if (WidenedVT != VT) {
6149 auto Extract = DAG.getNode(
6150 ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
6151 DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
6152 NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
6153 }
6154 return NewOp;
6155}
6156
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006157SDValue SITargetLowering::handleD16VData(SDValue VData,
6158 SelectionDAG &DAG) const {
6159 EVT StoreVT = VData.getValueType();
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006160
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006161 // No change for f16 and legal vector D16 types.
Matt Arsenault1349a042018-05-22 06:32:10 +00006162 if (!StoreVT.isVector())
6163 return VData;
6164
6165 SDLoc DL(VData);
6166 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
6167
6168 if (Subtarget->hasUnpackedD16VMem()) {
6169 // We need to unpack the packed data to store.
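    // For example, a packed v2f16 value is bitcast to v2i16, zero-extended
    // elementwise to v2i32 and unrolled, so that each 16-bit element ends up
    // in its own 32-bit lane as the unpacked-D16 instructions expect.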
6170 EVT IntStoreVT = StoreVT.changeTypeToInteger();
6171 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6172
6173 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6174 StoreVT.getVectorNumElements());
6175 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6176 return DAG.UnrollVectorOp(ZExt.getNode());
6177 }
6178
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006179 assert(isTypeLegal(StoreVT));
6180 return VData;
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006181}
6182
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006183SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6184 SelectionDAG &DAG) const {
Tom Stellardfc92e772015-05-12 14:18:14 +00006185 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006186 SDValue Chain = Op.getOperand(0);
6187 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
David Stuttard70e8bc12017-06-22 16:29:22 +00006188 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006189
6190 switch (IntrinsicID) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00006191 case Intrinsic::amdgcn_exp: {
Matt Arsenault4165efd2017-01-17 07:26:53 +00006192 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6193 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6194 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6195 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6196
6197 const SDValue Ops[] = {
6198 Chain,
6199 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6200 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6201 Op.getOperand(4), // src0
6202 Op.getOperand(5), // src1
6203 Op.getOperand(6), // src2
6204 Op.getOperand(7), // src3
6205 DAG.getTargetConstant(0, DL, MVT::i1), // compr
6206 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6207 };
6208
6209 unsigned Opc = Done->isNullValue() ?
6210 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6211 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6212 }
6213 case Intrinsic::amdgcn_exp_compr: {
6214 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6215 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6216 SDValue Src0 = Op.getOperand(4);
6217 SDValue Src1 = Op.getOperand(5);
6218 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6219 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6220
6221 SDValue Undef = DAG.getUNDEF(MVT::f32);
6222 const SDValue Ops[] = {
6223 Chain,
6224 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6225 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6226 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6227 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6228 Undef, // src2
6229 Undef, // src3
6230 DAG.getTargetConstant(1, DL, MVT::i1), // compr
6231 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6232 };
6233
6234 unsigned Opc = Done->isNullValue() ?
6235 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6236 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6237 }
6238 case Intrinsic::amdgcn_s_sendmsg:
Matt Arsenaultd3e5cb72017-02-16 02:01:17 +00006239 case Intrinsic::amdgcn_s_sendmsghalt: {
6240 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6241 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Tom Stellardfc92e772015-05-12 14:18:14 +00006242 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6243 SDValue Glue = Chain.getValue(1);
Matt Arsenaulta78ca622017-02-15 22:17:09 +00006244 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Jan Veselyd48445d2017-01-04 18:06:55 +00006245 Op.getOperand(2), Glue);
6246 }
Marek Olsak2d825902017-04-28 20:21:58 +00006247 case Intrinsic::amdgcn_init_exec: {
6248 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6249 Op.getOperand(2));
6250 }
6251 case Intrinsic::amdgcn_init_exec_from_input: {
6252 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6253 Op.getOperand(2), Op.getOperand(3));
6254 }
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006255 case Intrinsic::amdgcn_s_barrier: {
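    // If the whole workgroup fits in a single wave, no cross-wave
    // synchronization is needed, so at -O1 and above the barrier is lowered
    // to WAVE_BARRIER, which only constrains scheduling and emits no code.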
6256 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00006257 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matthias Braunf1caa282017-12-15 22:22:58 +00006258 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006259 if (WGSize <= ST.getWavefrontSize())
6260 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6261 Op.getOperand(0)), 0);
6262 }
6263 return SDValue();
6264 }
David Stuttard70e8bc12017-06-22 16:29:22 +00006265 case Intrinsic::amdgcn_tbuffer_store: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006266 SDValue VData = Op.getOperand(2);
6267 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6268 if (IsD16)
6269 VData = handleD16VData(VData, DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006270 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6271 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6272 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6273 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6274 unsigned IdxEn = 1;
6275 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6276 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00006277 SDValue Ops[] = {
6278 Chain,
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006279 VData, // vdata
David Stuttard70e8bc12017-06-22 16:29:22 +00006280 Op.getOperand(3), // rsrc
6281 Op.getOperand(4), // vindex
6282 Op.getOperand(5), // voffset
6283 Op.getOperand(6), // soffset
6284 Op.getOperand(7), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006285 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6286 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6287 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6288 };
6289 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6290 AMDGPUISD::TBUFFER_STORE_FORMAT;
6291 MemSDNode *M = cast<MemSDNode>(Op);
6292 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6293 M->getMemoryVT(), M->getMemOperand());
6294 }
6295
6296 case Intrinsic::amdgcn_struct_tbuffer_store: {
6297 SDValue VData = Op.getOperand(2);
6298 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6299 if (IsD16)
6300 VData = handleD16VData(VData, DAG);
6301 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6302 SDValue Ops[] = {
6303 Chain,
6304 VData, // vdata
6305 Op.getOperand(3), // rsrc
6306 Op.getOperand(4), // vindex
6307 Offsets.first, // voffset
6308 Op.getOperand(6), // soffset
6309 Offsets.second, // offset
6310 Op.getOperand(7), // format
6311 Op.getOperand(8), // cachepolicy
6312 DAG.getConstant(1, DL, MVT::i1), // idxen
6313 };
6314 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6315 AMDGPUISD::TBUFFER_STORE_FORMAT;
6316 MemSDNode *M = cast<MemSDNode>(Op);
6317 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6318 M->getMemoryVT(), M->getMemOperand());
6319 }
6320
6321 case Intrinsic::amdgcn_raw_tbuffer_store: {
6322 SDValue VData = Op.getOperand(2);
6323 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6324 if (IsD16)
6325 VData = handleD16VData(VData, DAG);
6326 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6327 SDValue Ops[] = {
6328 Chain,
6329 VData, // vdata
6330 Op.getOperand(3), // rsrc
6331 DAG.getConstant(0, DL, MVT::i32), // vindex
6332 Offsets.first, // voffset
6333 Op.getOperand(5), // soffset
6334 Offsets.second, // offset
6335 Op.getOperand(6), // format
6336 Op.getOperand(7), // cachepolicy
6337 DAG.getConstant(0, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006338 };
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006339 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6340 AMDGPUISD::TBUFFER_STORE_FORMAT;
6341 MemSDNode *M = cast<MemSDNode>(Op);
6342 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6343 M->getMemoryVT(), M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00006344 }
6345
Marek Olsak5cec6412017-11-09 01:52:48 +00006346 case Intrinsic::amdgcn_buffer_store:
6347 case Intrinsic::amdgcn_buffer_store_format: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006348 SDValue VData = Op.getOperand(2);
6349 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6350 if (IsD16)
6351 VData = handleD16VData(VData, DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00006352 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6353 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6354 unsigned IdxEn = 1;
6355 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6356 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006357 SDValue Ops[] = {
6358 Chain,
Tim Renouf4f703f52018-08-21 11:07:10 +00006359 VData,
Marek Olsak5cec6412017-11-09 01:52:48 +00006360 Op.getOperand(3), // rsrc
6361 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006362 SDValue(), // voffset -- will be set by setBufferOffsets
6363 SDValue(), // soffset -- will be set by setBufferOffsets
6364 SDValue(), // offset -- will be set by setBufferOffsets
6365 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6366 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006367 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006368 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006369 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6370 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6371 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6372 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006373
6374 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6375 EVT VDataType = VData.getValueType().getScalarType();
6376 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6377 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6378
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006379 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6380 M->getMemoryVT(), M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006381 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006382
6383 case Intrinsic::amdgcn_raw_buffer_store:
6384 case Intrinsic::amdgcn_raw_buffer_store_format: {
6385 SDValue VData = Op.getOperand(2);
6386 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6387 if (IsD16)
6388 VData = handleD16VData(VData, DAG);
6389 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6390 SDValue Ops[] = {
6391 Chain,
6392 VData,
6393 Op.getOperand(3), // rsrc
6394 DAG.getConstant(0, DL, MVT::i32), // vindex
6395 Offsets.first, // voffset
6396 Op.getOperand(5), // soffset
6397 Offsets.second, // offset
6398 Op.getOperand(6), // cachepolicy
6399 DAG.getConstant(0, DL, MVT::i1), // idxen
6400 };
6401 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6402 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6403 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6404 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006405
6406 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6407 EVT VDataType = VData.getValueType().getScalarType();
6408 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6409 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6410
Tim Renouf4f703f52018-08-21 11:07:10 +00006411 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6412 M->getMemoryVT(), M->getMemOperand());
6413 }
6414
6415 case Intrinsic::amdgcn_struct_buffer_store:
6416 case Intrinsic::amdgcn_struct_buffer_store_format: {
6417 SDValue VData = Op.getOperand(2);
6418 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6419 if (IsD16)
6420 VData = handleD16VData(VData, DAG);
6421 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6422 SDValue Ops[] = {
6423 Chain,
6424 VData,
6425 Op.getOperand(3), // rsrc
6426 Op.getOperand(4), // vindex
6427 Offsets.first, // voffset
6428 Op.getOperand(6), // soffset
6429 Offsets.second, // offset
6430 Op.getOperand(7), // cachepolicy
6431 DAG.getConstant(1, DL, MVT::i1), // idxen
6432 };
6433 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6434 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6435 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6436 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006437
6438 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6439 EVT VDataType = VData.getValueType().getScalarType();
6440 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6441 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6442
Tim Renouf4f703f52018-08-21 11:07:10 +00006443 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6444 M->getMemoryVT(), M->getMemOperand());
6445 }
6446
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006447 default: {
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006448 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6449 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6450 return lowerImage(Op, ImageDimIntr, DAG);
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006451
Matt Arsenault754dd3e2017-04-03 18:08:08 +00006452 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006453 }
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006454 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006455}
6456
Tim Renouf4f703f52018-08-21 11:07:10 +00006457// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6458// offset (the offset that is included in bounds checking and swizzling, to be
6459// split between the instruction's voffset and immoffset fields) and soffset
6460// (the offset that is excluded from bounds checking and swizzling, to go in
6461// the instruction's soffset field). This function takes the first kind of
6462// offset and figures out how to split it between voffset and immoffset.
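// For example, a combined offset of 8292 is split into an immoffset of 100
// and a voffset contribution of 8192, a multiple of 4096, so the add that
// materializes the voffset can be shared with neighbouring accesses.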
Tim Renouf35484c92018-08-21 11:06:05 +00006463std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6464 SDValue Offset, SelectionDAG &DAG) const {
6465 SDLoc DL(Offset);
6466 const unsigned MaxImm = 4095;
6467 SDValue N0 = Offset;
6468 ConstantSDNode *C1 = nullptr;
Piotr Sobczak378131b2019-01-02 09:47:41 +00006469
6470 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
Tim Renouf35484c92018-08-21 11:06:05 +00006471 N0 = SDValue();
Piotr Sobczak378131b2019-01-02 09:47:41 +00006472 else if (DAG.isBaseWithConstantOffset(N0)) {
6473 C1 = cast<ConstantSDNode>(N0.getOperand(1));
6474 N0 = N0.getOperand(0);
6475 }
Tim Renouf35484c92018-08-21 11:06:05 +00006476
6477 if (C1) {
6478 unsigned ImmOffset = C1->getZExtValue();
6479 // If the immediate value is too big for the immoffset field, keep only its
Tim Renoufa37679d2018-10-03 10:29:43 +00006480 // low 12 bits in the immoffset field so that the value that is copied/added
Tim Renouf35484c92018-08-21 11:06:05 +00006481 // for the voffset field is a multiple of 4096, and it stands more chance
6482 // of being CSEd with the copy/add for another similar load/store.
Tim Renoufa37679d2018-10-03 10:29:43 +00006483 // However, do not do that rounding down to a multiple of 4096 if that is a
6484 // negative number, as it appears to be illegal to have a negative offset
6485 // in the vgpr, even if adding the immediate offset makes it positive.
Tim Renouf35484c92018-08-21 11:06:05 +00006486 unsigned Overflow = ImmOffset & ~MaxImm;
6487 ImmOffset -= Overflow;
Tim Renoufa37679d2018-10-03 10:29:43 +00006488 if ((int32_t)Overflow < 0) {
6489 Overflow += ImmOffset;
6490 ImmOffset = 0;
6491 }
Tim Renouf35484c92018-08-21 11:06:05 +00006492 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6493 if (Overflow) {
6494 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6495 if (!N0)
6496 N0 = OverflowVal;
6497 else {
6498 SDValue Ops[] = { N0, OverflowVal };
6499 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6500 }
6501 }
6502 }
6503 if (!N0)
6504 N0 = DAG.getConstant(0, DL, MVT::i32);
6505 if (!C1)
6506 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6507 return {N0, SDValue(C1, 0)};
6508}
6509
Tim Renouf4f703f52018-08-21 11:07:10 +00006510// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6511// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6512// pointed to by Offsets.
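// If the combined offset is a constant that splitMUBUFOffset can encode, no
// VGPR is needed: voffset becomes 0 and the constant is split between soffset
// and the immediate instoffset. For a base-plus-constant offset the base goes
// in voffset; otherwise the whole offset is placed in voffset.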
6513void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006514 SelectionDAG &DAG, SDValue *Offsets,
6515 unsigned Align) const {
Tim Renouf4f703f52018-08-21 11:07:10 +00006516 SDLoc DL(CombinedOffset);
6517 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6518 uint32_t Imm = C->getZExtValue();
6519 uint32_t SOffset, ImmOffset;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006520 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006521 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6522 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6523 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6524 return;
6525 }
6526 }
6527 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6528 SDValue N0 = CombinedOffset.getOperand(0);
6529 SDValue N1 = CombinedOffset.getOperand(1);
6530 uint32_t SOffset, ImmOffset;
6531 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006532 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6533 Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006534 Offsets[0] = N0;
6535 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6536 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6537 return;
6538 }
6539 }
6540 Offsets[0] = CombinedOffset;
6541 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6542 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6543}
6544
Ryan Taylor00e063a2019-03-19 16:07:00 +00006545// Handle 8 bit and 16 bit buffer loads
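// For example, an i8 load is emitted as BUFFER_LOAD_UBYTE producing an i32
// result, which is then truncated back to i8 and merged with the chain value.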
6546SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
6547 EVT LoadVT, SDLoc DL,
6548 ArrayRef<SDValue> Ops,
6549 MemSDNode *M) const {
6550 EVT IntVT = LoadVT.changeTypeToInteger();
6551 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
6552 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
6553
6554 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
6555 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
6556 Ops, IntVT,
6557 M->getMemOperand());
6558 SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
6559 LoadVT.getScalarType(), BufferLoad);
6560 return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
6561}
6562
6563// Handle 8 bit and 16 bit buffer stores
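// The 8/16-bit data operand is any-extended to i32 and the store is emitted
// as BUFFER_STORE_BYTE or BUFFER_STORE_SHORT with the original memory type.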
6564SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
6565 EVT VDataType, SDLoc DL,
6566 SDValue Ops[],
6567 MemSDNode *M) const {
6568 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
6569 Ops[1] = BufferStoreExt;
6570 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
6571 AMDGPUISD::BUFFER_STORE_SHORT;
6572 ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
6573 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
6574 M->getMemOperand());
6575}
6576
Matt Arsenault90083d32018-06-07 09:54:49 +00006577static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6578 ISD::LoadExtType ExtType, SDValue Op,
6579 const SDLoc &SL, EVT VT) {
6580 if (VT.bitsLT(Op.getValueType()))
6581 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6582
6583 switch (ExtType) {
6584 case ISD::SEXTLOAD:
6585 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6586 case ISD::ZEXTLOAD:
6587 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6588 case ISD::EXTLOAD:
6589 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6590 case ISD::NON_EXTLOAD:
6591 return Op;
6592 }
6593
6594 llvm_unreachable("invalid ext type");
6595}
6596
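// Widen a uniform, 4-byte aligned sub-dword load from the constant (or
// invariant global) address space into a full 32-bit load, then truncate or
// re-extend the result back to the original type, preserving the extension
// kind of the original load.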
6597SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6598 SelectionDAG &DAG = DCI.DAG;
6599 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6600 return SDValue();
6601
6602 // FIXME: Constant loads should all be marked invariant.
6603 unsigned AS = Ld->getAddressSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00006604 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6605 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Matt Arsenault90083d32018-06-07 09:54:49 +00006606 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6607 return SDValue();
6608
6609 // Don't do this early, since it may interfere with adjacent load merging for
6610 // illegal types. We can avoid losing alignment information for exotic types
6611 // pre-legalize.
6612 EVT MemVT = Ld->getMemoryVT();
6613 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6614 MemVT.getSizeInBits() >= 32)
6615 return SDValue();
6616
6617 SDLoc SL(Ld);
6618
6619 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6620 "unexpected vector extload");
6621
6622 // TODO: Drop only high part of range.
6623 SDValue Ptr = Ld->getBasePtr();
6624 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6625 MVT::i32, SL, Ld->getChain(), Ptr,
6626 Ld->getOffset(),
6627 Ld->getPointerInfo(), MVT::i32,
6628 Ld->getAlignment(),
6629 Ld->getMemOperand()->getFlags(),
6630 Ld->getAAInfo(),
6631 nullptr); // Drop ranges
6632
6633 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6634 if (MemVT.isFloatingPoint()) {
6635 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6636 "unexpected fp extload");
6637 TruncVT = MemVT.changeTypeToInteger();
6638 }
6639
6640 SDValue Cvt = NewLoad;
6641 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6642 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6643 DAG.getValueType(TruncVT));
6644 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6645 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6646 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6647 } else {
6648 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6649 }
6650
6651 EVT VT = Ld->getValueType(0);
6652 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6653
6654 DCI.AddToWorklist(Cvt.getNode());
6655
6656 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6657 // the appropriate extension from the 32-bit load.
6658 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6659 DCI.AddToWorklist(Cvt.getNode());
6660
6661 // Handle conversion back to floating point if necessary.
6662 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6663
6664 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6665}
6666
Tom Stellard81d871d2013-11-13 23:36:50 +00006667SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6668 SDLoc DL(Op);
6669 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006670 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00006671 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00006672
Matt Arsenaulta1436412016-02-10 18:21:45 +00006673 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
Matt Arsenault65ca292a2017-09-07 05:37:34 +00006674 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6675 return SDValue();
6676
Matt Arsenault6dfda962016-02-10 18:21:39 +00006677 // FIXME: Copied from PPC
6678 // First, load into 32 bits, then truncate to 1 bit.
6679
6680 SDValue Chain = Load->getChain();
6681 SDValue BasePtr = Load->getBasePtr();
6682 MachineMemOperand *MMO = Load->getMemOperand();
6683
Tom Stellard115a6152016-11-10 16:02:37 +00006684 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6685
Matt Arsenault6dfda962016-02-10 18:21:39 +00006686 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
Tom Stellard115a6152016-11-10 16:02:37 +00006687 BasePtr, RealMemVT, MMO);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006688
Tim Renouf361b5b22019-03-21 12:01:21 +00006689 if (!MemVT.isVector()) {
6690 SDValue Ops[] = {
6691 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6692 NewLD.getValue(1)
6693 };
6694
6695 return DAG.getMergeValues(Ops, DL);
6696 }
6697
6698 SmallVector<SDValue, 3> Elts;
6699 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
6700 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
6701 DAG.getConstant(I, DL, MVT::i32));
6702
6703 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
6704 }
6705
Matt Arsenault6dfda962016-02-10 18:21:39 +00006706 SDValue Ops[] = {
Tim Renouf361b5b22019-03-21 12:01:21 +00006707 DAG.getBuildVector(MemVT, DL, Elts),
Matt Arsenault6dfda962016-02-10 18:21:39 +00006708 NewLD.getValue(1)
6709 };
6710
6711 return DAG.getMergeValues(Ops, DL);
6712 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006713
Matt Arsenaulta1436412016-02-10 18:21:45 +00006714 if (!MemVT.isVector())
6715 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006716
Matt Arsenaulta1436412016-02-10 18:21:45 +00006717 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6718 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006719
Farhana Aleen89196642018-03-07 17:09:18 +00006720 unsigned Alignment = Load->getAlignment();
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006721 unsigned AS = Load->getAddressSpace();
6722 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
Farhana Aleen89196642018-03-07 17:09:18 +00006723 AS, Alignment)) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006724 SDValue Ops[2];
6725 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6726 return DAG.getMergeValues(Ops, DL);
6727 }
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00006728 if (Subtarget->hasLDSMisalignedBug() &&
6729 AS == AMDGPUAS::FLAT_ADDRESS &&
6730 Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
6731 return SplitVectorLoad(Op, DAG);
6732 }
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006733
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006734 MachineFunction &MF = DAG.getMachineFunction();
6735 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6736 // If there is a possibility that flat instructions access scratch memory
6737 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006738 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006739 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006740 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006741
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006742 unsigned NumElements = MemVT.getVectorNumElements();
Matt Arsenault6c041a32018-03-29 19:59:28 +00006743
Matt Arsenault0da63502018-08-31 05:49:54 +00006744 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6745 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
Tim Renouf361b5b22019-03-21 12:01:21 +00006746 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
6747 if (MemVT.isPow2VectorType())
6748 return SDValue();
6749 if (NumElements == 3)
6750 return WidenVectorLoad(Op, DAG);
6751 return SplitVectorLoad(Op, DAG);
6752 }
Matt Arsenaulta1436412016-02-10 18:21:45 +00006753 // Non-uniform loads will be selected to MUBUF instructions, so they
Alexander Timofeev18009562016-12-08 17:28:47 +00006754 // have the same legalization requirements as global and private
Matt Arsenaulta1436412016-02-10 18:21:45 +00006755 // loads.
6756 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006757 }
Matt Arsenault6c041a32018-03-29 19:59:28 +00006758
Matt Arsenault0da63502018-08-31 05:49:54 +00006759 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6760 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6761 AS == AMDGPUAS::GLOBAL_ADDRESS) {
Alexander Timofeev2e5eece2018-03-05 15:12:21 +00006762 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Farhana Aleen89196642018-03-07 17:09:18 +00006763 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
Tim Renouf361b5b22019-03-21 12:01:21 +00006764 Alignment >= 4 && NumElements < 32) {
6765 if (MemVT.isPow2VectorType())
6766 return SDValue();
6767 if (NumElements == 3)
6768 return WidenVectorLoad(Op, DAG);
6769 return SplitVectorLoad(Op, DAG);
6770 }
Alexander Timofeev18009562016-12-08 17:28:47 +00006771 // Non-uniform loads will be selected to MUBUF instructions, so they
6772 // have the same legalization requirements as global and private
6773 // loads.
6774 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006775 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006776 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6777 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6778 AS == AMDGPUAS::GLOBAL_ADDRESS ||
6779 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006780 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006781 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00006782 // v3 loads not supported on SI.
6783 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6784 return WidenVectorLoad(Op, DAG);
6785 // v3 and v4 loads are supported for private and global memory.
Matt Arsenaulta1436412016-02-10 18:21:45 +00006786 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006787 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006788 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006789 // Depending on the setting of the private_element_size field in the
6790 // resource descriptor, we can only make private accesses up to a certain
6791 // size.
6792 switch (Subtarget->getMaxPrivateElementSize()) {
6793 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006794 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006795 case 8:
6796 if (NumElements > 2)
6797 return SplitVectorLoad(Op, DAG);
6798 return SDValue();
6799 case 16:
6800 // Same as global/flat
6801 if (NumElements > 4)
6802 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00006803 // v3 loads not supported on SI.
6804 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6805 return WidenVectorLoad(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006806 return SDValue();
6807 default:
6808 llvm_unreachable("unsupported private_element_size");
6809 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006810 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleena7cb3112018-03-09 17:41:39 +00006811 // Use ds_read_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006812 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
Farhana Aleena7cb3112018-03-09 17:41:39 +00006813 MemVT.getStoreSize() == 16)
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006814 return SDValue();
6815
Farhana Aleena7cb3112018-03-09 17:41:39 +00006816 if (NumElements > 2)
6817 return SplitVectorLoad(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006818
6819 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6820 // address is negative, then the instruction is incorrectly treated as
6821 // out-of-bounds even if base + offsets is in bounds. Split vectorized
6822 // loads here to avoid emitting ds_read2_b32. We may re-combine the
6823 // load later in the SILoadStoreOptimizer.
6824 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6825 NumElements == 2 && MemVT.getStoreSize() == 8 &&
6826 Load->getAlignment() < 8) {
6827 return SplitVectorLoad(Op, DAG);
6828 }
Tom Stellarde9373602014-01-22 19:24:14 +00006829 }
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006830 return SDValue();
Tom Stellard81d871d2013-11-13 23:36:50 +00006831}
6832
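// Lower a 64-bit select by bitcasting both operands to v2i32 and selecting
// the low and high halves with two 32-bit selects, then recombining the
// result.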
Tom Stellard0ec134f2014-02-04 17:18:40 +00006833SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006834 EVT VT = Op.getValueType();
6835 assert(VT.getSizeInBits() == 64);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006836
6837 SDLoc DL(Op);
6838 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006839
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006840 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6841 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006842
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006843 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6844 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6845
6846 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6847 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006848
6849 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6850
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006851 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6852 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006853
6854 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6855
Ahmed Bougacha128f8732016-04-26 21:15:30 +00006856 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006857 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006858}
6859
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006860// Catch division cases where we can use shortcuts with rcp and rsq
6861// instructions.
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006862SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6863 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006864 SDLoc SL(Op);
6865 SDValue LHS = Op.getOperand(0);
6866 SDValue RHS = Op.getOperand(1);
6867 EVT VT = Op.getValueType();
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006868 const SDNodeFlags Flags = Op->getFlags();
Michael Berg7acc81b2018-05-04 18:48:20 +00006869 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006870
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006871 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6872 return SDValue();
6873
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006874 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006875 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
Matt Arsenault979902b2016-08-02 22:25:04 +00006876 if (CLHS->isExactlyValue(1.0)) {
6877 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6878 // the CI documentation has a worst case error of 1 ulp.
6879 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6880 // use it as long as we aren't trying to use denormals.
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006881 //
6882 // v_rcp_f16 and v_rsq_f16 DO support denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006883
Matt Arsenault979902b2016-08-02 22:25:04 +00006884 // 1.0 / sqrt(x) -> rsq(x)
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006885
Matt Arsenault979902b2016-08-02 22:25:04 +00006886 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6887 // error seems really high at 2^29 ULP.
6888 if (RHS.getOpcode() == ISD::FSQRT)
6889 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6890
6891 // 1.0 / x -> rcp(x)
6892 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6893 }
6894
6895 // Same as for 1.0, but expand the sign out of the constant.
6896 if (CLHS->isExactlyValue(-1.0)) {
6897 // -1.0 / x -> rcp (fneg x)
6898 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6899 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6900 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006901 }
6902 }
6903
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006904 if (Unsafe) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006905 // Turn into multiply by the reciprocal.
6906 // x / y -> x * (1.0 / y)
6907 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006908 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006909 }
6910
6911 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006912}
6913
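// Helpers for building FMUL/FMA nodes that may need to be glued to a chain:
// if GlueChain carries a chain and glue, the chained AMDGPUISD variant is
// used and glued in, so the operation cannot be reordered across the node
// that produced the glue.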
Tom Stellard8485fa02016-12-07 02:42:15 +00006914static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6915 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6916 if (GlueChain->getNumValues() <= 1) {
6917 return DAG.getNode(Opcode, SL, VT, A, B);
6918 }
6919
6920 assert(GlueChain->getNumValues() == 3);
6921
6922 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6923 switch (Opcode) {
6924 default: llvm_unreachable("no chain equivalent for opcode");
6925 case ISD::FMUL:
6926 Opcode = AMDGPUISD::FMUL_W_CHAIN;
6927 break;
6928 }
6929
6930 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6931 GlueChain.getValue(2));
6932}
6933
6934static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6935 EVT VT, SDValue A, SDValue B, SDValue C,
6936 SDValue GlueChain) {
6937 if (GlueChain->getNumValues() <= 1) {
6938 return DAG.getNode(Opcode, SL, VT, A, B, C);
6939 }
6940
6941 assert(GlueChain->getNumValues() == 3);
6942
6943 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6944 switch (Opcode) {
6945 default: llvm_unreachable("no chain equivalent for opcode");
6946 case ISD::FMA:
6947 Opcode = AMDGPUISD::FMA_W_CHAIN;
6948 break;
6949 }
6950
6951 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6952 GlueChain.getValue(2));
6953}
6954
Matt Arsenault4052a572016-12-22 03:05:41 +00006955SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006956 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6957 return FastLowered;
6958
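  // The f16 quotient is computed in f32: extend both operands, multiply the
  // numerator by the f32 reciprocal of the denominator, round the result back
  // to f16, and let DIV_FIXUP handle the special cases (zero and infinite
  // denominators, NaNs).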
Matt Arsenault4052a572016-12-22 03:05:41 +00006959 SDLoc SL(Op);
6960 SDValue Src0 = Op.getOperand(0);
6961 SDValue Src1 = Op.getOperand(1);
6962
6963 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6964 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6965
6966 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6967 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6968
6969 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6970 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6971
6972 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6973}
6974
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006975// Faster 2.5 ULP division that does not support denormals.
6976SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6977 SDLoc SL(Op);
6978 SDValue LHS = Op.getOperand(1);
6979 SDValue RHS = Op.getOperand(2);
6980
6981 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6982
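  // K0 = 0x1p+96f and K1 = 0x1p-32f. For very large denominators (above the
  // 2^96 threshold) the plain reciprocal could underflow and be flushed, so
  // the denominator is pre-scaled by 2^-32 and the same factor is folded back
  // into the final quotient below.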
6983 const APFloat K0Val(BitsToFloat(0x6f800000));
6984 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6985
6986 const APFloat K1Val(BitsToFloat(0x2f800000));
6987 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6988
6989 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6990
6991 EVT SetCCVT =
6992 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6993
6994 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6995
6996 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6997
6998 // TODO: Should this propagate fast-math-flags?
6999 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
7000
7001 // rcp does not support denormals.
7002 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
7003
7004 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
7005
7006 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
7007}
7008
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007009SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007010 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00007011 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007012
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007013 SDLoc SL(Op);
7014 SDValue LHS = Op.getOperand(0);
7015 SDValue RHS = Op.getOperand(1);
7016
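  // f32 division is lowered as: scale the operands with DIV_SCALE, start from
  // an RCP estimate of the scaled denominator, refine the reciprocal and the
  // quotient with an FMA-based Newton-Raphson sequence, then apply the scale
  // correction with DIV_FMAS and fix up the special cases with DIV_FIXUP.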
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007017 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007018
Wei Dinged0f97f2016-06-09 19:17:15 +00007019 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007020
Tom Stellard8485fa02016-12-07 02:42:15 +00007021 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7022 RHS, RHS, LHS);
7023 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7024 LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007025
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00007026 // Denominator is scaled to not be denormal, so using rcp is ok.
Tom Stellard8485fa02016-12-07 02:42:15 +00007027 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
7028 DenominatorScaled);
7029 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
7030 DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007031
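  // Denorm32Reg encodes hwreg(HW_REG_MODE, 4, 2), the two bits of the MODE
  // register that control f32 denormal flushing. If the subtarget normally
  // flushes f32 denormals, s_setreg on this field temporarily enables them
  // around the FMA sequence so intermediate results are not flushed.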
Tom Stellard8485fa02016-12-07 02:42:15 +00007032 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
7033 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
7034 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007035
Tom Stellard8485fa02016-12-07 02:42:15 +00007036 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007037
Tom Stellard8485fa02016-12-07 02:42:15 +00007038 if (!Subtarget->hasFP32Denormals()) {
7039 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
7040 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
7041 SL, MVT::i32);
7042 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
7043 DAG.getEntryNode(),
7044 EnableDenormValue, BitField);
7045 SDValue Ops[3] = {
7046 NegDivScale0,
7047 EnableDenorm.getValue(0),
7048 EnableDenorm.getValue(1)
7049 };
Matt Arsenault37fefd62016-06-10 02:18:02 +00007050
Tom Stellard8485fa02016-12-07 02:42:15 +00007051 NegDivScale0 = DAG.getMergeValues(Ops, SL);
7052 }
7053
7054 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
7055 ApproxRcp, One, NegDivScale0);
7056
7057 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
7058 ApproxRcp, Fma0);
7059
7060 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
7061 Fma1, Fma1);
7062
7063 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
7064 NumeratorScaled, Mul);
7065
7066  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
7067
7068 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
7069 NumeratorScaled, Fma3);
7070
7071 if (!Subtarget->hasFP32Denormals()) {
7072 const SDValue DisableDenormValue =
7073 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
7074 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
7075 Fma4.getValue(1),
7076 DisableDenormValue,
7077 BitField,
7078 Fma4.getValue(2));
7079
7080 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
7081 DisableDenorm, DAG.getRoot());
7082 DAG.setRoot(OutputChain);
7083 }
Matt Arsenault37fefd62016-06-10 02:18:02 +00007084
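  // DIV_FMAS uses the scale flag produced by DIV_SCALE to undo the operand
  // scaling, and DIV_FIXUP patches up infinities, NaNs and zero denominators
  // in the final result.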
Wei Dinged0f97f2016-06-09 19:17:15 +00007085 SDValue Scale = NumeratorScaled.getValue(1);
Tom Stellard8485fa02016-12-07 02:42:15 +00007086 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
7087 Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007088
Wei Dinged0f97f2016-06-09 19:17:15 +00007089 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007090}
7091
7092SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007093 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007094 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007095
7096 SDLoc SL(Op);
7097 SDValue X = Op.getOperand(0);
7098 SDValue Y = Op.getOperand(1);
7099
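  // f64 division follows the same shape as the f32 path: scale the operands
  // with DIV_SCALE, refine an RCP estimate with an FMA sequence, and finish
  // with DIV_FMAS and DIV_FIXUP; unlike the f32 path, no denormal mode
  // toggling is done here.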
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007100 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007101
7102 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
7103
7104 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
7105
7106 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
7107
7108 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
7109
7110 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
7111
7112 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
7113
7114 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
7115
7116 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
7117
7118 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
7119 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
7120
7121 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
7122 NegDivScale0, Mul, DivScale1);
7123
7124 SDValue Scale;
7125
Tom Stellard5bfbae52018-07-11 20:59:01 +00007126 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007127 // Workaround a hardware bug on SI where the condition output from div_scale
7128 // is not usable.
7129
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007130 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007131
7132    // Figure out which scale to use for div_fmas.
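    // Reconstruct the flag by checking whether each DIV_SCALE result still
    // matches its input; comparing only the high 32 bits is enough because
    // any rescaling changes the exponent, which lives in the high dword.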
7133 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
7134 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
7135 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
7136 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
7137
7138 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
7139 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
7140
7141 SDValue Scale0Hi
7142 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
7143 SDValue Scale1Hi
7144 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
7145
7146 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
7147 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
7148 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
7149 } else {
7150 Scale = DivScale1.getValue(1);
7151 }
7152
7153 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
7154 Fma4, Fma3, Mul, Scale);
7155
7156 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007157}
7158
7159SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
7160 EVT VT = Op.getValueType();
7161
7162 if (VT == MVT::f32)
7163 return LowerFDIV32(Op, DAG);
7164
7165 if (VT == MVT::f64)
7166 return LowerFDIV64(Op, DAG);
7167
Matt Arsenault4052a572016-12-22 03:05:41 +00007168 if (VT == MVT::f16)
7169 return LowerFDIV16(Op, DAG);
7170
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007171 llvm_unreachable("Unexpected type for fdiv");
7172}
7173
Tom Stellard81d871d2013-11-13 23:36:50 +00007174SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
7175 SDLoc DL(Op);
7176 StoreSDNode *Store = cast<StoreSDNode>(Op);
7177 EVT VT = Store->getMemoryVT();
7178
Matt Arsenault95245662016-02-11 05:32:46 +00007179 if (VT == MVT::i1) {
7180 return DAG.getTruncStore(Store->getChain(), DL,
7181 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
7182 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00007183 }
7184
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007185 assert(VT.isVector() &&
7186 Store->getValue().getValueType().getScalarType() == MVT::i32);
7187
7188 unsigned AS = Store->getAddressSpace();
7189 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
7190 AS, Store->getAlignment())) {
7191 return expandUnalignedStore(Store, DAG);
7192 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007193
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00007194 if (Subtarget->hasLDSMisalignedBug() &&
7195 AS == AMDGPUAS::FLAT_ADDRESS &&
7196 Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
7197 return SplitVectorStore(Op, DAG);
7198 }
7199
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007200 MachineFunction &MF = DAG.getMachineFunction();
7201 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7202  // If there is a possibility that a flat instruction accesses scratch memory,
7203  // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00007204 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007205 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00007206 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007207
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007208 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenault0da63502018-08-31 05:49:54 +00007209 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
7210 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007211 if (NumElements > 4)
7212 return SplitVectorStore(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00007213 // v3 stores not supported on SI.
7214 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7215 return SplitVectorStore(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007216 return SDValue();
Matt Arsenault0da63502018-08-31 05:49:54 +00007217 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007218 switch (Subtarget->getMaxPrivateElementSize()) {
7219 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00007220 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007221 case 8:
7222 if (NumElements > 2)
7223 return SplitVectorStore(Op, DAG);
7224 return SDValue();
7225 case 16:
Tim Renouf361b5b22019-03-21 12:01:21 +00007226 if (NumElements > 4 || NumElements == 3)
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007227 return SplitVectorStore(Op, DAG);
7228 return SDValue();
7229 default:
7230 llvm_unreachable("unsupported private_element_size");
7231 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007232 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007233 // Use ds_write_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00007234 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
Tim Renouf361b5b22019-03-21 12:01:21 +00007235 VT.getStoreSize() == 16 && NumElements != 3)
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007236 return SDValue();
7237
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007238 if (NumElements > 2)
7239 return SplitVectorStore(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00007240
7241 // SI has a hardware bug in the LDS / GDS boounds checking: if the base
7242 // address is negative, then the instruction is incorrectly treated as
7243 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7244 // stores here to avoid emitting ds_write2_b32. We may re-combine the
7245 // store later in the SILoadStoreOptimizer.
7246 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
7247 NumElements == 2 && VT.getStoreSize() == 8 &&
7248 Store->getAlignment() < 8) {
7249 return SplitVectorStore(Op, DAG);
7250 }
7251
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007252 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007253 } else {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007254 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00007255 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007256}
7257
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007258SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007259 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007260 EVT VT = Op.getValueType();
7261 SDValue Arg = Op.getOperand(0);
David Stuttard20de3e92018-09-14 10:27:19 +00007262 SDValue TrigVal;
7263
Sanjay Patela2607012015-09-16 16:31:21 +00007264 // TODO: Should this propagate fast-math-flags?
David Stuttard20de3e92018-09-14 10:27:19 +00007265
7266 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7267
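  // The hardware sin/cos instructions take the angle pre-multiplied by
  // 1/(2*pi). Subtargets with a reduced valid input range additionally need
  // the scaled value wrapped into [0, 1) with FRACT before use.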
7268 if (Subtarget->hasTrigReducedRange()) {
7269 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7270 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7271 } else {
7272 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7273 }
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007274
7275 switch (Op.getOpcode()) {
7276 case ISD::FCOS:
David Stuttard20de3e92018-09-14 10:27:19 +00007277 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007278 case ISD::FSIN:
David Stuttard20de3e92018-09-14 10:27:19 +00007279 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007280 default:
7281 llvm_unreachable("Wrong trig opcode");
7282 }
7283}
7284
Tom Stellard354a43c2016-04-01 18:27:37 +00007285SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7286 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7287 assert(AtomicNode->isCompareAndSwap());
7288 unsigned AS = AtomicNode->getAddressSpace();
7289
7290 // No custom lowering required for local address space
Matt Arsenault0da63502018-08-31 05:49:54 +00007291 if (!isFlatGlobalAddrSpace(AS))
Tom Stellard354a43c2016-04-01 18:27:37 +00007292 return Op;
7293
7294 // Non-local address space requires custom lowering for atomic compare
7295 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
7296 SDLoc DL(Op);
7297 SDValue ChainIn = Op.getOperand(0);
7298 SDValue Addr = Op.getOperand(1);
7299 SDValue Old = Op.getOperand(2);
7300 SDValue New = Op.getOperand(3);
7301 EVT VT = Op.getValueType();
7302 MVT SimpleVT = VT.getSimpleVT();
7303 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7304
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007305 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00007306 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00007307
7308 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7309 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00007310}
7311
Tom Stellard75aadc22012-12-11 21:25:42 +00007312//===----------------------------------------------------------------------===//
7313// Custom DAG optimizations
7314//===----------------------------------------------------------------------===//
7315
Matt Arsenault364a6742014-06-11 17:50:44 +00007316SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00007317 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00007318 EVT VT = N->getValueType(0);
7319 EVT ScalarVT = VT.getScalarType();
7320 if (ScalarVT != MVT::f32)
7321 return SDValue();
7322
7323 SelectionDAG &DAG = DCI.DAG;
7324 SDLoc DL(N);
7325
7326 SDValue Src = N->getOperand(0);
7327 EVT SrcVT = Src.getValueType();
7328
7329 // TODO: We could try to match extracting the higher bytes, which would be
7330 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7331 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7332 // about in practice.
Craig Topper80d3bb32018-03-06 19:44:52 +00007333 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
Matt Arsenault364a6742014-06-11 17:50:44 +00007334 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7335 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7336 DCI.AddToWorklist(Cvt.getNode());
7337 return Cvt;
7338 }
7339 }
7340
Matt Arsenault364a6742014-06-11 17:50:44 +00007341 return SDValue();
7342}
7343
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007344// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7345
7346// This is a variant of
7347// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7348//
7349// The normal DAG combiner will do this, but only if the add has one use since
7350// that would increase the number of instructions.
7351//
7352// This prevents us from seeing a constant offset that can be folded into a
7353// memory instruction's addressing mode. If we know the resulting add offset of
7354// a pointer can be folded into an addressing offset, we can replace the pointer
7355// operand with the add of new constant offset. This eliminates one of the uses,
7356// operand with the add of the new constant offset. This eliminates one of the uses,
7357//
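// For example, an address of the form (shl (add x, 16), 2) becomes
// (add (shl x, 2), 64), and the +64 can then be folded into the memory
// instruction's immediate offset.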
7358SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7359 unsigned AddrSpace,
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007360 EVT MemVT,
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007361 DAGCombinerInfo &DCI) const {
7362 SDValue N0 = N->getOperand(0);
7363 SDValue N1 = N->getOperand(1);
7364
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007365 // We only do this to handle cases where it's profitable when there are
7366 // multiple uses of the add, so defer to the standard combine.
Matt Arsenaultc8903122017-11-14 23:46:42 +00007367 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7368 N0->hasOneUse())
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007369 return SDValue();
7370
7371 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7372 if (!CN1)
7373 return SDValue();
7374
7375 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7376 if (!CAdd)
7377 return SDValue();
7378
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007379 // If the resulting offset is too large, we can't fold it into the addressing
7380 // mode offset.
7381 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007382 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7383
7384 AddrMode AM;
7385 AM.HasBaseReg = true;
7386 AM.BaseOffs = Offset.getSExtValue();
7387 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007388 return SDValue();
7389
7390 SelectionDAG &DAG = DCI.DAG;
7391 SDLoc SL(N);
7392 EVT VT = N->getValueType(0);
7393
7394 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007395 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007396
Matt Arsenaulte5e0c742017-11-13 05:33:35 +00007397 SDNodeFlags Flags;
7398 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7399 (N0.getOpcode() == ISD::OR ||
7400 N0->getFlags().hasNoUnsignedWrap()));
7401
7402 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007403}
7404
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007405SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7406 DAGCombinerInfo &DCI) const {
7407 SDValue Ptr = N->getBasePtr();
7408 SelectionDAG &DAG = DCI.DAG;
7409 SDLoc SL(N);
7410
7411 // TODO: We could also do this for multiplies.
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007412 if (Ptr.getOpcode() == ISD::SHL) {
7413 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7414 N->getMemoryVT(), DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007415 if (NewPtr) {
7416 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7417
7418 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7419 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7420 }
7421 }
7422
7423 return SDValue();
7424}
7425
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007426static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7427 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7428 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7429 (Opc == ISD::XOR && Val == 0);
7430}
7431
7432// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
7433// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7434// integer combine opportunities since most 64-bit operations are decomposed
7435// this way. TODO: We won't want this for SALU especially if it is an inline
7436// immediate.
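// For example, (and i64:x, 0x00000000ffffffff) becomes an AND of each 32-bit
// half, (and lo_32(x), 0xffffffff) and (and hi_32(x), 0), which then simplify
// to lo_32(x) and 0 respectively.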
7437SDValue SITargetLowering::splitBinaryBitConstantOp(
7438 DAGCombinerInfo &DCI,
7439 const SDLoc &SL,
7440 unsigned Opc, SDValue LHS,
7441 const ConstantSDNode *CRHS) const {
7442 uint64_t Val = CRHS->getZExtValue();
7443 uint32_t ValLo = Lo_32(Val);
7444 uint32_t ValHi = Hi_32(Val);
7445 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7446
7447 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7448 bitOpWithConstantIsReducible(Opc, ValHi)) ||
7449 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7450 // If we need to materialize a 64-bit immediate, it will be split up later
7451 // anyway. Avoid creating the harder to understand 64-bit immediate
7452 // materialization.
7453 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7454 }
7455
7456 return SDValue();
7457}
7458
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007459// Returns true if the argument is a boolean value that is not serialized into
7460// memory or an argument and does not require v_cndmask_b32 to be deserialized.
7461static bool isBoolSGPR(SDValue V) {
7462 if (V.getValueType() != MVT::i1)
7463 return false;
7464 switch (V.getOpcode()) {
7465 default: break;
7466 case ISD::SETCC:
7467 case ISD::AND:
7468 case ISD::OR:
7469 case ISD::XOR:
7470 case AMDGPUISD::FP_CLASS:
7471 return true;
7472 }
7473 return false;
7474}
7475
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007476// If a constant has all zeroes or all ones within each byte return it.
7477// Otherwise return 0.
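// For example, 0x00ff00ff is returned unchanged, while 0x00f0ff00 returns 0
// because the 0xf0 byte is only partially set.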
7478static uint32_t getConstantPermuteMask(uint32_t C) {
7479 // 0xff for any zero byte in the mask
7480 uint32_t ZeroByteMask = 0;
7481 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7482 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7483 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7484 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7485 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7486 if ((NonZeroByteMask & C) != NonZeroByteMask)
7487 return 0; // Partial bytes selected.
7488 return C;
7489}
7490
7491// Check if a node selects whole bytes from its operand 0 starting at a byte
7492// boundary while masking the rest. Returns the select mask as used by
7493// v_perm_b32, or ~0u if no such mask can be formed.
7494// Note byte select encoding:
7495// value 0-3 selects corresponding source byte;
7496// value 0xc selects zero;
7497// value 0xff selects 0xff.
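// For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100: bytes 0 and 1
// are passed through and bytes 2 and 3 select zero.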
7498static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7499 assert(V.getValueSizeInBits() == 32);
7500
7501 if (V.getNumOperands() != 2)
7502 return ~0;
7503
7504 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7505 if (!N1)
7506 return ~0;
7507
7508 uint32_t C = N1->getZExtValue();
7509
7510 switch (V.getOpcode()) {
7511 default:
7512 break;
7513 case ISD::AND:
7514 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7515 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7516 }
7517 break;
7518
7519 case ISD::OR:
7520 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7521 return (0x03020100 & ~ConstMask) | ConstMask;
7522 }
7523 break;
7524
7525 case ISD::SHL:
7526 if (C % 8)
7527 return ~0;
7528
7529 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7530
7531 case ISD::SRL:
7532 if (C % 8)
7533 return ~0;
7534
7535 return uint32_t(0x0c0c0c0c03020100ull >> C);
7536 }
7537
7538 return ~0;
7539}
7540
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007541SDValue SITargetLowering::performAndCombine(SDNode *N,
7542 DAGCombinerInfo &DCI) const {
7543 if (DCI.isBeforeLegalize())
7544 return SDValue();
7545
7546 SelectionDAG &DAG = DCI.DAG;
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007547 EVT VT = N->getValueType(0);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007548 SDValue LHS = N->getOperand(0);
7549 SDValue RHS = N->getOperand(1);
7550
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007551
Stanislav Mekhanoshin53a21292017-05-23 19:54:48 +00007552 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7553 if (VT == MVT::i64 && CRHS) {
7554 if (SDValue Split
7555 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7556 return Split;
7557 }
7558
7559 if (CRHS && VT == MVT::i32) {
7560 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7561 // nb = number of trailing zeroes in mask
7562 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7563 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
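    // For example, (and (srl x, 8), 0xff00) becomes (shl (bfe x, 16, 8), 8):
    // extract the 8-bit field starting at bit 16 and shift it back into place.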
7564 uint64_t Mask = CRHS->getZExtValue();
7565 unsigned Bits = countPopulation(Mask);
7566 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7567 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7568 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7569 unsigned Shift = CShift->getZExtValue();
7570 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7571 unsigned Offset = NB + Shift;
7572 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7573 SDLoc SL(N);
7574 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7575 LHS->getOperand(0),
7576 DAG.getConstant(Offset, SL, MVT::i32),
7577 DAG.getConstant(Bits, SL, MVT::i32));
7578 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7579 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7580 DAG.getValueType(NarrowVT));
7581 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7582 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7583 return Shl;
7584 }
7585 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007586 }
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007587
7588 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7589 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7590 isa<ConstantSDNode>(LHS.getOperand(2))) {
7591 uint32_t Sel = getConstantPermuteMask(Mask);
7592 if (!Sel)
7593 return SDValue();
7594
7595 // Select 0xc for all zero bytes
7596 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7597 SDLoc DL(N);
7598 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7599 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7600 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007601 }
7602
7603 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7604 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7605 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007606 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7607 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7608
7609 SDValue X = LHS.getOperand(0);
7610 SDValue Y = RHS.getOperand(0);
7611 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7612 return SDValue();
7613
7614 if (LCC == ISD::SETO) {
7615 if (X != LHS.getOperand(1))
7616 return SDValue();
7617
7618 if (RCC == ISD::SETUNE) {
7619 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7620 if (!C1 || !C1->isInfinity() || C1->isNegative())
7621 return SDValue();
7622
7623 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7624 SIInstrFlags::N_SUBNORMAL |
7625 SIInstrFlags::N_ZERO |
7626 SIInstrFlags::P_ZERO |
7627 SIInstrFlags::P_SUBNORMAL |
7628 SIInstrFlags::P_NORMAL;
7629
7630 static_assert(((~(SIInstrFlags::S_NAN |
7631 SIInstrFlags::Q_NAN |
7632 SIInstrFlags::N_INFINITY |
7633 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7634 "mask not equal");
7635
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007636 SDLoc DL(N);
7637 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7638 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007639 }
7640 }
7641 }
7642
Matt Arsenault3dcf4ce2018-08-10 18:58:56 +00007643 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7644 std::swap(LHS, RHS);
7645
7646 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7647 RHS.hasOneUse()) {
7648 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7649 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7650 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7651 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7652 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7653 (RHS.getOperand(0) == LHS.getOperand(0) &&
7654 LHS.getOperand(0) == LHS.getOperand(1))) {
7655 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7656 unsigned NewMask = LCC == ISD::SETO ?
7657 Mask->getZExtValue() & ~OrdMask :
7658 Mask->getZExtValue() & OrdMask;
7659
7660 SDLoc DL(N);
7661 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7662 DAG.getConstant(NewMask, DL, MVT::i32));
7663 }
7664 }
7665
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007666 if (VT == MVT::i32 &&
7667 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7668 // and x, (sext cc from i1) => select cc, x, 0
7669 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7670 std::swap(LHS, RHS);
7671 if (isBoolSGPR(RHS.getOperand(0)))
7672 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7673 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7674 }
7675
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007676 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7677 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7678 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7679 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7680 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7681 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7682 if (LHSMask != ~0u && RHSMask != ~0u) {
7683 // Canonicalize the expression in an attempt to have fewer unique masks
7684 // and therefore fewer registers used to hold the masks.
7685 if (LHSMask > RHSMask) {
7686 std::swap(LHSMask, RHSMask);
7687 std::swap(LHS, RHS);
7688 }
7689
7690      // Mark with 0xc each byte lane actually taken from a source operand: in the
7691      // mask, zero is 0xc, 0xff is 0xff, and real lane selectors are 0-3.
7692 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7693 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7694
7695      // Check if we need to combine values from two sources within a byte.
7696 if (!(LHSUsedLanes & RHSUsedLanes) &&
7697          // If we select the high and low words, keep it for SDWA.
7698 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7699 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7700        // Each byte of each mask is either a lane selector (0-3) or one of the
7701        // special values 0x0c (select zero) or 0xff (select 0xff). If either mask
7702        // has 0x0c in a byte, the result byte must be 0x0c; otherwise the byte
7703        // that is not 0xff wins. ANDing the masks is correct except where a 0x0c
7704        // byte met a lane selector, which the loop below restores to 0x0c.
7705 uint32_t Mask = LHSMask & RHSMask;
7706 for (unsigned I = 0; I < 32; I += 8) {
7707 uint32_t ByteSel = 0xff << I;
7708          if ((LHSMask & ByteSel) == (0x0cu << I) || (RHSMask & ByteSel) == (0x0cu << I))
7709            Mask |= 0x0cu << I;
7710 }
7711
7712 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7713 // or 0x0c.
7714 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7715 SDLoc DL(N);
7716
7717 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7718 LHS.getOperand(0), RHS.getOperand(0),
7719 DAG.getConstant(Sel, DL, MVT::i32));
7720 }
7721 }
7722 }
7723
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007724 return SDValue();
7725}
7726
Matt Arsenaultf2290332015-01-06 23:00:39 +00007727SDValue SITargetLowering::performOrCombine(SDNode *N,
7728 DAGCombinerInfo &DCI) const {
7729 SelectionDAG &DAG = DCI.DAG;
7730 SDValue LHS = N->getOperand(0);
7731 SDValue RHS = N->getOperand(1);
7732
Matt Arsenault3b082382016-04-12 18:24:38 +00007733 EVT VT = N->getValueType(0);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007734 if (VT == MVT::i1) {
7735 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7736 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7737 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7738 SDValue Src = LHS.getOperand(0);
7739 if (Src != RHS.getOperand(0))
7740 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007741
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007742 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7743 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7744 if (!CLHS || !CRHS)
7745 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007746
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007747 // Only 10 bits are used.
7748 static const uint32_t MaxMask = 0x3ff;
Matt Arsenault3b082382016-04-12 18:24:38 +00007749
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007750 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7751 SDLoc DL(N);
7752 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7753 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7754 }
Matt Arsenault3b082382016-04-12 18:24:38 +00007755
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007756 return SDValue();
7757 }
7758
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007759 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7760 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7761 LHS.getOpcode() == AMDGPUISD::PERM &&
7762 isa<ConstantSDNode>(LHS.getOperand(2))) {
7763 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7764 if (!Sel)
7765 return SDValue();
7766
7767 Sel |= LHS.getConstantOperandVal(2);
7768 SDLoc DL(N);
7769 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7770 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7771 }
7772
7773 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7774 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7775 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7776 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7777 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7778 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7779 if (LHSMask != ~0u && RHSMask != ~0u) {
7780 // Canonicalize the expression in an attempt to have fewer unique masks
7781 // and therefore fewer registers used to hold the masks.
7782 if (LHSMask > RHSMask) {
7783 std::swap(LHSMask, RHSMask);
7784 std::swap(LHS, RHS);
7785 }
7786
7787      // Mark with 0xc each byte lane actually taken from a source operand: in the
7788      // mask, zero is 0xc, 0xff is 0xff, and real lane selectors are 0-3.
7789 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7790 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7791
7792      // Check if we need to combine values from two sources within a byte.
7793 if (!(LHSUsedLanes & RHSUsedLanes) &&
7794          // If we select the high and low words, keep it for SDWA.
7795 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7796 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7797 // Kill zero bytes selected by other mask. Zero value is 0xc.
7798 LHSMask &= ~RHSUsedLanes;
7799 RHSMask &= ~LHSUsedLanes;
7800 // Add 4 to each active LHS lane
7801 LHSMask |= LHSUsedLanes & 0x04040404;
7802 // Combine masks
7803 uint32_t Sel = LHSMask | RHSMask;
7804 SDLoc DL(N);
7805
7806 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7807 LHS.getOperand(0), RHS.getOperand(0),
7808 DAG.getConstant(Sel, DL, MVT::i32));
7809 }
7810 }
7811 }
7812
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007813 if (VT != MVT::i64)
7814 return SDValue();
7815
7816 // TODO: This could be a generic combine with a predicate for extracting the
7817 // high half of an integer being free.
7818
7819 // (or i64:x, (zero_extend i32:y)) ->
7820 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7821 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7822 RHS.getOpcode() != ISD::ZERO_EXTEND)
7823 std::swap(LHS, RHS);
7824
7825 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7826 SDValue ExtSrc = RHS.getOperand(0);
7827 EVT SrcVT = ExtSrc.getValueType();
7828 if (SrcVT == MVT::i32) {
7829 SDLoc SL(N);
7830 SDValue LowLHS, HiBits;
7831 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7832 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7833
7834 DCI.AddToWorklist(LowOr.getNode());
7835 DCI.AddToWorklist(HiBits.getNode());
7836
7837 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7838 LowOr, HiBits);
7839 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
Matt Arsenault3b082382016-04-12 18:24:38 +00007840 }
7841 }
7842
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007843 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7844 if (CRHS) {
7845 if (SDValue Split
7846 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7847 return Split;
7848 }
Matt Arsenaultf2290332015-01-06 23:00:39 +00007849
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007850 return SDValue();
7851}
Matt Arsenaultf2290332015-01-06 23:00:39 +00007852
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007853SDValue SITargetLowering::performXorCombine(SDNode *N,
7854 DAGCombinerInfo &DCI) const {
7855 EVT VT = N->getValueType(0);
7856 if (VT != MVT::i64)
7857 return SDValue();
Matt Arsenaultf2290332015-01-06 23:00:39 +00007858
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007859 SDValue LHS = N->getOperand(0);
7860 SDValue RHS = N->getOperand(1);
7861
7862 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7863 if (CRHS) {
7864 if (SDValue Split
7865 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7866 return Split;
Matt Arsenaultf2290332015-01-06 23:00:39 +00007867 }
7868
7869 return SDValue();
7870}
7871
Matt Arsenault5cf42712017-04-06 20:58:30 +00007872// Instructions that will be lowered with a final instruction that zeros the
7873// high result bits.
7874// XXX - probably only need to list legal operations.
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007875static bool fp16SrcZerosHighBits(unsigned Opc) {
7876 switch (Opc) {
Matt Arsenault5cf42712017-04-06 20:58:30 +00007877 case ISD::FADD:
7878 case ISD::FSUB:
7879 case ISD::FMUL:
7880 case ISD::FDIV:
7881 case ISD::FREM:
7882 case ISD::FMA:
7883 case ISD::FMAD:
7884 case ISD::FCANONICALIZE:
7885 case ISD::FP_ROUND:
7886 case ISD::UINT_TO_FP:
7887 case ISD::SINT_TO_FP:
7888 case ISD::FABS:
7889 // Fabs is lowered to a bit operation, but it's an and which will clear the
7890 // high bits anyway.
7891 case ISD::FSQRT:
7892 case ISD::FSIN:
7893 case ISD::FCOS:
7894 case ISD::FPOWI:
7895 case ISD::FPOW:
7896 case ISD::FLOG:
7897 case ISD::FLOG2:
7898 case ISD::FLOG10:
7899 case ISD::FEXP:
7900 case ISD::FEXP2:
7901 case ISD::FCEIL:
7902 case ISD::FTRUNC:
7903 case ISD::FRINT:
7904 case ISD::FNEARBYINT:
7905 case ISD::FROUND:
7906 case ISD::FFLOOR:
7907 case ISD::FMINNUM:
7908 case ISD::FMAXNUM:
7909 case AMDGPUISD::FRACT:
7910 case AMDGPUISD::CLAMP:
7911 case AMDGPUISD::COS_HW:
7912 case AMDGPUISD::SIN_HW:
7913 case AMDGPUISD::FMIN3:
7914 case AMDGPUISD::FMAX3:
7915 case AMDGPUISD::FMED3:
7916 case AMDGPUISD::FMAD_FTZ:
7917 case AMDGPUISD::RCP:
7918 case AMDGPUISD::RSQ:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007919 case AMDGPUISD::RCP_IFLAG:
Matt Arsenault5cf42712017-04-06 20:58:30 +00007920 case AMDGPUISD::LDEXP:
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007921 return true;
Matt Arsenault5cf42712017-04-06 20:58:30 +00007922 default:
7923 // fcopysign, select and others may be lowered to 32-bit bit operations
7924 // which don't zero the high bits.
7925 return false;
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007926 }
7927}
7928
7929SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7930 DAGCombinerInfo &DCI) const {
7931 if (!Subtarget->has16BitInsts() ||
7932 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7933 return SDValue();
7934
7935 EVT VT = N->getValueType(0);
7936 if (VT != MVT::i32)
7937 return SDValue();
7938
7939 SDValue Src = N->getOperand(0);
7940 if (Src.getValueType() != MVT::i16)
7941 return SDValue();
7942
7943 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7944 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7945 if (Src.getOpcode() == ISD::BITCAST) {
7946 SDValue BCSrc = Src.getOperand(0);
7947 if (BCSrc.getValueType() == MVT::f16 &&
7948 fp16SrcZerosHighBits(BCSrc.getOpcode()))
7949 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7950 }
7951
7952 return SDValue();
7953}
7954
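// Fold a sign_extend_inreg of an unsigned buffer load into the corresponding
// signed buffer load, e.g. (sext_inreg (BUFFER_LOAD_UBYTE ...), i8) becomes a
// BUFFER_LOAD_BYTE with the same operands.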
Ryan Taylor00e063a2019-03-19 16:07:00 +00007955SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
7956 DAGCombinerInfo &DCI)
7957 const {
7958 SDValue Src = N->getOperand(0);
7959 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
7960
7961 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
7962 VTSign->getVT() == MVT::i8) ||
7963 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
7964 VTSign->getVT() == MVT::i16)) &&
7965 Src.hasOneUse()) {
7966 auto *M = cast<MemSDNode>(Src);
7967 SDValue Ops[] = {
7968 Src.getOperand(0), // Chain
7969 Src.getOperand(1), // rsrc
7970 Src.getOperand(2), // vindex
7971 Src.getOperand(3), // voffset
7972 Src.getOperand(4), // soffset
7973 Src.getOperand(5), // offset
7974 Src.getOperand(6),
7975 Src.getOperand(7)
7976 };
7977 // replace with BUFFER_LOAD_BYTE/SHORT
7978 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
7979 Src.getOperand(0).getValueType());
7980 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
7981 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
7982 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
7983 ResList,
7984 Ops, M->getMemoryVT(),
7985 M->getMemOperand());
7986 return DCI.DAG.getMergeValues({BufferLoadSignExt,
7987 BufferLoadSignExt.getValue(1)}, SDLoc(N));
7988 }
7989 return SDValue();
7990}
7991
Matt Arsenaultf2290332015-01-06 23:00:39 +00007992SDValue SITargetLowering::performClassCombine(SDNode *N,
7993 DAGCombinerInfo &DCI) const {
7994 SelectionDAG &DAG = DCI.DAG;
7995 SDValue Mask = N->getOperand(1);
7996
7997 // fp_class x, 0 -> false
7998 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7999 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00008000 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00008001 }
8002
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00008003 if (N->getOperand(0).isUndef())
8004 return DAG.getUNDEF(MVT::i1);
8005
Matt Arsenaultf2290332015-01-06 23:00:39 +00008006 return SDValue();
8007}
8008
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00008009SDValue SITargetLowering::performRcpCombine(SDNode *N,
8010 DAGCombinerInfo &DCI) const {
8011 EVT VT = N->getValueType(0);
8012 SDValue N0 = N->getOperand(0);
8013
8014 if (N0.isUndef())
8015 return N0;
8016
8017 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
8018 N0.getOpcode() == ISD::SINT_TO_FP)) {
8019 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
8020 N->getFlags());
8021 }
8022
8023 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
8024}
8025
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008026bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
8027 unsigned MaxDepth) const {
8028 unsigned Opcode = Op.getOpcode();
8029 if (Opcode == ISD::FCANONICALIZE)
8030 return true;
8031
8032 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8033 auto F = CFP->getValueAPF();
8034 if (F.isNaN() && F.isSignaling())
8035 return false;
8036 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
8037 }
8038
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008039 // If source is a result of another standard FP operation it is already in
8040 // canonical form.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008041 if (MaxDepth == 0)
8042 return false;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008043
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008044 switch (Opcode) {
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008045 // These will flush denorms if required.
8046 case ISD::FADD:
8047 case ISD::FSUB:
8048 case ISD::FMUL:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008049 case ISD::FCEIL:
8050 case ISD::FFLOOR:
8051 case ISD::FMA:
8052 case ISD::FMAD:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008053 case ISD::FSQRT:
8054 case ISD::FDIV:
8055 case ISD::FREM:
Matt Arsenaultce6d61f2018-08-06 21:51:52 +00008056 case ISD::FP_ROUND:
8057 case ISD::FP_EXTEND:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008058 case AMDGPUISD::FMUL_LEGACY:
8059 case AMDGPUISD::FMAD_FTZ:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008060 case AMDGPUISD::RCP:
8061 case AMDGPUISD::RSQ:
8062 case AMDGPUISD::RSQ_CLAMP:
8063 case AMDGPUISD::RCP_LEGACY:
8064 case AMDGPUISD::RSQ_LEGACY:
8065 case AMDGPUISD::RCP_IFLAG:
8066 case AMDGPUISD::TRIG_PREOP:
8067 case AMDGPUISD::DIV_SCALE:
8068 case AMDGPUISD::DIV_FMAS:
8069 case AMDGPUISD::DIV_FIXUP:
8070 case AMDGPUISD::FRACT:
8071 case AMDGPUISD::LDEXP:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008072 case AMDGPUISD::CVT_PKRTZ_F16_F32:
Matt Arsenault940e6072018-08-10 19:20:17 +00008073 case AMDGPUISD::CVT_F32_UBYTE0:
8074 case AMDGPUISD::CVT_F32_UBYTE1:
8075 case AMDGPUISD::CVT_F32_UBYTE2:
8076 case AMDGPUISD::CVT_F32_UBYTE3:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008077 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008078
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008079 // It can/will be lowered or combined as a bit operation.
8080 // Need to check their input recursively to handle.
8081 case ISD::FNEG:
8082 case ISD::FABS:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008083 case ISD::FCOPYSIGN:
8084 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008085
8086 case ISD::FSIN:
8087 case ISD::FCOS:
8088 case ISD::FSINCOS:
8089 return Op.getValueType().getScalarType() != MVT::f16;
8090
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008091 case ISD::FMINNUM:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008092 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008093 case ISD::FMINNUM_IEEE:
8094 case ISD::FMAXNUM_IEEE:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008095 case AMDGPUISD::CLAMP:
8096 case AMDGPUISD::FMED3:
8097 case AMDGPUISD::FMAX3:
8098 case AMDGPUISD::FMIN3: {
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008099    // FIXME: Shouldn't treat the generic operations differently based on these.
Matt Arsenault687ec752018-10-22 16:27:27 +00008100    // However, we aren't really required to flush the result from
8101    // minnum/maxnum.
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008102
Matt Arsenault687ec752018-10-22 16:27:27 +00008103 // snans will be quieted, so we only need to worry about denormals.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008104 if (Subtarget->supportsMinMaxDenormModes() ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008105 denormalsEnabledForType(Op.getValueType()))
8106 return true;
8107
8108 // Flushing may be required.
8109 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
8110 // targets need to check their input recursively.
8111
8112 // FIXME: Does this apply with clamp? It's implemented with max.
8113 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
8114 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
8115 return false;
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008116 }
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008117
Matt Arsenault687ec752018-10-22 16:27:27 +00008118 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008119 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008120 case ISD::SELECT: {
8121 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
8122 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008123 }
Matt Arsenaulte94ee832018-08-06 22:45:51 +00008124 case ISD::BUILD_VECTOR: {
8125 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
8126 SDValue SrcOp = Op.getOperand(i);
8127 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
8128 return false;
8129 }
8130
8131 return true;
8132 }
8133 case ISD::EXTRACT_VECTOR_ELT:
8134 case ISD::EXTRACT_SUBVECTOR: {
8135 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
8136 }
8137 case ISD::INSERT_VECTOR_ELT: {
8138 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
8139 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
8140 }
8141 case ISD::UNDEF:
8142 // Could be anything.
8143 return false;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008144
Matt Arsenault687ec752018-10-22 16:27:27 +00008145 case ISD::BITCAST: {
8146    // Hack around the mess we make when legalizing extract_vector_elt.
8147 SDValue Src = Op.getOperand(0);
8148 if (Src.getValueType() == MVT::i16 &&
8149 Src.getOpcode() == ISD::TRUNCATE) {
8150 SDValue TruncSrc = Src.getOperand(0);
8151 if (TruncSrc.getValueType() == MVT::i32 &&
8152 TruncSrc.getOpcode() == ISD::BITCAST &&
8153 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
8154 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
8155 }
8156 }
8157
8158 return false;
8159 }
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008160 case ISD::INTRINSIC_WO_CHAIN: {
8161 unsigned IntrinsicID
8162 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8163 // TODO: Handle more intrinsics
8164 switch (IntrinsicID) {
8165 case Intrinsic::amdgcn_cvt_pkrtz:
Matt Arsenault940e6072018-08-10 19:20:17 +00008166 case Intrinsic::amdgcn_cubeid:
8167 case Intrinsic::amdgcn_frexp_mant:
8168 case Intrinsic::amdgcn_fdot2:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008169 return true;
8170 default:
8171 break;
8172 }
Matt Arsenault5bb9d792018-08-10 17:57:12 +00008173
8174 LLVM_FALLTHROUGH;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008175 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008176 default:
8177 return denormalsEnabledForType(Op.getValueType()) &&
8178 DAG.isKnownNeverSNaN(Op);
8179 }
8180
8181 llvm_unreachable("invalid operation");
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008182}
8183
Matt Arsenault9cd90712016-04-14 01:42:16 +00008184// Constant fold canonicalize.
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008185SDValue SITargetLowering::getCanonicalConstantFP(
8186 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
8187 // Flush denormals to 0 if not enabled.
8188 if (C.isDenormal() && !denormalsEnabledForType(VT))
8189 return DAG.getConstantFP(0.0, SL, VT);
8190
8191 if (C.isNaN()) {
8192 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
8193 if (C.isSignaling()) {
8194 // Quiet a signaling NaN.
8195 // FIXME: Is this supposed to preserve payload bits?
8196 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8197 }
8198
8199 // Make sure it is the canonical NaN bitpattern.
8200 //
8201 // TODO: Can we use -1 as the canonical NaN value since it's an inline
8202 // immediate?
8203 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
8204 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8205 }
8206
8207 // Already canonical.
8208 return DAG.getConstantFP(C, SL, VT);
8209}
8210
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008211static bool vectorEltWillFoldAway(SDValue Op) {
8212 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
8213}
8214
Matt Arsenault9cd90712016-04-14 01:42:16 +00008215SDValue SITargetLowering::performFCanonicalizeCombine(
8216 SDNode *N,
8217 DAGCombinerInfo &DCI) const {
Matt Arsenault9cd90712016-04-14 01:42:16 +00008218 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008219 SDValue N0 = N->getOperand(0);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008220 EVT VT = N->getValueType(0);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008221
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008222 // fcanonicalize undef -> qnan
8223 if (N0.isUndef()) {
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008224 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
8225 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
8226 }
8227
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008228 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008230    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
Matt Arsenault9cd90712016-04-14 01:42:16 +00008231 }
8232
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008233 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
8234 // (fcanonicalize k)
8235 //
8236 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
8237
8238 // TODO: This could be better with wider vectors that will be split to v2f16,
8239 // and by considering uses, since there aren't that many packed operations.
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008240 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
8241 isTypeLegal(MVT::v2f16)) {
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008242 SDLoc SL(N);
8243 SDValue NewElts[2];
8244 SDValue Lo = N0.getOperand(0);
8245 SDValue Hi = N0.getOperand(1);
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008246 EVT EltVT = Lo.getValueType();
8247
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008248 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
8249 for (unsigned I = 0; I != 2; ++I) {
8250 SDValue Op = N0.getOperand(I);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008251 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8252 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
8253 CFP->getValueAPF());
8254 } else if (Op.isUndef()) {
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008255 // Handled below based on what the other operand is.
8256 NewElts[I] = Op;
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008257 } else {
8258 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
8259 }
8260 }
8261
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008262 // If one half is undef, and one is constant, prefer a splat vector rather
8263 // than the normal qNaN. If it's a register, prefer 0.0 since that's
8264 // cheaper to use and may be free with a packed operation.
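// For example, (v2f16) build_vector undef, 1.0 becomes the splat
// build_vector 1.0, 1.0, while build_vector undef, (fcanonicalize x)
// becomes build_vector 0.0, (fcanonicalize x).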
8265 if (NewElts[0].isUndef()) {
8266 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
8267 ? NewElts[1]
8268 : DAG.getConstantFP(0.0f, SL, EltVT);
8269 }
8270
8271 if (NewElts[1].isUndef()) {
8272 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
8273 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
8274 }
8275
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008276 return DAG.getBuildVector(VT, SL, NewElts);
8277 }
8278 }
8279
Matt Arsenault687ec752018-10-22 16:27:27 +00008280 unsigned SrcOpc = N0.getOpcode();
8281
8282 // If it's free to do so, push canonicalizes further up the source, which may
8283 // find a canonical source.
8284 //
8285 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
8286 // sNaNs.
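// For example, fcanonicalize (fminnum x, 2.0) becomes
// fminnum (fcanonicalize x), 2.0, since the constant operand is already
// canonical.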
8287 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
8288 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
8289 if (CRHS && N0.hasOneUse()) {
8290 SDLoc SL(N);
8291 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
8292 N0.getOperand(0));
8293 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
8294 DCI.AddToWorklist(Canon0.getNode());
8295
8296 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
8297 }
8298 }
8299
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008300 return isCanonicalized(DAG, N0) ? N0 : SDValue();
Matt Arsenault9cd90712016-04-14 01:42:16 +00008301}
8302
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008303static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8304 switch (Opc) {
8305 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008306 case ISD::FMAXNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008307 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008308 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008309 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008310 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008311 return AMDGPUISD::UMAX3;
8312 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008313 case ISD::FMINNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008314 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008315 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008316 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008317 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008318 return AMDGPUISD::UMIN3;
8319 default:
8320 llvm_unreachable("Not a min/max opcode");
8321 }
8322}
8323
Matt Arsenault10268f92017-02-27 22:40:39 +00008324SDValue SITargetLowering::performIntMed3ImmCombine(
8325 SelectionDAG &DAG, const SDLoc &SL,
8326 SDValue Op0, SDValue Op1, bool Signed) const {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008327 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8328 if (!K1)
8329 return SDValue();
8330
8331 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8332 if (!K0)
8333 return SDValue();
8334
Matt Arsenaultf639c322016-01-28 20:53:42 +00008335 if (Signed) {
8336 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8337 return SDValue();
8338 } else {
8339 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8340 return SDValue();
8341 }
8342
8343 EVT VT = K0->getValueType(0);
Matt Arsenault10268f92017-02-27 22:40:39 +00008344 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8345 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8346 return DAG.getNode(Med3Opc, SL, VT,
8347 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8348 }
Tom Stellard115a6152016-11-10 16:02:37 +00008349
Matt Arsenault10268f92017-02-27 22:40:39 +00008350 // If there isn't a 16-bit med3 operation, convert to 32-bit.
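// For example, an i16 min(max(x, 1), 10) on a target without v_med3_i16
// becomes trunc (i32 smed3 (sext x), 1, 10).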
Tom Stellard115a6152016-11-10 16:02:37 +00008351 MVT NVT = MVT::i32;
8352 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8353
Matt Arsenault10268f92017-02-27 22:40:39 +00008354 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8355 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8356 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
Tom Stellard115a6152016-11-10 16:02:37 +00008357
Matt Arsenault10268f92017-02-27 22:40:39 +00008358 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8359 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008360}
8361
Matt Arsenault6b114d22017-08-30 01:20:17 +00008362static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8363 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8364 return C;
8365
8366 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8367 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8368 return C;
8369 }
8370
8371 return nullptr;
8372}
8373
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008374SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8375 const SDLoc &SL,
8376 SDValue Op0,
8377 SDValue Op1) const {
Matt Arsenault6b114d22017-08-30 01:20:17 +00008378 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008379 if (!K1)
8380 return SDValue();
8381
Matt Arsenault6b114d22017-08-30 01:20:17 +00008382 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
Matt Arsenaultf639c322016-01-28 20:53:42 +00008383 if (!K0)
8384 return SDValue();
8385
8386 // Ordered >= (although NaN inputs should have folded away by now).
8387 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8388 if (Cmp == APFloat::cmpGreaterThan)
8389 return SDValue();
8390
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008391 const MachineFunction &MF = DAG.getMachineFunction();
8392 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8393
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008394 // TODO: Check IEEE bit enabled?
Matt Arsenault6b114d22017-08-30 01:20:17 +00008395 EVT VT = Op0.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008396 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008397 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8398 // hardware fmed3 behavior converting to a min.
8399 // FIXME: Should this be allowing -0.0?
8400 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8401 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8402 }
8403
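// Otherwise this forms a med3, e.g. fminnum (fmaxnum x, 2.0), 4.0 becomes
// fmed3 x, 2.0, 4.0 when x is known not to be a signaling NaN.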
Matt Arsenault6b114d22017-08-30 01:20:17 +00008404 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8405 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8406 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8407 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8408 // then give the other result, which is different from med3 with a NaN
8409 // input.
8410 SDValue Var = Op0.getOperand(0);
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +00008411 if (!DAG.isKnownNeverSNaN(Var))
Matt Arsenault6b114d22017-08-30 01:20:17 +00008412 return SDValue();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008413
Matt Arsenaultebf46142018-09-18 02:34:54 +00008414 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8415
8416 if ((!K0->hasOneUse() ||
8417 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8418 (!K1->hasOneUse() ||
8419 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8420 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8421 Var, SDValue(K0, 0), SDValue(K1, 0));
8422 }
Matt Arsenault6b114d22017-08-30 01:20:17 +00008423 }
Matt Arsenaultf639c322016-01-28 20:53:42 +00008424
Matt Arsenault6b114d22017-08-30 01:20:17 +00008425 return SDValue();
Matt Arsenaultf639c322016-01-28 20:53:42 +00008426}
8427
8428SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8429 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008430 SelectionDAG &DAG = DCI.DAG;
8431
Matt Arsenault79a45db2017-02-22 23:53:37 +00008432 EVT VT = N->getValueType(0);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008433 unsigned Opc = N->getOpcode();
8434 SDValue Op0 = N->getOperand(0);
8435 SDValue Op1 = N->getOperand(1);
8436
8437 // Only do this if the inner op has one use since this will just increase
8438 // register pressure for no benefit.
8439
Matt Arsenault79a45db2017-02-22 23:53:37 +00008440 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
Neil Henninge85f6bd2019-03-19 15:50:24 +00008441 !VT.isVector() &&
8442 (VT == MVT::i32 || VT == MVT::f32 ||
8443 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
Matt Arsenault5b39b342016-01-28 20:53:48 +00008444 // max(max(a, b), c) -> max3(a, b, c)
8445 // min(min(a, b), c) -> min3(a, b, c)
8446 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8447 SDLoc DL(N);
8448 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8449 DL,
8450 N->getValueType(0),
8451 Op0.getOperand(0),
8452 Op0.getOperand(1),
8453 Op1);
8454 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008455
Matt Arsenault5b39b342016-01-28 20:53:48 +00008456 // Try commuted.
8457 // max(a, max(b, c)) -> max3(a, b, c)
8458 // min(a, min(b, c)) -> min3(a, b, c)
8459 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8460 SDLoc DL(N);
8461 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8462 DL,
8463 N->getValueType(0),
8464 Op0,
8465 Op1.getOperand(0),
8466 Op1.getOperand(1));
8467 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008468 }
8469
Matt Arsenaultf639c322016-01-28 20:53:42 +00008470 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
8471 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8472 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8473 return Med3;
8474 }
8475
8476 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8477 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8478 return Med3;
8479 }
8480
8481 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
Matt Arsenault5b39b342016-01-28 20:53:48 +00008482 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008483 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
Matt Arsenault5b39b342016-01-28 20:53:48 +00008484 (Opc == AMDGPUISD::FMIN_LEGACY &&
8485 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenault79a45db2017-02-22 23:53:37 +00008486 (VT == MVT::f32 || VT == MVT::f64 ||
Matt Arsenault6b114d22017-08-30 01:20:17 +00008487 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8488 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008489 Op0.hasOneUse()) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008490 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8491 return Res;
8492 }
8493
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008494 return SDValue();
8495}
8496
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008497static bool isClampZeroToOne(SDValue A, SDValue B) {
8498 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8499 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8500 // FIXME: Should this be allowing -0.0?
8501 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8502 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8503 }
8504 }
8505
8506 return false;
8507}
8508
8509// FIXME: Should only worry about snans for version with chain.
8510SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8511 DAGCombinerInfo &DCI) const {
8512 EVT VT = N->getValueType(0);
8513 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8514 // NaNs. With a NaN input, the order of the operands may change the result.
8515
8516 SelectionDAG &DAG = DCI.DAG;
8517 SDLoc SL(N);
8518
8519 SDValue Src0 = N->getOperand(0);
8520 SDValue Src1 = N->getOperand(1);
8521 SDValue Src2 = N->getOperand(2);
8522
8523 if (isClampZeroToOne(Src0, Src1)) {
8524 // const_a, const_b, x -> clamp is safe in all cases including signaling
8525 // nans.
8526 // FIXME: Should this be allowing -0.0?
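// For example, fmed3(0.0, 1.0, x) and fmed3(1.0, 0.0, x) both become
// clamp x.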
8527 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8528 }
8529
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008530 const MachineFunction &MF = DAG.getMachineFunction();
8531 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8532
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008533 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8534 // handling no dx10-clamp?
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008535 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008536 // If NaNs are clamped to 0, we are free to reorder the inputs.
8537
8538 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8539 std::swap(Src0, Src1);
8540
8541 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8542 std::swap(Src1, Src2);
8543
8544 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8545 std::swap(Src0, Src1);
8546
8547 if (isClampZeroToOne(Src1, Src2))
8548 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8549 }
8550
8551 return SDValue();
8552}
8553
Matt Arsenault1f17c662017-02-22 00:27:34 +00008554SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8555 DAGCombinerInfo &DCI) const {
8556 SDValue Src0 = N->getOperand(0);
8557 SDValue Src1 = N->getOperand(1);
8558 if (Src0.isUndef() && Src1.isUndef())
8559 return DCI.DAG.getUNDEF(N->getValueType(0));
8560 return SDValue();
8561}
8562
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008563SDValue SITargetLowering::performExtractVectorEltCombine(
8564 SDNode *N, DAGCombinerInfo &DCI) const {
8565 SDValue Vec = N->getOperand(0);
Matt Arsenault8cbb4882017-09-20 21:01:24 +00008566 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008567
8568 EVT VecVT = Vec.getValueType();
8569 EVT EltVT = VecVT.getVectorElementType();
8570
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008571 if ((Vec.getOpcode() == ISD::FNEG ||
8572 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008573 SDLoc SL(N);
8574 EVT EltVT = N->getValueType(0);
8575 SDValue Idx = N->getOperand(1);
8576 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8577 Vec.getOperand(0), Idx);
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008578 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008579 }
8580
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008581 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8582 // =>
8583 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8584 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8585 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
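// For example, extract_vector_elt (fadd v2f16:a, v2f16:b), 1 becomes
// fadd (extract_vector_elt a, 1), (extract_vector_elt b, 1).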
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008586 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008587 SDLoc SL(N);
8588 EVT EltVT = N->getValueType(0);
8589 SDValue Idx = N->getOperand(1);
8590 unsigned Opc = Vec.getOpcode();
8591
8592 switch(Opc) {
8593 default:
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008594 break;
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008595 // TODO: Support other binary operations.
8596 case ISD::FADD:
Matt Arsenaulta8160732018-08-15 21:34:06 +00008597 case ISD::FSUB:
8598 case ISD::FMUL:
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008599 case ISD::ADD:
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008600 case ISD::UMIN:
8601 case ISD::UMAX:
8602 case ISD::SMIN:
8603 case ISD::SMAX:
8604 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008605 case ISD::FMINNUM:
8606 case ISD::FMAXNUM_IEEE:
8607 case ISD::FMINNUM_IEEE: {
Matt Arsenaulta8160732018-08-15 21:34:06 +00008608 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8609 Vec.getOperand(0), Idx);
8610 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8611 Vec.getOperand(1), Idx);
8612
8613 DCI.AddToWorklist(Elt0.getNode());
8614 DCI.AddToWorklist(Elt1.getNode());
8615 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8616 }
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008617 }
8618 }
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008619
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008620 unsigned VecSize = VecVT.getSizeInBits();
8621 unsigned EltSize = EltVT.getSizeInBits();
8622
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008623 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8624 // This eliminates a non-constant index and the subsequent movrel or scratch access.
8625 // Sub-dword vectors of 2 dwords or less have a better implementation.
8626 // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8627 // instructions.
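// For example, for v4i32 this becomes
// select (Idx==3, elt3, select (Idx==2, elt2, select (Idx==1, elt1, elt0))).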
8628 if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8629 !isa<ConstantSDNode>(N->getOperand(1))) {
8630 SDLoc SL(N);
8631 SDValue Idx = N->getOperand(1);
8632 EVT IdxVT = Idx.getValueType();
8633 SDValue V;
8634 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8635 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8636 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8637 if (I == 0)
8638 V = Elt;
8639 else
8640 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8641 }
8642 return V;
8643 }
8644
8645 if (!DCI.isBeforeLegalize())
8646 return SDValue();
8647
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008648 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8649 // elements. This exposes more load reduction opportunities by replacing
8650 // multiple small extract_vector_elements with a single 32-bit extract.
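// For example, i8 (extract_vector_elt (load v8i8:x), 5) lives in bits
// [47:40] of the load, so it becomes a bitcast to v2i32, an extract of
// element 1, a shift right by 8 and a truncate.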
8651 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
Matt Arsenaultbf07a502018-08-31 15:39:52 +00008652 if (isa<MemSDNode>(Vec) &&
8653 EltSize <= 16 &&
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008654 EltVT.isByteSized() &&
8655 VecSize > 32 &&
8656 VecSize % 32 == 0 &&
8657 Idx) {
8658 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8659
8660 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8661 unsigned EltIdx = BitIndex / 32;
8662 unsigned LeftoverBitIdx = BitIndex % 32;
8663 SDLoc SL(N);
8664
8665 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8666 DCI.AddToWorklist(Cast.getNode());
8667
8668 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8669 DAG.getConstant(EltIdx, SL, MVT::i32));
8670 DCI.AddToWorklist(Elt.getNode());
8671 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8672 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8673 DCI.AddToWorklist(Srl.getNode());
8674
8675 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8676 DCI.AddToWorklist(Trunc.getNode());
8677 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8678 }
8679
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008680 return SDValue();
8681}
8682
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008683SDValue
8684SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8685 DAGCombinerInfo &DCI) const {
8686 SDValue Vec = N->getOperand(0);
8687 SDValue Idx = N->getOperand(2);
8688 EVT VecVT = Vec.getValueType();
8689 EVT EltVT = VecVT.getVectorElementType();
8690 unsigned VecSize = VecVT.getSizeInBits();
8691 unsigned EltSize = EltVT.getSizeInBits();
8692
8693 // INSERT_VECTOR_ELT (<n x e>, var-idx)
8694 // => BUILD_VECTOR n x select (e, const-idx)
8695 // This eliminates a non-constant index and the subsequent movrel or scratch access.
8696 // Sub-dword vectors of 2 dwords or less have a better implementation.
8697 // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8698 // instructions.
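// For example, insert_vector_elt v4i32:v, x, Idx becomes
// build_vector (select (Idx==0, x, v0)), ..., (select (Idx==3, x, v3)).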
8699 if (isa<ConstantSDNode>(Idx) ||
8700 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8701 return SDValue();
8702
8703 SelectionDAG &DAG = DCI.DAG;
8704 SDLoc SL(N);
8705 SDValue Ins = N->getOperand(1);
8706 EVT IdxVT = Idx.getValueType();
8707
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008708 SmallVector<SDValue, 16> Ops;
8709 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8710 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8711 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8712 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8713 Ops.push_back(V);
8714 }
8715
8716 return DAG.getBuildVector(VecVT, SL, Ops);
8717}
8718
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008719unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8720 const SDNode *N0,
8721 const SDNode *N1) const {
8722 EVT VT = N0->getValueType(0);
8723
Matt Arsenault770ec862016-12-22 03:55:35 +00008724 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8725 // support denormals ever.
Stanislav Mekhanoshin28a19362019-05-04 04:20:37 +00008726 if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8727 (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
8728 getSubtarget()->hasMadF16())) &&
8729 isOperationLegal(ISD::FMAD, VT))
Matt Arsenault770ec862016-12-22 03:55:35 +00008730 return ISD::FMAD;
8731
8732 const TargetOptions &Options = DAG.getTarget().Options;
Amara Emersond28f0cd42017-05-01 15:17:51 +00008733 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
Michael Berg7acc81b2018-05-04 18:48:20 +00008734 (N0->getFlags().hasAllowContract() &&
8735 N1->getFlags().hasAllowContract())) &&
Matt Arsenault770ec862016-12-22 03:55:35 +00008736 isFMAFasterThanFMulAndFAdd(VT)) {
8737 return ISD::FMA;
8738 }
8739
8740 return 0;
8741}
8742
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008743// For a reassociatable opcode perform:
8744// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
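// For example, add (uniform s0), (add (divergent v0), (uniform s1)) becomes
// add (add s0, s1), v0, keeping the uniform half as a scalar operation.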
8745SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8746 SelectionDAG &DAG) const {
8747 EVT VT = N->getValueType(0);
8748 if (VT != MVT::i32 && VT != MVT::i64)
8749 return SDValue();
8750
8751 unsigned Opc = N->getOpcode();
8752 SDValue Op0 = N->getOperand(0);
8753 SDValue Op1 = N->getOperand(1);
8754
8755 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8756 return SDValue();
8757
8758 if (Op0->isDivergent())
8759 std::swap(Op0, Op1);
8760
8761 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8762 return SDValue();
8763
8764 SDValue Op2 = Op1.getOperand(1);
8765 Op1 = Op1.getOperand(0);
8766 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
8767 return SDValue();
8768
8769 if (Op1->isDivergent())
8770 std::swap(Op1, Op2);
8771
8772 // If either operand is constant this will conflict with
8773 // DAGCombiner::ReassociateOps().
Stanislav Mekhanoshinda1628e2019-02-26 20:56:25 +00008774 if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
8775 DAG.isConstantIntBuildVectorOrConstantInt(Op1))
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008776 return SDValue();
8777
8778 SDLoc SL(N);
8779 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
8780 return DAG.getNode(Opc, SL, VT, Add1, Op2);
8781}
8782
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008783static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8784 EVT VT,
8785 SDValue N0, SDValue N1, SDValue N2,
8786 bool Signed) {
8787 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8788 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8789 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8790 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8791}
8792
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008793SDValue SITargetLowering::performAddCombine(SDNode *N,
8794 DAGCombinerInfo &DCI) const {
8795 SelectionDAG &DAG = DCI.DAG;
8796 EVT VT = N->getValueType(0);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008797 SDLoc SL(N);
8798 SDValue LHS = N->getOperand(0);
8799 SDValue RHS = N->getOperand(1);
8800
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008801 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8802 && Subtarget->hasMad64_32() &&
8803 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8804 VT.getScalarSizeInBits() <= 64) {
8805 if (LHS.getOpcode() != ISD::MUL)
8806 std::swap(LHS, RHS);
8807
8808 SDValue MulLHS = LHS.getOperand(0);
8809 SDValue MulRHS = LHS.getOperand(1);
8810 SDValue AddRHS = RHS;
8811
8812 // TODO: Maybe restrict if SGPR inputs.
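// For example, i64 (add (mul (zext i32 a), (zext i32 b)), c) becomes
// mad_u64_u32 a, b, c here.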
8813 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8814 numBitsUnsigned(MulRHS, DAG) <= 32) {
8815 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8816 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8817 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8818 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8819 }
8820
8821 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8822 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8823 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8824 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8825 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8826 }
8827
8828 return SDValue();
8829 }
8830
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008831 if (SDValue V = reassociateScalarOps(N, DAG)) {
8832 return V;
8833 }
8834
Farhana Aleen07e61232018-05-02 18:16:39 +00008835 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008836 return SDValue();
8837
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008838 // add x, zext (setcc) => addcarry x, 0, setcc
8839 // add x, sext (setcc) => subcarry x, 0, setcc
8840 unsigned Opc = LHS.getOpcode();
8841 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008842 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008843 std::swap(RHS, LHS);
8844
8845 Opc = RHS.getOpcode();
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008846 switch (Opc) {
8847 default: break;
8848 case ISD::ZERO_EXTEND:
8849 case ISD::SIGN_EXTEND:
8850 case ISD::ANY_EXTEND: {
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008851 auto Cond = RHS.getOperand(0);
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00008852 if (!isBoolSGPR(Cond))
Stanislav Mekhanoshin3ed38c62017-06-21 23:46:22 +00008853 break;
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008854 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8855 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8856 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8857 return DAG.getNode(Opc, SL, VTList, Args);
8858 }
8859 case ISD::ADDCARRY: {
8860 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8861 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8862 if (!C || C->getZExtValue() != 0) break;
8863 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8864 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8865 }
8866 }
8867 return SDValue();
8868}
8869
8870SDValue SITargetLowering::performSubCombine(SDNode *N,
8871 DAGCombinerInfo &DCI) const {
8872 SelectionDAG &DAG = DCI.DAG;
8873 EVT VT = N->getValueType(0);
8874
8875 if (VT != MVT::i32)
8876 return SDValue();
8877
8878 SDLoc SL(N);
8879 SDValue LHS = N->getOperand(0);
8880 SDValue RHS = N->getOperand(1);
8881
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008882 if (LHS.getOpcode() == ISD::SUBCARRY) {
8883 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8884 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
Stanislav Mekhanoshin42e229e2019-02-21 02:58:00 +00008885 if (!C || !C->isNullValue())
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008886 return SDValue();
8887 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8888 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8889 }
8890 return SDValue();
8891}
8892
8893SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8894 DAGCombinerInfo &DCI) const {
8895
8896 if (N->getValueType(0) != MVT::i32)
8897 return SDValue();
8898
8899 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8900 if (!C || C->getZExtValue() != 0)
8901 return SDValue();
8902
8903 SelectionDAG &DAG = DCI.DAG;
8904 SDValue LHS = N->getOperand(0);
8905
8906 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8907 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8908 unsigned LHSOpc = LHS.getOpcode();
8909 unsigned Opc = N->getOpcode();
8910 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8911 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8912 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8913 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008914 }
8915 return SDValue();
8916}
8917
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008918SDValue SITargetLowering::performFAddCombine(SDNode *N,
8919 DAGCombinerInfo &DCI) const {
8920 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8921 return SDValue();
8922
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008923 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault770ec862016-12-22 03:55:35 +00008924 EVT VT = N->getValueType(0);
Matt Arsenault770ec862016-12-22 03:55:35 +00008925
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008926 SDLoc SL(N);
8927 SDValue LHS = N->getOperand(0);
8928 SDValue RHS = N->getOperand(1);
8929
8930 // These should really be instruction patterns, but writing patterns with
8931 // source modifiers is a pain.
8932
8933 // fadd (fadd (a, a), b) -> mad 2.0, a, b
8934 if (LHS.getOpcode() == ISD::FADD) {
8935 SDValue A = LHS.getOperand(0);
8936 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008937 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008938 if (FusedOp != 0) {
8939 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008940 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008941 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008942 }
8943 }
8944
8945 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8946 if (RHS.getOpcode() == ISD::FADD) {
8947 SDValue A = RHS.getOperand(0);
8948 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008949 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008950 if (FusedOp != 0) {
8951 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008952 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008953 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008954 }
8955 }
8956
8957 return SDValue();
8958}
8959
8960SDValue SITargetLowering::performFSubCombine(SDNode *N,
8961 DAGCombinerInfo &DCI) const {
8962 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8963 return SDValue();
8964
8965 SelectionDAG &DAG = DCI.DAG;
8966 SDLoc SL(N);
8967 EVT VT = N->getValueType(0);
8968 assert(!VT.isVector());
8969
8970 // Try to get the fneg to fold into the source modifier. This undoes generic
8971 // DAG combines and folds them into the mad.
8972 //
8973 // Only do this if we are not trying to support denormals. v_mad_f32 does
8974 // not support denormals ever.
Matt Arsenault770ec862016-12-22 03:55:35 +00008975 SDValue LHS = N->getOperand(0);
8976 SDValue RHS = N->getOperand(1);
8977 if (LHS.getOpcode() == ISD::FADD) {
8978 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8979 SDValue A = LHS.getOperand(0);
8980 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008981 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008982 if (FusedOp != 0) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008983 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8984 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8985
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008986 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008987 }
8988 }
Matt Arsenault770ec862016-12-22 03:55:35 +00008989 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008990
Matt Arsenault770ec862016-12-22 03:55:35 +00008991 if (RHS.getOpcode() == ISD::FADD) {
8992 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008993
Matt Arsenault770ec862016-12-22 03:55:35 +00008994 SDValue A = RHS.getOperand(0);
8995 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008996 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008997 if (FusedOp != 0) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008998 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008999 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009000 }
9001 }
9002 }
9003
9004 return SDValue();
9005}
9006
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009007SDValue SITargetLowering::performFMACombine(SDNode *N,
9008 DAGCombinerInfo &DCI) const {
9009 SelectionDAG &DAG = DCI.DAG;
9010 EVT VT = N->getValueType(0);
9011 SDLoc SL(N);
9012
Stanislav Mekhanoshin0e858b02019-02-09 00:34:21 +00009013 if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009014 return SDValue();
9015
9016 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
9017 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
9018 SDValue Op1 = N->getOperand(0);
9019 SDValue Op2 = N->getOperand(1);
9020 SDValue FMA = N->getOperand(2);
9021
9022 if (FMA.getOpcode() != ISD::FMA ||
9023 Op1.getOpcode() != ISD::FP_EXTEND ||
9024 Op2.getOpcode() != ISD::FP_EXTEND)
9025 return SDValue();
9026
9027 // fdot2_f32_f16 always flushes fp32 denormal operands and output to zero,
9028 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
9029 // is sufficient to allow generating fdot2.
9030 const TargetOptions &Options = DAG.getTarget().Options;
9031 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
9032 (N->getFlags().hasAllowContract() &&
9033 FMA->getFlags().hasAllowContract())) {
9034 Op1 = Op1.getOperand(0);
9035 Op2 = Op2.getOperand(0);
9036 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9037 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9038 return SDValue();
9039
9040 SDValue Vec1 = Op1.getOperand(0);
9041 SDValue Idx1 = Op1.getOperand(1);
9042 SDValue Vec2 = Op2.getOperand(0);
9043
9044 SDValue FMAOp1 = FMA.getOperand(0);
9045 SDValue FMAOp2 = FMA.getOperand(1);
9046 SDValue FMAAcc = FMA.getOperand(2);
9047
9048 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
9049 FMAOp2.getOpcode() != ISD::FP_EXTEND)
9050 return SDValue();
9051
9052 FMAOp1 = FMAOp1.getOperand(0);
9053 FMAOp2 = FMAOp2.getOperand(0);
9054 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9055 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9056 return SDValue();
9057
9058 SDValue Vec3 = FMAOp1.getOperand(0);
9059 SDValue Vec4 = FMAOp2.getOperand(0);
9060 SDValue Idx2 = FMAOp1.getOperand(1);
9061
9062 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
9063 // Idx1 and Idx2 cannot be the same.
9064 Idx1 == Idx2)
9065 return SDValue();
9066
9067 if (Vec1 == Vec2 || Vec3 == Vec4)
9068 return SDValue();
9069
9070 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
9071 return SDValue();
9072
9073 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00009074 (Vec1 == Vec4 && Vec2 == Vec3)) {
9075 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
9076 DAG.getTargetConstant(0, SL, MVT::i1));
9077 }
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009078 }
9079 return SDValue();
9080}
9081
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009082SDValue SITargetLowering::performSetCCCombine(SDNode *N,
9083 DAGCombinerInfo &DCI) const {
9084 SelectionDAG &DAG = DCI.DAG;
9085 SDLoc SL(N);
9086
9087 SDValue LHS = N->getOperand(0);
9088 SDValue RHS = N->getOperand(1);
9089 EVT VT = LHS.getValueType();
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009090 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
9091
9092 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
9093 if (!CRHS) {
9094 CRHS = dyn_cast<ConstantSDNode>(LHS);
9095 if (CRHS) {
9096 std::swap(LHS, RHS);
9097 CC = getSetCCSwappedOperands(CC);
9098 }
9099 }
9100
Stanislav Mekhanoshin3b117942018-06-16 03:46:59 +00009101 if (CRHS) {
9102 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
9103 isBoolSGPR(LHS.getOperand(0))) {
9104 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
9105 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
9106 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
9107 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
9108 if ((CRHS->isAllOnesValue() &&
9109 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
9110 (CRHS->isNullValue() &&
9111 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
9112 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9113 DAG.getConstant(-1, SL, MVT::i1));
9114 if ((CRHS->isAllOnesValue() &&
9115 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
9116 (CRHS->isNullValue() &&
9117 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
9118 return LHS.getOperand(0);
9119 }
9120
9121 uint64_t CRHSVal = CRHS->getZExtValue();
9122 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
9123 LHS.getOpcode() == ISD::SELECT &&
9124 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9125 isa<ConstantSDNode>(LHS.getOperand(2)) &&
9126 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
9127 isBoolSGPR(LHS.getOperand(0))) {
9128 // Given CT != CF:
9129 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
9130 // setcc (select cc, CT, CF), CF, ne => cc
9131 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
9132 // setcc (select cc, CT, CF), CT, eq => cc
9133 uint64_t CT = LHS.getConstantOperandVal(1);
9134 uint64_t CF = LHS.getConstantOperandVal(2);
9135
9136 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
9137 (CT == CRHSVal && CC == ISD::SETNE))
9138 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9139 DAG.getConstant(-1, SL, MVT::i1));
9140 if ((CF == CRHSVal && CC == ISD::SETNE) ||
9141 (CT == CRHSVal && CC == ISD::SETEQ))
9142 return LHS.getOperand(0);
9143 }
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009144 }
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009145
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00009146 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
9147 VT != MVT::f16))
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009148 return SDValue();
9149
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009150 // Match isinf/isfinite pattern
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009151 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009152 // (fcmp one (fabs x), inf) -> (fp_class x,
9153 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
9154 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009155 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
9156 if (!CRHS)
9157 return SDValue();
9158
9159 const APFloat &APF = CRHS->getValueAPF();
9160 if (APF.isInfinity() && !APF.isNegative()) {
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009161 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
9162 SIInstrFlags::N_INFINITY;
9163 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
9164 SIInstrFlags::P_ZERO |
9165 SIInstrFlags::N_NORMAL |
9166 SIInstrFlags::P_NORMAL |
9167 SIInstrFlags::N_SUBNORMAL |
9168 SIInstrFlags::P_SUBNORMAL;
9169 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009170 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
9171 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009172 }
9173 }
9174
9175 return SDValue();
9176}
9177
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009178SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
9179 DAGCombinerInfo &DCI) const {
9180 SelectionDAG &DAG = DCI.DAG;
9181 SDLoc SL(N);
9182 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
9183
9184 SDValue Src = N->getOperand(0);
9185 SDValue Srl = N->getOperand(0);
9186 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
9187 Srl = Srl.getOperand(0);
9188
9189 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
9190 if (Srl.getOpcode() == ISD::SRL) {
9191 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
9192 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
9193 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
9194
9195 if (const ConstantSDNode *C =
9196 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
9197 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
9198 EVT(MVT::i32));
9199
9200 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
9201 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
9202 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
9203 MVT::f32, Srl);
9204 }
9205 }
9206 }
9207
9208 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
9209
Craig Topperd0af7e82017-04-28 05:31:46 +00009210 KnownBits Known;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009211 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9212 !DCI.isBeforeLegalizeOps());
9213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Stanislav Mekhanoshined0d6c62019-01-09 02:24:22 +00009214 if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009215 DCI.CommitTargetLoweringOpt(TLO);
9216 }
9217
9218 return SDValue();
9219}
9220
Tom Stellard1b95fed2018-05-24 05:28:34 +00009221SDValue SITargetLowering::performClampCombine(SDNode *N,
9222 DAGCombinerInfo &DCI) const {
9223 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
9224 if (!CSrc)
9225 return SDValue();
9226
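// Constant fold the clamp: a source that compares less than 0.0 (or is NaN
// with dx10_clamp enabled) folds to 0.0, one greater than 1.0 folds to 1.0,
// and anything else is returned unchanged.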
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009227 const MachineFunction &MF = DCI.DAG.getMachineFunction();
Tom Stellard1b95fed2018-05-24 05:28:34 +00009228 const APFloat &F = CSrc->getValueAPF();
9229 APFloat Zero = APFloat::getZero(F.getSemantics());
9230 APFloat::cmpResult Cmp0 = F.compare(Zero);
9231 if (Cmp0 == APFloat::cmpLessThan ||
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009232 (Cmp0 == APFloat::cmpUnordered &&
9233 MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
Tom Stellard1b95fed2018-05-24 05:28:34 +00009234 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
9235 }
9236
9237 APFloat One(F.getSemantics(), "1.0");
9238 APFloat::cmpResult Cmp1 = F.compare(One);
9239 if (Cmp1 == APFloat::cmpGreaterThan)
9240 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
9241
9242 return SDValue(CSrc, 0);
9243}
9244
9245
Tom Stellard75aadc22012-12-11 21:25:42 +00009246SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
9247 DAGCombinerInfo &DCI) const {
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009248 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
9249 return SDValue();
Tom Stellard75aadc22012-12-11 21:25:42 +00009250 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00009251 default:
9252 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009253 case ISD::ADD:
9254 return performAddCombine(N, DCI);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009255 case ISD::SUB:
9256 return performSubCombine(N, DCI);
9257 case ISD::ADDCARRY:
9258 case ISD::SUBCARRY:
9259 return performAddCarrySubCarryCombine(N, DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009260 case ISD::FADD:
9261 return performFAddCombine(N, DCI);
9262 case ISD::FSUB:
9263 return performFSubCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009264 case ISD::SETCC:
9265 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00009266 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00009267 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00009268 case ISD::FMAXNUM_IEEE:
9269 case ISD::FMINNUM_IEEE:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00009270 case ISD::SMAX:
9271 case ISD::SMIN:
9272 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00009273 case ISD::UMIN:
9274 case AMDGPUISD::FMIN_LEGACY:
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009275 case AMDGPUISD::FMAX_LEGACY:
9276 return performMinMaxCombine(N, DCI);
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009277 case ISD::FMA:
9278 return performFMACombine(N, DCI);
Matt Arsenault90083d32018-06-07 09:54:49 +00009279 case ISD::LOAD: {
9280 if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
9281 return Widended;
9282 LLVM_FALLTHROUGH;
9283 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009284 case ISD::STORE:
9285 case ISD::ATOMIC_LOAD:
9286 case ISD::ATOMIC_STORE:
9287 case ISD::ATOMIC_CMP_SWAP:
9288 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
9289 case ISD::ATOMIC_SWAP:
9290 case ISD::ATOMIC_LOAD_ADD:
9291 case ISD::ATOMIC_LOAD_SUB:
9292 case ISD::ATOMIC_LOAD_AND:
9293 case ISD::ATOMIC_LOAD_OR:
9294 case ISD::ATOMIC_LOAD_XOR:
9295 case ISD::ATOMIC_LOAD_NAND:
9296 case ISD::ATOMIC_LOAD_MIN:
9297 case ISD::ATOMIC_LOAD_MAX:
9298 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009299 case ISD::ATOMIC_LOAD_UMAX:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009300 case ISD::ATOMIC_LOAD_FADD:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009301 case AMDGPUISD::ATOMIC_INC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009302 case AMDGPUISD::ATOMIC_DEC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009303 case AMDGPUISD::ATOMIC_LOAD_FMIN:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009304 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009305 if (DCI.isBeforeLegalize())
9306 break;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009307 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00009308 case ISD::AND:
9309 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009310 case ISD::OR:
9311 return performOrCombine(N, DCI);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00009312 case ISD::XOR:
9313 return performXorCombine(N, DCI);
Matt Arsenault8edfaee2017-03-31 19:53:03 +00009314 case ISD::ZERO_EXTEND:
9315 return performZeroExtendCombine(N, DCI);
Ryan Taylor00e063a2019-03-19 16:07:00 +00009316 case ISD::SIGN_EXTEND_INREG:
9317 return performSignExtendInRegCombine(N , DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009318 case AMDGPUISD::FP_CLASS:
9319 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00009320 case ISD::FCANONICALIZE:
9321 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009322 case AMDGPUISD::RCP:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009323 return performRcpCombine(N, DCI);
9324 case AMDGPUISD::FRACT:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009325 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00009326 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009327 case AMDGPUISD::RSQ_LEGACY:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009328 case AMDGPUISD::RCP_IFLAG:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009329 case AMDGPUISD::RSQ_CLAMP:
9330 case AMDGPUISD::LDEXP: {
9331 SDValue Src = N->getOperand(0);
9332 if (Src.isUndef())
9333 return Src;
9334 break;
9335 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009336 case ISD::SINT_TO_FP:
9337 case ISD::UINT_TO_FP:
9338 return performUCharToFloatCombine(N, DCI);
9339 case AMDGPUISD::CVT_F32_UBYTE0:
9340 case AMDGPUISD::CVT_F32_UBYTE1:
9341 case AMDGPUISD::CVT_F32_UBYTE2:
9342 case AMDGPUISD::CVT_F32_UBYTE3:
9343 return performCvtF32UByteNCombine(N, DCI);
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00009344 case AMDGPUISD::FMED3:
9345 return performFMed3Combine(N, DCI);
Matt Arsenault1f17c662017-02-22 00:27:34 +00009346 case AMDGPUISD::CVT_PKRTZ_F16_F32:
9347 return performCvtPkRTZCombine(N, DCI);
Tom Stellard1b95fed2018-05-24 05:28:34 +00009348 case AMDGPUISD::CLAMP:
9349 return performClampCombine(N, DCI);
Matt Arsenaulteb522e62017-02-27 22:15:25 +00009350 case ISD::SCALAR_TO_VECTOR: {
9351 SelectionDAG &DAG = DCI.DAG;
9352 EVT VT = N->getValueType(0);
9353
9354 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9355 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9356 SDLoc SL(N);
9357 SDValue Src = N->getOperand(0);
9358 EVT EltVT = Src.getValueType();
9359 if (EltVT == MVT::f16)
9360 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9361
9362 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9363 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9364 }
9365
9366 break;
9367 }
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00009368 case ISD::EXTRACT_VECTOR_ELT:
9369 return performExtractVectorEltCombine(N, DCI);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00009370 case ISD::INSERT_VECTOR_ELT:
9371 return performInsertVectorEltCombine(N, DCI);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009372 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00009373 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00009374}
Christian Konigd910b7d2013-02-26 17:52:16 +00009375
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009376/// Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00009377static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009378 switch (Idx) {
9379 default: return 0;
9380 case AMDGPU::sub0: return 0;
9381 case AMDGPU::sub1: return 1;
9382 case AMDGPU::sub2: return 2;
9383 case AMDGPU::sub3: return 3;
David Stuttardf77079f2019-01-14 11:55:24 +00009384 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
Christian Konig8e06e2a2013-04-10 08:39:08 +00009385 }
9386}
9387
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009388/// Adjust the writemask of MIMG instructions
Matt Arsenault68f05052017-12-04 22:18:27 +00009389SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9390 SelectionDAG &DAG) const {
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009391 unsigned Opcode = Node->getMachineOpcode();
9392
9393 // Subtract 1 because the vdata output is not a MachineSDNode operand.
9394 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9395 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9396 return Node; // not implemented for D16
9397
David Stuttardf77079f2019-01-14 11:55:24 +00009398 SDNode *Users[5] = { nullptr };
Tom Stellard54774e52013-10-23 02:53:47 +00009399 unsigned Lane = 0;
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009400 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009401 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00009402 unsigned NewDmask = 0;
David Stuttardf77079f2019-01-14 11:55:24 +00009403 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9404 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9405 bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9406 Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
9407 unsigned TFCLane = 0;
Matt Arsenault856777d2017-12-08 20:00:57 +00009408 bool HasChain = Node->getNumValues() > 1;
9409
9410 if (OldDmask == 0) {
9411 // These are folded out, but on the off chance it happens, don't assert.
9412 return Node;
9413 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009414
David Stuttardf77079f2019-01-14 11:55:24 +00009415 unsigned OldBitsSet = countPopulation(OldDmask);
9416 // Work out which is the TFE/LWE lane if that is enabled.
9417 if (UsesTFC) {
9418 TFCLane = OldBitsSet;
9419 }
9420
Christian Konig8e06e2a2013-04-10 08:39:08 +00009421 // Try to figure out the used register components
9422 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9423 I != E; ++I) {
9424
Matt Arsenault93e65ea2017-02-22 21:16:41 +00009425 // Don't look at users of the chain.
9426 if (I.getUse().getResNo() != 0)
9427 continue;
9428
Christian Konig8e06e2a2013-04-10 08:39:08 +00009429 // Abort if we can't understand the usage
9430 if (!I->isMachineOpcode() ||
9431 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
Matt Arsenault68f05052017-12-04 22:18:27 +00009432 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009433
Francis Visoiu Mistrih9d7bb0c2017-11-28 17:15:09 +00009434 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
Tom Stellard54774e52013-10-23 02:53:47 +00009435 // Note that subregs are packed, i.e. Lane==0 is the first bit set
9436 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9437 // set, etc.
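// For example, with OldDmask == 0b1010, Lane == 0 refers to component Y
// (bit 1) and Lane == 1 refers to component W (bit 3).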
Christian Konig8b1ed282013-04-10 08:39:16 +00009438 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00009439
David Stuttardf77079f2019-01-14 11:55:24 +00009440 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9441 if (UsesTFC && Lane == TFCLane) {
9442 Users[Lane] = *I;
9443 } else {
9444 // Set which texture component corresponds to the lane.
9445 unsigned Comp;
9446 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9447 Comp = countTrailingZeros(Dmask);
9448 Dmask &= ~(1 << Comp);
9449 }
9450
9451 // Abort if we have more than one user per component.
9452 if (Users[Lane])
9453 return Node;
9454
9455 Users[Lane] = *I;
9456 NewDmask |= 1 << Comp;
Tom Stellard54774e52013-10-23 02:53:47 +00009457 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009458 }
9459
David Stuttardf77079f2019-01-14 11:55:24 +00009460 // Don't allow 0 dmask, as hardware assumes one channel enabled.
9461 bool NoChannels = !NewDmask;
9462 if (NoChannels) {
David Stuttardfc2a7472019-03-20 09:29:55 +00009463 if (!UsesTFC) {
9464 // No uses of the result and not using TFC. Then do nothing.
9465 return Node;
9466 }
David Stuttardf77079f2019-01-14 11:55:24 +00009467 // If the original dmask has one channel - then nothing to do
9468 if (OldBitsSet == 1)
9469 return Node;
9470 // Use an arbitrary dmask - required for the instruction to work
9471 NewDmask = 1;
9472 }
Tom Stellard54774e52013-10-23 02:53:47 +00009473 // Abort if there's no change
9474 if (NewDmask == OldDmask)
Matt Arsenault68f05052017-12-04 22:18:27 +00009475 return Node;
9476
9477 unsigned BitsSet = countPopulation(NewDmask);
9478
David Stuttardf77079f2019-01-14 11:55:24 +00009479 // Check for TFE or LWE - increase the number of channels by one to account
9480 // for the extra return value
9481 // This will need adjustment for D16 if this is also included in
9482 // adjustWritemask (this function), but at present D16 is excluded.
9483 unsigned NewChannels = BitsSet + UsesTFC;
9484
9485 int NewOpcode =
9486 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
Matt Arsenault68f05052017-12-04 22:18:27 +00009487 assert(NewOpcode != -1 &&
9488 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9489 "failed to find equivalent MIMG op");
Christian Konig8e06e2a2013-04-10 08:39:08 +00009490
9491 // Adjust the writemask in the node
Matt Arsenault68f05052017-12-04 22:18:27 +00009492 SmallVector<SDValue, 12> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009493 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009494 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009495 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Christian Konig8e06e2a2013-04-10 08:39:08 +00009496
Matt Arsenault68f05052017-12-04 22:18:27 +00009497 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9498
David Stuttardf77079f2019-01-14 11:55:24 +00009499 MVT ResultVT = NewChannels == 1 ?
9500 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9501 NewChannels == 5 ? 8 : NewChannels);
Matt Arsenault856777d2017-12-08 20:00:57 +00009502 SDVTList NewVTList = HasChain ?
9503 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9504
Matt Arsenault68f05052017-12-04 22:18:27 +00009505
9506 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9507 NewVTList, Ops);
Matt Arsenaultecad0d532017-12-08 20:00:45 +00009508
Matt Arsenault856777d2017-12-08 20:00:57 +00009509 if (HasChain) {
9510 // Update chain.
Chandler Carruth66654b72018-08-14 23:30:32 +00009511 DAG.setNodeMemRefs(NewNode, Node->memoperands());
Matt Arsenault856777d2017-12-08 20:00:57 +00009512 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9513 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009514
David Stuttardf77079f2019-01-14 11:55:24 +00009515 if (NewChannels == 1) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009516 assert(Node->hasNUsesOfValue(1, 0));
9517 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9518 SDLoc(Node), Users[Lane]->getValueType(0),
9519 SDValue(NewNode, 0));
Christian Konig8b1ed282013-04-10 08:39:16 +00009520 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
Matt Arsenault68f05052017-12-04 22:18:27 +00009521 return nullptr;
Christian Konig8b1ed282013-04-10 08:39:16 +00009522 }
9523
Christian Konig8e06e2a2013-04-10 08:39:08 +00009524 // Update the users of the node with the new indices
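  // Surviving users are rewired, in order, to consecutive sub-registers
  // (sub0, sub1, ...) of the new, packed result.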
David Stuttardf77079f2019-01-14 11:55:24 +00009525 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009526 SDNode *User = Users[i];
David Stuttardf77079f2019-01-14 11:55:24 +00009527 if (!User) {
9528 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9529 // Users[0] is still nullptr because channel 0 doesn't really have a use.
9530 if (i || !NoChannels)
9531 continue;
9532 } else {
9533 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9534 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9535 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009536
9537 switch (Idx) {
9538 default: break;
9539 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9540 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9541 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
David Stuttardf77079f2019-01-14 11:55:24 +00009542 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009543 }
9544 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009545
9546 DAG.RemoveDeadNode(Node);
9547 return nullptr;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009548}
9549
Tom Stellardc98ee202015-07-16 19:40:07 +00009550static bool isFrameIndexOp(SDValue Op) {
9551 if (Op.getOpcode() == ISD::AssertZext)
9552 Op = Op.getOperand(0);
9553
9554 return isa<FrameIndexSDNode>(Op);
9555}
9556
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009557/// Legalize target independent instructions (e.g. INSERT_SUBREG)
Tom Stellard3457a842014-10-09 19:06:00 +00009558/// with frame index operands.
9559/// LLVM assumes that inputs to these instructions are registers.
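/// Frame index operands are materialized with S_MOV_B32 below so that the
/// consuming instruction sees a register.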
Matt Arsenault0d0d6c22017-04-12 21:58:23 +00009560SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9561 SelectionDAG &DAG) const {
9562 if (Node->getOpcode() == ISD::CopyToReg) {
9563 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9564 SDValue SrcVal = Node->getOperand(2);
9565
9566 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9567 // to try understanding copies to physical registers.
9568 if (SrcVal.getValueType() == MVT::i1 &&
9569 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9570 SDLoc SL(Node);
9571 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9572 SDValue VReg = DAG.getRegister(
9573 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9574
9575 SDNode *Glued = Node->getGluedNode();
9576 SDValue ToVReg
9577 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9578 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9579 SDValue ToResultReg
9580 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9581 VReg, ToVReg.getValue(1));
9582 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9583 DAG.RemoveDeadNode(Node);
9584 return ToResultReg.getNode();
9585 }
9586 }
Tom Stellard8dd392e2014-10-09 18:09:15 +00009587
9588 SmallVector<SDValue, 8> Ops;
Tom Stellard3457a842014-10-09 19:06:00 +00009589 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00009590 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00009591 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009592 continue;
9593 }
9594
Tom Stellard3457a842014-10-09 19:06:00 +00009595 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009596 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00009597 Node->getOperand(i).getValueType(),
9598 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009599 }
9600
Mark Searles4e3d6162017-10-16 23:38:53 +00009601 return DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009602}
9603
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009604/// Fold the instructions after selecting them.
Matt Arsenault68f05052017-12-04 22:18:27 +00009605/// Returns null if users were already updated.
Christian Konig8e06e2a2013-04-10 08:39:08 +00009606SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9607 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009608 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009609 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00009610
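  // Image stores have no result to shrink and gather4 always returns four
  // values, so only other MIMG loads/samples get their dmask adjusted.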
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00009611 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009612 !TII->isGather4(Opcode)) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009613 return adjustWritemask(Node, DAG);
9614 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009615
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009616 if (Opcode == AMDGPU::INSERT_SUBREG ||
9617 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00009618 legalizeTargetIndependentNode(Node, DAG);
9619 return Node;
9620 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009621
9622 switch (Opcode) {
9623 case AMDGPU::V_DIV_SCALE_F32:
9624 case AMDGPU::V_DIV_SCALE_F64: {
9625 // Satisfy the operand register constraint when one of the inputs is
9626 // undefined. Ordinarily each undef value will have its own implicit_def of
9627 // a vreg, so force these to use a single register.
9628 SDValue Src0 = Node->getOperand(0);
9629 SDValue Src1 = Node->getOperand(1);
9630 SDValue Src2 = Node->getOperand(2);
9631
9632 if ((Src0.isMachineOpcode() &&
9633 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9634 (Src0 == Src1 || Src0 == Src2))
9635 break;
9636
9637 MVT VT = Src0.getValueType().getSimpleVT();
9638 const TargetRegisterClass *RC = getRegClassFor(VT);
9639
9640 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9641 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9642
9643 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9644 UndefReg, Src0, SDValue());
9645
9646 // src0 must be the same register as src1 or src2, even if the value is
9647 // undefined, so make sure we don't violate this constraint.
9648 if (Src0.isMachineOpcode() &&
9649 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9650 if (Src1.isMachineOpcode() &&
9651 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9652 Src0 = Src1;
9653 else if (Src2.isMachineOpcode() &&
9654 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9655 Src0 = Src2;
9656 else {
9657 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9658 Src0 = UndefReg;
9659 Src1 = UndefReg;
9660 }
9661 } else
9662 break;
9663
9664 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9665 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9666 Ops.push_back(Node->getOperand(I));
9667
9668 Ops.push_back(ImpDef.getValue(1));
9669 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9670 }
9671 default:
9672 break;
9673 }
9674
Tom Stellard654d6692015-01-08 15:08:17 +00009675 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009676}
Christian Konig8b1ed282013-04-10 08:39:16 +00009677
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009678/// Fix up the instruction after instruction selection: legalize VOP3 operand
Christian Konig8b1ed282013-04-10 08:39:16 +00009679/// constraints and turn atomics with an unused result into their no-return forms.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009680void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00009681 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009682 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009683
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009684 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009685
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009686 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009687 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009688 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009689 return;
9690 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00009691
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009692 // Replace unused atomics with the no return version.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009693 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009694 if (NoRetAtomicOp != -1) {
9695 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009696 MI.setDesc(TII->get(NoRetAtomicOp));
9697 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009698 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009699 }
9700
Tom Stellard354a43c2016-04-01 18:27:37 +00009701 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9702 // instruction, because the return type of these instructions is a vec2 of
9703 // the memory type, so it can be tied to the input operand.
9704 // This means these instructions always have a use, so we need to add a
9705 // special case to check if the atomic has only one extract_subreg use,
9706 // which itself has no uses.
9707 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00009708 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00009709 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9710 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009711 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00009712
9713 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009714 MI.setDesc(TII->get(NoRetAtomicOp));
9715 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009716
9717 // If we only remove the def operand from the atomic instruction, the
9718 // extract_subreg will be left with a use of a vreg without a def.
9719 // So we need to insert an implicit_def to avoid machine verifier
9720 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009721 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00009722 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9723 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009724 return;
9725 }
Christian Konig8b1ed282013-04-10 08:39:16 +00009726}
Tom Stellard0518ff82013-06-03 17:39:58 +00009727
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009728static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9729 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009730 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00009731 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9732}
9733
9734MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009735 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00009736 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009737 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00009738
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009739  // Build the 64-bit half that holds the constants before building the
9740  // full 128-bit register. If we are building multiple resource descriptors,
9741  // this will allow CSEing of the 2-component register.
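  // The completed descriptor is {Ptr, 0, getDefaultRsrcDataFormat() >> 32}:
  // the 64-bit pointer in dwords 0-1, zero in dword 2 and the default data
  // format bits in dword 3.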
9742 const SDValue Ops0[] = {
9743 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9744 buildSMovImm32(DAG, DL, 0),
9745 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9746 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9747 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9748 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009749
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009750 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9751 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +00009752
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009753 // Combine the constants and the pointer.
9754 const SDValue Ops1[] = {
9755 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9756 Ptr,
9757 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9758 SubRegHi,
9759 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9760 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009761
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009762 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +00009763}
9764
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009765/// Return a resource descriptor with the 'Add TID' bit enabled
Benjamin Kramerdf005cb2015-08-08 18:27:36 +00009766/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9767/// of the resource descriptor) to create an offset, which is added to
9768/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009769MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9770 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009771 uint64_t RsrcDword2And3) const {
9772 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9773 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9774 if (RsrcDword1) {
9775 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009776 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9777 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009778 }
9779
9780 SDValue DataLo = buildSMovImm32(DAG, DL,
9781 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9782 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9783
9784 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009785 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009786 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009787 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009788 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009789 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009790 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009791 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009792 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009793 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009794 };
9795
9796 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9797}
9798
Tom Stellardd7e6f132015-04-08 01:09:26 +00009799//===----------------------------------------------------------------------===//
9800// SI Inline Assembly Support
9801//===----------------------------------------------------------------------===//
9802
9803std::pair<unsigned, const TargetRegisterClass *>
9804SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Benjamin Kramer9bfb6272015-07-05 19:29:18 +00009805 StringRef Constraint,
Tom Stellardd7e6f132015-04-08 01:09:26 +00009806 MVT VT) const {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009807 const TargetRegisterClass *RC = nullptr;
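  // For example, asm("s_mov_b32 %0, %1" : "=s"(Out) : "s"(In)) selects 32-bit
  // SGPRs for both operands, while a 'v' constraint selects VGPRs instead.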
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009808 if (Constraint.size() == 1) {
9809 switch (Constraint[0]) {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009810 default:
9811 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009812 case 's':
9813 case 'r':
9814 switch (VT.getSizeInBits()) {
9815 default:
9816 return std::make_pair(0U, nullptr);
9817 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009818 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009819 RC = &AMDGPU::SReg_32_XM0RegClass;
9820 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009821 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009822 RC = &AMDGPU::SGPR_64RegClass;
9823 break;
Tim Renouf361b5b22019-03-21 12:01:21 +00009824 case 96:
9825 RC = &AMDGPU::SReg_96RegClass;
9826 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009827 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009828 RC = &AMDGPU::SReg_128RegClass;
9829 break;
Tim Renouf033f99a2019-03-22 10:11:21 +00009830 case 160:
9831 RC = &AMDGPU::SReg_160RegClass;
9832 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009833 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009834 RC = &AMDGPU::SReg_256RegClass;
9835 break;
Matt Arsenaulte0bf7d02017-02-21 19:12:08 +00009836 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009837 RC = &AMDGPU::SReg_512RegClass;
9838 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009839 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009840 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009841 case 'v':
9842 switch (VT.getSizeInBits()) {
9843 default:
9844 return std::make_pair(0U, nullptr);
9845 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009846 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009847 RC = &AMDGPU::VGPR_32RegClass;
9848 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009849 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009850 RC = &AMDGPU::VReg_64RegClass;
9851 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009852 case 96:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009853 RC = &AMDGPU::VReg_96RegClass;
9854 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009855 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009856 RC = &AMDGPU::VReg_128RegClass;
9857 break;
Tim Renouf033f99a2019-03-22 10:11:21 +00009858 case 160:
9859 RC = &AMDGPU::VReg_160RegClass;
9860 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009861 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009862 RC = &AMDGPU::VReg_256RegClass;
9863 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009864 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009865 RC = &AMDGPU::VReg_512RegClass;
9866 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009867 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009868 break;
Tom Stellardd7e6f132015-04-08 01:09:26 +00009869 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009870 // We actually support i128, i16 and f16 as inline parameters
9871 // even if they are not reported as legal
9872 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9873 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9874 return std::make_pair(0U, RC);
Tom Stellardd7e6f132015-04-08 01:09:26 +00009875 }
9876
9877 if (Constraint.size() > 1) {
Tom Stellardd7e6f132015-04-08 01:09:26 +00009878 if (Constraint[1] == 'v') {
9879 RC = &AMDGPU::VGPR_32RegClass;
9880 } else if (Constraint[1] == 's') {
9881 RC = &AMDGPU::SGPR_32RegClass;
9882 }
9883
9884 if (RC) {
Matt Arsenault0b554ed2015-06-23 02:05:55 +00009885 uint32_t Idx;
9886 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9887 if (!Failed && Idx < RC->getNumRegs())
Tom Stellardd7e6f132015-04-08 01:09:26 +00009888 return std::make_pair(RC->getRegister(Idx), RC);
9889 }
9890 }
9891 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9892}
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009893
9894SITargetLowering::ConstraintType
9895SITargetLowering::getConstraintType(StringRef Constraint) const {
9896 if (Constraint.size() == 1) {
9897 switch (Constraint[0]) {
9898 default: break;
9899 case 's':
9900 case 'v':
9901 return C_RegisterClass;
9902 }
9903 }
9904 return TargetLowering::getConstraintType(Constraint);
9905}
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009906
9907// Figure out which registers should be reserved for stack access. Only after
9908// the function is legalized do we know all of the non-spill stack objects or if
9909// calls are present.
9910void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9911 MachineRegisterInfo &MRI = MF.getRegInfo();
9912 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9913 const MachineFrameInfo &MFI = MF.getFrameInfo();
Tom Stellardc5a154d2018-06-28 23:47:12 +00009914 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009915
9916 if (Info->isEntryFunction()) {
9917    // Callable functions use fixed registers for stack access; only entry functions need to reserve them here.
9918 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9919 }
9920
9921 // We have to assume the SP is needed in case there are calls in the function
9922 // during lowering. Calls are only detected after the function is
9923  // lowered. We're about to reserve registers, so don't bother setting up the
9924  // SP if we aren't really going to need it.
9925 bool NeedSP = !Info->isEntryFunction() ||
9926 MFI.hasVarSizedObjects() ||
9927 MFI.hasCalls();
9928
9929 if (NeedSP) {
9930 unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9931 Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9932
9933 assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9934 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9935 Info->getStackPtrOffsetReg()));
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +00009936 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
9937 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009938 }
9939
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +00009940 // We need to worry about replacing the default register with itself in case
9941 // of MIR testcases missing the MFI.
9942 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
9943 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9944
9945 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
9946 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9947
9948 if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
9949 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9950 Info->getScratchWaveOffsetReg());
9951 }
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009952
Stanislav Mekhanoshind4b500c2018-05-31 05:36:04 +00009953 Info->limitOccupancy(MF);
9954
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009955 TargetLoweringBase::finalizeLowering(MF);
9956}
Matt Arsenault45b98182017-11-15 00:45:43 +00009957
9958void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9959 KnownBits &Known,
9960 const APInt &DemandedElts,
9961 const SelectionDAG &DAG,
9962 unsigned Depth) const {
9963 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9964 DAG, Depth);
9965
9966 if (getSubtarget()->enableHugePrivateBuffer())
9967 return;
9968
9969 // Technically it may be possible to have a dispatch with a single workitem
9970 // that uses the full private memory size, but that's not really useful. We
9971 // can't use vaddr in MUBUF instructions if we don't know the address
9972 // calculation won't overflow, so assume the sign bit is never set.
9973 Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9974}
Tom Stellard264c1712018-06-13 15:06:37 +00009975
Stanislav Mekhanoshin93f15c92019-05-03 21:17:29 +00009976unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
9977 const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
9978 const unsigned CacheLineAlign = 6; // log2(64)
9979
9980  // Pre-GFX10 targets do not benefit from loop alignment.
9981 if (!ML || DisableLoopAlignment ||
9982 (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
9983 getSubtarget()->hasInstFwdPrefetchBug())
9984 return PrefAlign;
9985
9986  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
9987  // By default the prefetcher keeps one cache line behind and reads two ahead.
9988  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
9989  // behind and one ahead.
9990  // Therefore we can benefit from aligning loop headers if the loop fits in
9991  // 192 bytes.
9992  // If the loop fits in 64 bytes it always spans no more than two cache lines
9993  // and does not need any alignment.
9994  // Else if the loop is at most 128 bytes we do not need to modify the prefetch.
  // Else if the loop is at most 192 bytes we need two lines behind.
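  // For example, a 100-byte loop is just aligned to a cache line, while a
  // 160-byte loop additionally gets S_INST_PREFETCH instructions inserted into
  // its preheader and exit block.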
9995
9996 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9997 const MachineBasicBlock *Header = ML->getHeader();
9998 if (Header->getAlignment() != PrefAlign)
9999 return Header->getAlignment(); // Already processed.
10000
10001 unsigned LoopSize = 0;
10002 for (const MachineBasicBlock *MBB : ML->blocks()) {
10003    // If an inner loop block is aligned, assume on average half of the
10004    // alignment size is added as nops.
10005 if (MBB != Header)
10006 LoopSize += (1 << MBB->getAlignment()) / 2;
10007
10008 for (const MachineInstr &MI : *MBB) {
10009 LoopSize += TII->getInstSizeInBytes(MI);
10010 if (LoopSize > 192)
10011 return PrefAlign;
10012 }
10013 }
10014
10015 if (LoopSize <= 64)
10016 return PrefAlign;
10017
10018 if (LoopSize <= 128)
10019 return CacheLineAlign;
10020
10021  // If any parent loop is already surrounded by prefetch instructions, do not
10022  // insert new ones for the inner loop; that would reset the parent's settings.
10023 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
10024 if (MachineBasicBlock *Exit = P->getExitBlock()) {
10025 auto I = Exit->getFirstNonDebugInstr();
10026 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
10027 return CacheLineAlign;
10028 }
10029 }
10030
10031 MachineBasicBlock *Pre = ML->getLoopPreheader();
10032 MachineBasicBlock *Exit = ML->getExitBlock();
10033
10034 if (Pre && Exit) {
10035 BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
10036 TII->get(AMDGPU::S_INST_PREFETCH))
10037 .addImm(1); // prefetch 2 lines behind PC
10038
10039 BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
10040 TII->get(AMDGPU::S_INST_PREFETCH))
10041 .addImm(2); // prefetch 1 line behind PC
10042 }
10043
10044 return CacheLineAlign;
10045}
10046
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010047LLVM_ATTRIBUTE_UNUSED
10048static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
10049 assert(N->getOpcode() == ISD::CopyFromReg);
10050 do {
10051 // Follow the chain until we find an INLINEASM node.
10052 N = N->getOperand(0).getNode();
Craig Topper784929d2019-02-08 20:48:56 +000010053 if (N->getOpcode() == ISD::INLINEASM ||
10054 N->getOpcode() == ISD::INLINEASM_BR)
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010055 return true;
10056 } while (N->getOpcode() == ISD::CopyFromReg);
10057 return false;
10058}
10059
Tom Stellard264c1712018-06-13 15:06:37 +000010060bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
Nicolai Haehnle35617ed2018-08-30 14:21:36 +000010061 FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
Tom Stellard264c1712018-06-13 15:06:37 +000010062{
10063 switch (N->getOpcode()) {
Tom Stellard264c1712018-06-13 15:06:37 +000010064 case ISD::CopyFromReg:
10065 {
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010066 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
10067 const MachineFunction * MF = FLI->MF;
10068 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
10069 const MachineRegisterInfo &MRI = MF->getRegInfo();
10070 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
10071 unsigned Reg = R->getReg();
10072 if (TRI.isPhysicalRegister(Reg))
10073 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010074
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010075 if (MRI.isLiveIn(Reg)) {
10076        // Live-in VGPRs such as workitem.id.x/y/z, and any other VGPR formal
10077        // argument, are considered divergent.
10078 if (!TRI.isSGPRReg(MRI, Reg))
10079 return true;
10080 // Formal arguments of non-entry functions
10081 // are conservatively considered divergent
10082 else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
10083 return true;
10084 return false;
Tom Stellard264c1712018-06-13 15:06:37 +000010085 }
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010086 const Value *V = FLI->getValueFromVirtualReg(Reg);
10087 if (V)
10088 return KDA->isDivergent(V);
10089 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
10090 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010091 }
10092 break;
10093 case ISD::LOAD: {
Matt Arsenault813613c2018-09-04 18:58:19 +000010094 const LoadSDNode *L = cast<LoadSDNode>(N);
10095 unsigned AS = L->getAddressSpace();
10096 // A flat load may access private memory.
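      // Loads from other address spaces are not treated as inherent sources of
      // divergence here; any divergence comes from the address operand instead.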
10097 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
Tom Stellard264c1712018-06-13 15:06:37 +000010098 } break;
10099  case ISD::CALLSEQ_END:
10100    return true;
10102  case ISD::INTRINSIC_WO_CHAIN:
10106    return AMDGPU::isIntrinsicSourceOfDivergence(
10107      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
10108 case ISD::INTRINSIC_W_CHAIN:
10109 return AMDGPU::isIntrinsicSourceOfDivergence(
10110 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
10111  // In some cases intrinsics that are a source of divergence have already been
10112  // lowered to AMDGPUISD nodes, so we need to check those as well.
10113 case AMDGPUISD::INTERP_MOV:
10114 case AMDGPUISD::INTERP_P1:
10115 case AMDGPUISD::INTERP_P2:
10116 return true;
10117 }
10118 return false;
10119}
Matt Arsenaultf8768bf2018-08-06 21:38:27 +000010120
10121bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
10122 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
10123 case MVT::f32:
10124 return Subtarget->hasFP32Denormals();
10125 case MVT::f64:
10126 return Subtarget->hasFP64Denormals();
10127 case MVT::f16:
10128 return Subtarget->hasFP16Denormals();
10129 default:
10130 return false;
10131 }
10132}
Matt Arsenault687ec752018-10-22 16:27:27 +000010133
10134bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
10135 const SelectionDAG &DAG,
10136 bool SNaN,
10137 unsigned Depth) const {
10138 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
Matt Arsenault055e4dc2019-03-29 19:14:54 +000010139 const MachineFunction &MF = DAG.getMachineFunction();
10140 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10141
10142 if (Info->getMode().DX10Clamp)
Matt Arsenault687ec752018-10-22 16:27:27 +000010143 return true; // Clamped to 0.
10144 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
10145 }
10146
10147 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
10148 SNaN, Depth);
10149}
Matt Arsenaulta5840c32019-01-22 18:36:06 +000010150
10151TargetLowering::AtomicExpansionKind
10152SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
10153 switch (RMW->getOperation()) {
10154 case AtomicRMWInst::FAdd: {
10155 Type *Ty = RMW->getType();
10156
10157 // We don't have a way to support 16-bit atomics now, so just leave them
10158 // as-is.
10159 if (Ty->isHalfTy())
10160 return AtomicExpansionKind::None;
10161
10162 if (!Ty->isFloatTy())
10163 return AtomicExpansionKind::CmpXChg;
10164
10165 // TODO: Do have these for flat. Older targets also had them for buffers.
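    // Only an LDS f32 FAdd can be kept as a native atomicrmw here; other f32
    // and all f64 cases are expanded to a compare-and-swap loop.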
10166 unsigned AS = RMW->getPointerAddressSpace();
10167 return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
10168 AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
10169 }
10170 default:
10171 break;
10172 }
10173
10174 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
10175}