//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, no operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

#if 0
  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // so let LLVM add the comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);

  if (Subtarget->haveRoundOpsF64()) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case, when denormals are enabled, where this
// would be OK to use, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  // TODO: Consider splitting all arguments into 32-bit pieces.
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size == 64)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size == 64)
      return 2 * NumElts;

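    // 16-bit elements are packed two per 32-bit register, rounding up.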
    if (Size == 16 && Subtarget->has16BitInsts())
      return (VT.getVectorNumElements() + 1) / 2;
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size == 64) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = 2 * NumElts;
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) &&
          Ty->getContainedType(1)->isIntegerTy(32)) &&
         "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate.
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
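  // The struct carries a trailing i32 (asserted above); count it as one
  // 32-bit element (two 16-bit elements) and round the total element count
  // up to a power of two to form the in-memory vector type.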
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
      Info.align = 0;
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT.
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal =
        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = 4;

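    // The GWS operations are modeled as stores of the abstract GWS resource;
    // ds_gws_barrier is instead treated as a load.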
    Info.flags = MachineMemOperand::MOStore;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.

  // GFX10 shrank the signed offset to 12 bits. When using regular flat
  // instructions, the sign bit is also ignored and it is treated as an
  // 11-bit unsigned offset.

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;

  // Just r + i
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
1040 // FIXME: This assumption is currently wrong. On VI we still use
1041 // MUBUF instructions for the r + i addressing mode. As currently
 1042 // implemented, the MUBUF instructions only work for buffers smaller than 4GB.
1043 // It may be possible to support > 4GB buffers with MUBUF instructions,
1044 // by setting the stride value in the resource descriptor which would
1045 // increase the size limit to (stride * 4GB). However, this is risky,
1046 // because it has never been validated.
1047 return isLegalFlatAddressingMode(AM);
1048 }
1049
1050 return isLegalMUBUFAddressingMode(AM);
1051}
1052
Matt Arsenault711b3902015-08-07 20:18:34 +00001053bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1054 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1055 // additionally can do r + r + i with addr64. 32-bit has more addressing
1056 // mode options. Depending on the resource constant, it can also do
1057 // (i64 r0) + (i32 r1) * (i14 i).
1058 //
1059 // Private arrays end up using a scratch buffer most of the time, so also
1060 // assume those use MUBUF instructions. Scratch loads / stores are currently
 1061 // implemented as mubuf instructions with the offen bit set, so they are
 1062 // slightly different from the normal addr64 mode.
1063 if (!isUInt<12>(AM.BaseOffs))
1064 return false;
1065
 1066 // FIXME: Since we can split the immediate into an soffset and an immediate
 1067 // offset, would it make sense to allow any immediate?
1068
1069 switch (AM.Scale) {
1070 case 0: // r + i or just i, depending on HasBaseReg.
1071 return true;
1072 case 1:
1073 return true; // We have r + r or r + i.
1074 case 2:
1075 if (AM.HasBaseReg) {
1076 // Reject 2 * r + r.
1077 return false;
1078 }
1079
1080 // Allow 2 * r as r + r
1081 // Or 2 * r + i is allowed as r + r + i.
1082 return true;
1083 default: // Don't allow n * r
1084 return false;
1085 }
1086}
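// Illustrative examples of the scale handling above (values are examples only):
//   {BaseOffs = 16,   Scale = 0}              -> accepted: plain r + i (or just i).
//   {BaseOffs = 16,   Scale = 1}              -> accepted: r + r (+ i) form.
//   {BaseOffs = 0,    Scale = 2, HasBaseReg}  -> rejected: would require 2 * r + r.
//   {BaseOffs = 0,    Scale = 2, no base reg} -> accepted: 2 * r folds to r + r.
//   {BaseOffs = 5000, any Scale}              -> rejected: offset does not fit in 12 bits.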
1087
Mehdi Amini0cdec1e2015-07-09 02:09:40 +00001088bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1089 const AddrMode &AM, Type *Ty,
Jonas Paulsson024e3192017-07-21 11:59:37 +00001090 unsigned AS, Instruction *I) const {
Matt Arsenault5015a892014-08-15 17:17:07 +00001091 // No global is ever allowed as a base.
1092 if (AM.BaseGV)
1093 return false;
1094
Matt Arsenault0da63502018-08-31 05:49:54 +00001095 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001096 return isLegalGlobalAddressingMode(AM);
Matt Arsenault5015a892014-08-15 17:17:07 +00001097
Matt Arsenault0da63502018-08-31 05:49:54 +00001098 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
Neil Henning523dab02019-03-18 14:44:28 +00001099 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1100 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001101 // If the offset isn't a multiple of 4, it probably isn't going to be
1102 // correctly aligned.
Matt Arsenault3cc1e002016-08-13 01:43:51 +00001103 // FIXME: Can we get the real alignment here?
Matt Arsenault711b3902015-08-07 20:18:34 +00001104 if (AM.BaseOffs % 4 != 0)
1105 return isLegalMUBUFAddressingMode(AM);
1106
1107 // There are no SMRD extloads, so if we have to do a small type access we
1108 // will use a MUBUF load.
1109 // FIXME?: We also need to do this if unaligned, but we don't know the
1110 // alignment here.
Stanislav Mekhanoshin57d341c2018-05-15 22:07:51 +00001111 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001112 return isLegalGlobalAddressingMode(AM);
Matt Arsenault711b3902015-08-07 20:18:34 +00001113
Tom Stellard5bfbae52018-07-11 20:59:01 +00001114 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001115 // SMRD instructions have an 8-bit, dword offset on SI.
1116 if (!isUInt<8>(AM.BaseOffs / 4))
1117 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001118 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001119 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1120 // in 8-bits, it can use a smaller encoding.
1121 if (!isUInt<32>(AM.BaseOffs / 4))
1122 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001123 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001124 // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
1125 if (!isUInt<20>(AM.BaseOffs))
1126 return false;
1127 } else
1128 llvm_unreachable("unhandled generation");
1129
1130 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1131 return true;
1132
1133 if (AM.Scale == 1 && AM.HasBaseReg)
1134 return true;
1135
1136 return false;
Matt Arsenault711b3902015-08-07 20:18:34 +00001137
Matt Arsenault0da63502018-08-31 05:49:54 +00001138 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001139 return isLegalMUBUFAddressingMode(AM);
Matt Arsenault0da63502018-08-31 05:49:54 +00001140 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1141 AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001142 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1143 // field.
1144 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1145 // an 8-bit dword offset but we don't know the alignment here.
1146 if (!isUInt<16>(AM.BaseOffs))
Matt Arsenault5015a892014-08-15 17:17:07 +00001147 return false;
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001148
1149 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1150 return true;
1151
1152 if (AM.Scale == 1 && AM.HasBaseReg)
1153 return true;
1154
Matt Arsenault5015a892014-08-15 17:17:07 +00001155 return false;
Matt Arsenault0da63502018-08-31 05:49:54 +00001156 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1157 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
Matt Arsenault7d1b6c82016-04-29 06:25:10 +00001158 // For an unknown address space, this usually means that this is for some
1159 // reason being used for pure arithmetic, and not based on some addressing
1160 // computation. We don't have instructions that compute pointers with any
1161 // addressing modes, so treat them as having no offset like flat
1162 // instructions.
Tom Stellard70580f82015-07-20 14:28:41 +00001163 return isLegalFlatAddressingMode(AM);
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00001164 } else {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001165 llvm_unreachable("unhandled address space");
1166 }
Matt Arsenault5015a892014-08-15 17:17:07 +00001167}
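// Rough summary of the SMRD immediate ranges checked above for constant-address
// offsets that are a multiple of 4 (derived from the checks themselves):
//   SI:  BaseOffs / 4 must fit in 8 bits,  i.e. BaseOffs up to 1020 bytes.
//   CI:  BaseOffs / 4 must fit in 32 bits, i.e. a full 32-bit literal dword offset.
//   VI+: BaseOffs itself must fit in 20 bits, i.e. just under 1 MiB.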
1168
Nirav Dave4dcad5d2017-07-10 20:25:54 +00001169bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1170 const SelectionDAG &DAG) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001171 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001172 return (MemVT.getSizeInBits() <= 4 * 32);
Matt Arsenault0da63502018-08-31 05:49:54 +00001173 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001174 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1175 return (MemVT.getSizeInBits() <= MaxPrivateBits);
Matt Arsenault0da63502018-08-31 05:49:54 +00001176 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001177 return (MemVT.getSizeInBits() <= 2 * 32);
1178 }
1179 return true;
1180}
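// In other words (illustrative summary): merged accesses are capped at 128 bits
// for global/flat, at 8 * MaxPrivateElementSize bits for private, and at 64 bits
// for LDS; other address spaces place no additional limit here.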
1181
Simon Pilgrim4e0648a2019-06-12 17:14:03 +00001182bool SITargetLowering::allowsMisalignedMemoryAccesses(
1183 EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1184 bool *IsFast) const {
Matt Arsenault1018c892014-04-24 17:08:26 +00001185 if (IsFast)
1186 *IsFast = false;
1187
Matt Arsenault1018c892014-04-24 17:08:26 +00001188 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1189 // which isn't a simple VT.
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001190 // Until MVT is extended to handle this, simply check for the size and
1191 // rely on the condition below: allow accesses if the size is a multiple of 4.
1192 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1193 VT.getStoreSize() > 16)) {
Tom Stellard81d871d2013-11-13 23:36:50 +00001194 return false;
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001195 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001196
Matt Arsenault0da63502018-08-31 05:49:54 +00001197 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1198 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001199 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1200 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1201 // with adjacent offsets.
Sanjay Patelce74db92015-09-03 15:03:19 +00001202 bool AlignedBy4 = (Align % 4 == 0);
1203 if (IsFast)
1204 *IsFast = AlignedBy4;
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001205
Sanjay Patelce74db92015-09-03 15:03:19 +00001206 return AlignedBy4;
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001207 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001208
Tom Stellard64a9d082016-10-14 18:10:39 +00001209 // FIXME: We have to be conservative here and assume that flat operations
1210 // will access scratch. If we had access to the IR function, then we
1211 // could determine if any private memory was used in the function.
1212 if (!Subtarget->hasUnalignedScratchAccess() &&
Matt Arsenault0da63502018-08-31 05:49:54 +00001213 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1214 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
Matt Arsenaultf4320112018-09-24 13:18:15 +00001215 bool AlignedBy4 = Align >= 4;
1216 if (IsFast)
1217 *IsFast = AlignedBy4;
1218
1219 return AlignedBy4;
Tom Stellard64a9d082016-10-14 18:10:39 +00001220 }
1221
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001222 if (Subtarget->hasUnalignedBufferAccess()) {
 1223 // If we have a uniform constant load, it still requires using a slow
1224 // buffer instruction if unaligned.
1225 if (IsFast) {
Matt Arsenault0da63502018-08-31 05:49:54 +00001226 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1227 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001228 (Align % 4 == 0) : true;
1229 }
1230
1231 return true;
1232 }
1233
Tom Stellard33e64c62015-02-04 20:49:52 +00001234 // Values smaller than a dword must be aligned.
Tom Stellard33e64c62015-02-04 20:49:52 +00001235 if (VT.bitsLT(MVT::i32))
1236 return false;
1237
Matt Arsenault1018c892014-04-24 17:08:26 +00001238 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1239 // byte-address are ignored, thus forcing Dword alignment.
Tom Stellarde812f2f2014-07-21 15:45:06 +00001240 // This applies to private, global, and constant memory.
Matt Arsenault1018c892014-04-24 17:08:26 +00001241 if (IsFast)
1242 *IsFast = true;
Tom Stellardc6b299c2015-02-02 18:02:28 +00001243
1244 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
Tom Stellard0125f2a2013-06-25 02:39:35 +00001245}
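// Worked example of the LDS rule above (illustrative only): an 8-byte access in
// the LOCAL address space with 4-byte alignment is allowed and reported as fast,
// because it can be emitted as ds_read2/write2_b32 with adjacent offsets, while
// the same access with 2-byte alignment is rejected.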
1246
Sjoerd Meijer180f1ae2019-04-30 08:38:12 +00001247EVT SITargetLowering::getOptimalMemOpType(
1248 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
1249 bool ZeroMemset, bool MemcpyStrSrc,
1250 const AttributeList &FuncAttributes) const {
Matt Arsenault46645fa2014-07-28 17:49:26 +00001251 // FIXME: Should account for address space here.
1252
1253 // The default fallback uses the private pointer size as a guess for a type to
1254 // use. Make sure we switch these to 64-bit accesses.
1255
1256 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1257 return MVT::v4i32;
1258
1259 if (Size >= 8 && DstAlign >= 4)
1260 return MVT::v2i32;
1261
1262 // Use the default.
1263 return MVT::Other;
1264}
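// Illustrative behaviour of the hints above: with a destination alignment of at
// least 4, copies of 16 or more bytes prefer v4i32 accesses and copies of 8 to
// 15 bytes prefer v2i32; anything smaller, or less aligned, falls back to the
// generic MVT::Other choice.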
1265
Matt Arsenault0da63502018-08-31 05:49:54 +00001266static bool isFlatGlobalAddrSpace(unsigned AS) {
1267 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1268 AS == AMDGPUAS::FLAT_ADDRESS ||
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001269 AS == AMDGPUAS::CONSTANT_ADDRESS ||
1270 AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001271}
1272
1273bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1274 unsigned DestAS) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001275 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001276}
1277
Alexander Timofeev18009562016-12-08 17:28:47 +00001278bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1279 const MemSDNode *MemNode = cast<MemSDNode>(N);
1280 const Value *Ptr = MemNode->getMemOperand()->getValue();
Matt Arsenault0a0c8712018-03-27 18:39:45 +00001281 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
Alexander Timofeev18009562016-12-08 17:28:47 +00001282 return I && I->getMetadata("amdgpu.noclobber");
1283}
1284
Matt Arsenault8dbeb922019-06-03 18:41:34 +00001285bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1286 unsigned DestAS) const {
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001287 // Flat -> private/local is a simple truncate.
 1288 // Flat -> global is a no-op.
Matt Arsenault0da63502018-08-31 05:49:54 +00001289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001290 return true;
1291
1292 return isNoopAddrSpaceCast(SrcAS, DestAS);
1293}
1294
Tom Stellarda6f24c62015-12-15 20:55:55 +00001295bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1296 const MemSDNode *MemNode = cast<MemSDNode>(N);
Tom Stellarda6f24c62015-12-15 20:55:55 +00001297
Matt Arsenaultbcf7bec2018-02-09 16:57:48 +00001298 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
Tom Stellarda6f24c62015-12-15 20:55:55 +00001299}
1300
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001301TargetLoweringBase::LegalizeTypeAction
Craig Topper0b5f8162018-11-05 23:26:13 +00001302SITargetLowering::getPreferredVectorAction(MVT VT) const {
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001303 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1304 return TypeSplitVector;
1305
1306 return TargetLoweringBase::getPreferredVectorAction(VT);
Tom Stellardd86003e2013-08-14 23:25:00 +00001307}
Tom Stellard0125f2a2013-06-25 02:39:35 +00001308
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001309bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1310 Type *Ty) const {
Matt Arsenault749035b2016-07-30 01:40:36 +00001311 // FIXME: Could be smarter if called for vector constants.
1312 return true;
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001313}
1314
Tom Stellard2e045bb2016-01-20 00:13:22 +00001315bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001316 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1317 switch (Op) {
1318 case ISD::LOAD:
1319 case ISD::STORE:
Tom Stellard2e045bb2016-01-20 00:13:22 +00001320
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001321 // These operations are done with 32-bit instructions anyway.
1322 case ISD::AND:
1323 case ISD::OR:
1324 case ISD::XOR:
1325 case ISD::SELECT:
1326 // TODO: Extensions?
1327 return true;
1328 default:
1329 return false;
1330 }
1331 }
Konstantin Zhuravlyove14df4b2016-09-28 20:05:39 +00001332
Tom Stellard2e045bb2016-01-20 00:13:22 +00001333 // SimplifySetCC uses this function to determine whether or not it should
1334 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1335 if (VT == MVT::i1 && Op == ISD::SETCC)
1336 return false;
1337
1338 return TargetLowering::isTypeDesirableForOp(Op, VT);
1339}
1340
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001341SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1342 const SDLoc &SL,
1343 SDValue Chain,
1344 uint64_t Offset) const {
Mehdi Aminia749f2a2015-07-09 02:09:52 +00001345 const DataLayout &DL = DAG.getDataLayout();
Tom Stellardec2e43c2014-09-22 15:35:29 +00001346 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001347 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1348
1349 const ArgDescriptor *InputPtrReg;
1350 const TargetRegisterClass *RC;
1351
1352 std::tie(InputPtrReg, RC)
1353 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Tom Stellard94593ee2013-06-03 17:40:18 +00001354
Matt Arsenault86033ca2014-07-28 17:31:39 +00001355 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
Matt Arsenault0da63502018-08-31 05:49:54 +00001356 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulta0269b62015-06-01 21:58:24 +00001357 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001358 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1359
Matt Arsenault2fb9ccf2018-05-29 17:42:38 +00001360 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
Jan Veselyfea814d2016-06-21 20:46:20 +00001361}
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00001362
Matt Arsenault9166ce82017-07-28 15:52:08 +00001363SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1364 const SDLoc &SL) const {
Matt Arsenault75e71922018-06-28 10:18:55 +00001365 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1366 FIRST_IMPLICIT);
Matt Arsenault9166ce82017-07-28 15:52:08 +00001367 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1368}
1369
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001370SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1371 const SDLoc &SL, SDValue Val,
1372 bool Signed,
Matt Arsenault6dca5422017-01-09 18:52:39 +00001373 const ISD::InputArg *Arg) const {
Tim Renouf361b5b22019-03-21 12:01:21 +00001374 // First, if it is a widened vector, narrow it.
1375 if (VT.isVector() &&
1376 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1377 EVT NarrowedVT =
1378 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1379 VT.getVectorNumElements());
1380 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1381 DAG.getConstant(0, SL, MVT::i32));
1382 }
1383
1384 // Then convert the vector elements or scalar value.
Matt Arsenault6dca5422017-01-09 18:52:39 +00001385 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1386 VT.bitsLT(MemVT)) {
1387 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1388 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1389 }
1390
Tom Stellardbc6c5232016-10-17 16:21:45 +00001391 if (MemVT.isFloatingPoint())
Matt Arsenault6dca5422017-01-09 18:52:39 +00001392 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001393 else if (Signed)
Matt Arsenault6dca5422017-01-09 18:52:39 +00001394 Val = DAG.getSExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001395 else
Matt Arsenault6dca5422017-01-09 18:52:39 +00001396 Val = DAG.getZExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001397
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001398 return Val;
1399}
1400
1401SDValue SITargetLowering::lowerKernargMemParameter(
1402 SelectionDAG &DAG, EVT VT, EVT MemVT,
1403 const SDLoc &SL, SDValue Chain,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001404 uint64_t Offset, unsigned Align, bool Signed,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001405 const ISD::InputArg *Arg) const {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001406 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00001407 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001408 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1409
Matt Arsenault90083d32018-06-07 09:54:49 +00001410 // Try to avoid using an extload by loading earlier than the argument address,
1411 // and extracting the relevant bits. The load should hopefully be merged with
1412 // the previous argument.
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001413 if (MemVT.getStoreSize() < 4 && Align < 4) {
1414 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
Matt Arsenault90083d32018-06-07 09:54:49 +00001415 int64_t AlignDownOffset = alignDown(Offset, 4);
1416 int64_t OffsetDiff = Offset - AlignDownOffset;
1417
1418 EVT IntVT = MemVT.changeTypeToInteger();
1419
1420 // TODO: If we passed in the base kernel offset we could have a better
1421 // alignment than 4, but we don't really need it.
1422 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1423 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1424 MachineMemOperand::MODereferenceable |
1425 MachineMemOperand::MOInvariant);
1426
1427 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1428 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1429
1430 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1431 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1432 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1433
1434
1435 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1436 }
1437
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001438 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1439 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001440 MachineMemOperand::MODereferenceable |
1441 MachineMemOperand::MOInvariant);
1442
1443 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
Matt Arsenault6dca5422017-01-09 18:52:39 +00001444 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
Tom Stellard94593ee2013-06-03 17:40:18 +00001445}
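// Worked example of the extload avoidance above (values illustrative): an i16
// kernel argument at byte offset 2 gives AlignDownOffset = 0 and OffsetDiff = 2,
// so a naturally aligned i32 is loaded at offset 0, shifted right by 16 bits,
// truncated back to i16, and then run through convertArgType as usual.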
1446
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001447SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1448 const SDLoc &SL, SDValue Chain,
1449 const ISD::InputArg &Arg) const {
1450 MachineFunction &MF = DAG.getMachineFunction();
1451 MachineFrameInfo &MFI = MF.getFrameInfo();
1452
1453 if (Arg.Flags.isByVal()) {
1454 unsigned Size = Arg.Flags.getByValSize();
1455 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1456 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1457 }
1458
1459 unsigned ArgOffset = VA.getLocMemOffset();
1460 unsigned ArgSize = VA.getValVT().getStoreSize();
1461
1462 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1463
1464 // Create load nodes to retrieve arguments from the stack.
1465 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1466 SDValue ArgValue;
1467
 1468 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1469 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1470 MVT MemVT = VA.getValVT();
1471
1472 switch (VA.getLocInfo()) {
1473 default:
1474 break;
1475 case CCValAssign::BCvt:
1476 MemVT = VA.getLocVT();
1477 break;
1478 case CCValAssign::SExt:
1479 ExtType = ISD::SEXTLOAD;
1480 break;
1481 case CCValAssign::ZExt:
1482 ExtType = ISD::ZEXTLOAD;
1483 break;
1484 case CCValAssign::AExt:
1485 ExtType = ISD::EXTLOAD;
1486 break;
1487 }
1488
1489 ArgValue = DAG.getExtLoad(
1490 ExtType, SL, VA.getLocVT(), Chain, FIN,
1491 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1492 MemVT);
1493 return ArgValue;
1494}
1495
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001496SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1497 const SIMachineFunctionInfo &MFI,
1498 EVT VT,
1499 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1500 const ArgDescriptor *Reg;
1501 const TargetRegisterClass *RC;
1502
1503 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1504 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1505}
1506
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001507static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1508 CallingConv::ID CallConv,
1509 ArrayRef<ISD::InputArg> Ins,
1510 BitVector &Skipped,
1511 FunctionType *FType,
1512 SIMachineFunctionInfo *Info) {
1513 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001514 const ISD::InputArg *Arg = &Ins[I];
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001515
Matt Arsenault55ab9212018-08-01 19:57:34 +00001516 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1517 "vector type argument should have been split");
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001518
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001519 // First check if it's a PS input addr.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001520 if (CallConv == CallingConv::AMDGPU_PS &&
1521 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001522
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001523 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1524
1525 // Inconveniently only the first part of the split is marked as isSplit,
1526 // so skip to the end. We only want to increment PSInputNum once for the
1527 // entire split argument.
1528 if (Arg->Flags.isSplit()) {
1529 while (!Arg->Flags.isSplitEnd()) {
1530 assert(!Arg->VT.isVector() &&
1531 "unexpected vector split in ps argument type");
1532 if (!SkipArg)
1533 Splits.push_back(*Arg);
1534 Arg = &Ins[++I];
1535 }
1536 }
1537
1538 if (SkipArg) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001539 // We can safely skip PS inputs.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001540 Skipped.set(Arg->getOrigArgIndex());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001541 ++PSInputNum;
1542 continue;
1543 }
1544
1545 Info->markPSInputAllocated(PSInputNum);
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001546 if (Arg->Used)
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001547 Info->markPSInputEnabled(PSInputNum);
1548
1549 ++PSInputNum;
1550 }
1551
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001552 Splits.push_back(*Arg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001553 }
1554}
1555
1556// Allocate special inputs passed in VGPRs.
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001557static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1558 MachineFunction &MF,
1559 const SIRegisterInfo &TRI,
1560 SIMachineFunctionInfo &Info) {
1561 if (Info.hasWorkItemIDX()) {
1562 unsigned Reg = AMDGPU::VGPR0;
1563 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001564
1565 CCInfo.AllocateReg(Reg);
1566 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1567 }
1568
1569 if (Info.hasWorkItemIDY()) {
1570 unsigned Reg = AMDGPU::VGPR1;
1571 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1572
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001573 CCInfo.AllocateReg(Reg);
1574 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1575 }
1576
1577 if (Info.hasWorkItemIDZ()) {
1578 unsigned Reg = AMDGPU::VGPR2;
1579 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1580
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001581 CCInfo.AllocateReg(Reg);
1582 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1583 }
1584}
1585
1586// Try to allocate a VGPR at the end of the argument list, or if no argument
 1587// VGPRs are left, allocate a stack slot.
1588static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1589 ArrayRef<MCPhysReg> ArgVGPRs
1590 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1591 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1592 if (RegIdx == ArgVGPRs.size()) {
1593 // Spill to stack required.
1594 int64_t Offset = CCInfo.AllocateStack(4, 4);
1595
1596 return ArgDescriptor::createStack(Offset);
1597 }
1598
1599 unsigned Reg = ArgVGPRs[RegIdx];
1600 Reg = CCInfo.AllocateReg(Reg);
1601 assert(Reg != AMDGPU::NoRegister);
1602
1603 MachineFunction &MF = CCInfo.getMachineFunction();
1604 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1605 return ArgDescriptor::createRegister(Reg);
1606}
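// Illustrative note: once all 32 argument VGPRs are taken, the next special
// input (e.g. a work-item ID passed to a callee) is assigned a 4-byte,
// 4-byte-aligned stack slot instead of a register.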
1607
1608static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1609 const TargetRegisterClass *RC,
1610 unsigned NumArgRegs) {
1611 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1612 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1613 if (RegIdx == ArgSGPRs.size())
1614 report_fatal_error("ran out of SGPRs for arguments");
1615
1616 unsigned Reg = ArgSGPRs[RegIdx];
1617 Reg = CCInfo.AllocateReg(Reg);
1618 assert(Reg != AMDGPU::NoRegister);
1619
1620 MachineFunction &MF = CCInfo.getMachineFunction();
1621 MF.addLiveIn(Reg, RC);
1622 return ArgDescriptor::createRegister(Reg);
1623}
1624
1625static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1626 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1627}
1628
1629static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1630 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1631}
1632
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001633static void allocateSpecialInputVGPRs(CCState &CCInfo,
1634 MachineFunction &MF,
1635 const SIRegisterInfo &TRI,
1636 SIMachineFunctionInfo &Info) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001637 if (Info.hasWorkItemIDX())
1638 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001639
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001640 if (Info.hasWorkItemIDY())
1641 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001642
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001643 if (Info.hasWorkItemIDZ())
1644 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1645}
1646
1647static void allocateSpecialInputSGPRs(CCState &CCInfo,
1648 MachineFunction &MF,
1649 const SIRegisterInfo &TRI,
1650 SIMachineFunctionInfo &Info) {
1651 auto &ArgInfo = Info.getArgInfo();
1652
1653 // TODO: Unify handling with private memory pointers.
1654
1655 if (Info.hasDispatchPtr())
1656 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1657
1658 if (Info.hasQueuePtr())
1659 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1660
1661 if (Info.hasKernargSegmentPtr())
1662 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1663
1664 if (Info.hasDispatchID())
1665 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1666
1667 // flat_scratch_init is not applicable for non-kernel functions.
1668
1669 if (Info.hasWorkGroupIDX())
1670 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1671
1672 if (Info.hasWorkGroupIDY())
1673 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1674
1675 if (Info.hasWorkGroupIDZ())
1676 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
Matt Arsenault817c2532017-08-03 23:12:44 +00001677
1678 if (Info.hasImplicitArgPtr())
1679 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001680}
1681
1682// Allocate special inputs passed in user SGPRs.
1683static void allocateHSAUserSGPRs(CCState &CCInfo,
1684 MachineFunction &MF,
1685 const SIRegisterInfo &TRI,
1686 SIMachineFunctionInfo &Info) {
Matt Arsenault10fc0622017-06-26 03:01:31 +00001687 if (Info.hasImplicitBufferPtr()) {
1688 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1689 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1690 CCInfo.AllocateReg(ImplicitBufferPtrReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001691 }
1692
1693 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1694 if (Info.hasPrivateSegmentBuffer()) {
1695 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1696 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1697 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1698 }
1699
1700 if (Info.hasDispatchPtr()) {
1701 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1702 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1703 CCInfo.AllocateReg(DispatchPtrReg);
1704 }
1705
1706 if (Info.hasQueuePtr()) {
1707 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1708 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1709 CCInfo.AllocateReg(QueuePtrReg);
1710 }
1711
1712 if (Info.hasKernargSegmentPtr()) {
1713 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1714 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1715 CCInfo.AllocateReg(InputPtrReg);
1716 }
1717
1718 if (Info.hasDispatchID()) {
1719 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1720 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1721 CCInfo.AllocateReg(DispatchIDReg);
1722 }
1723
1724 if (Info.hasFlatScratchInit()) {
1725 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1726 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1727 CCInfo.AllocateReg(FlatScratchInitReg);
1728 }
1729
1730 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1731 // these from the dispatch pointer.
1732}
1733
1734// Allocate special input registers that are initialized per-wave.
1735static void allocateSystemSGPRs(CCState &CCInfo,
1736 MachineFunction &MF,
1737 SIMachineFunctionInfo &Info,
Marek Olsak584d2c02017-05-04 22:25:20 +00001738 CallingConv::ID CallConv,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001739 bool IsShader) {
1740 if (Info.hasWorkGroupIDX()) {
1741 unsigned Reg = Info.addWorkGroupIDX();
1742 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1743 CCInfo.AllocateReg(Reg);
1744 }
1745
1746 if (Info.hasWorkGroupIDY()) {
1747 unsigned Reg = Info.addWorkGroupIDY();
1748 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1749 CCInfo.AllocateReg(Reg);
1750 }
1751
1752 if (Info.hasWorkGroupIDZ()) {
1753 unsigned Reg = Info.addWorkGroupIDZ();
1754 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1755 CCInfo.AllocateReg(Reg);
1756 }
1757
1758 if (Info.hasWorkGroupInfo()) {
1759 unsigned Reg = Info.addWorkGroupInfo();
1760 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1761 CCInfo.AllocateReg(Reg);
1762 }
1763
1764 if (Info.hasPrivateSegmentWaveByteOffset()) {
1765 // Scratch wave offset passed in system SGPR.
1766 unsigned PrivateSegmentWaveByteOffsetReg;
1767
1768 if (IsShader) {
Marek Olsak584d2c02017-05-04 22:25:20 +00001769 PrivateSegmentWaveByteOffsetReg =
1770 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1771
1772 // This is true if the scratch wave byte offset doesn't have a fixed
1773 // location.
1774 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1775 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1776 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1777 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001778 } else
1779 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1780
1781 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1782 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1783 }
1784}
1785
1786static void reservePrivateMemoryRegs(const TargetMachine &TM,
1787 MachineFunction &MF,
1788 const SIRegisterInfo &TRI,
Matt Arsenault1cc47f82017-07-18 16:44:56 +00001789 SIMachineFunctionInfo &Info) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001790 // Now that we've figured out where the scratch register inputs are, see if
 1791 // we should reserve the arguments and use them directly.
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001792 MachineFrameInfo &MFI = MF.getFrameInfo();
1793 bool HasStackObjects = MFI.hasStackObjects();
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001794 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001795
1796 // Record that we know we have non-spill stack objects so we don't need to
1797 // check all stack objects later.
1798 if (HasStackObjects)
1799 Info.setHasNonSpillStackObjects(true);
1800
1801 // Everything live out of a block is spilled with fast regalloc, so it's
1802 // almost certain that spilling will be required.
1803 if (TM.getOptLevel() == CodeGenOpt::None)
1804 HasStackObjects = true;
1805
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001806 // For now assume stack access is needed in any callee functions, so we need
 1807 // to pass in the scratch registers.
1808 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1809
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001810 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
1811 // If we have stack objects, we unquestionably need the private buffer
1812 // resource. For the Code Object V2 ABI, this will be the first 4 user
1813 // SGPR inputs. We can reserve those and use them directly.
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001814
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001815 unsigned PrivateSegmentBufferReg =
1816 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1817 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001818 } else {
1819 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001820 // We tentatively reserve the last registers (skipping the last registers
1821 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
1822 // we'll replace these with the ones immediately after those which were
 1823 // really allocated. In the prologue, copies will be inserted from the
 1824 // arguments to these reserved registers.
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001825
1826 // Without HSA, relocations are used for the scratch pointer and the
1827 // buffer resource setup is always inserted in the prologue. Scratch wave
1828 // offset is still in an input SGPR.
1829 Info.setScratchRSrcReg(ReservedBufferReg);
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001830 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001831
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001832 // This should be accurate for kernels even before the frame is finalized.
1833 const bool HasFP = ST.getFrameLowering()->hasFP(MF);
1834 if (HasFP) {
1835 unsigned ReservedOffsetReg =
1836 TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1837 MachineRegisterInfo &MRI = MF.getRegInfo();
1838
1839 // Try to use s32 as the SP, but move it if it would interfere with input
1840 // arguments. This won't work with calls though.
1841 //
1842 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
1843 // registers.
1844 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
1845 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001846 } else {
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001847 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
1848
1849 if (MFI.hasCalls())
1850 report_fatal_error("call in graphics shader with too many input SGPRs");
1851
1852 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
1853 if (!MRI.isLiveIn(Reg)) {
1854 Info.setStackPtrOffsetReg(Reg);
1855 break;
1856 }
1857 }
1858
1859 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
1860 report_fatal_error("failed to find register for SP");
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001861 }
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001862
1863 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1864 Info.setFrameOffsetReg(ReservedOffsetReg);
1865 } else if (RequiresStackAccess) {
1866 assert(!MFI.hasCalls());
1867 // We know there are accesses and they will be done relative to SP, so just
1868 // pin it to the input.
1869 //
1870 // FIXME: Should not do this if inline asm is reading/writing these
1871 // registers.
1872 unsigned PreloadedSP = Info.getPreloadedReg(
1873 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1874
1875 Info.setStackPtrOffsetReg(PreloadedSP);
1876 Info.setScratchWaveOffsetReg(PreloadedSP);
1877 Info.setFrameOffsetReg(PreloadedSP);
1878 } else {
1879 assert(!MFI.hasCalls());
1880
1881 // There may not be stack access at all. There may still be spills, or
 1882 // access of a constant pointer (in which case an extra copy will be
1883 // emitted in the prolog).
1884 unsigned ReservedOffsetReg
1885 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1886 Info.setStackPtrOffsetReg(ReservedOffsetReg);
1887 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1888 Info.setFrameOffsetReg(ReservedOffsetReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001889 }
1890}
1891
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001892bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1893 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1894 return !Info->isEntryFunction();
1895}
1896
1897void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1898
1899}
1900
1901void SITargetLowering::insertCopiesSplitCSR(
1902 MachineBasicBlock *Entry,
1903 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1904 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1905
1906 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1907 if (!IStart)
1908 return;
1909
1910 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1911 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1912 MachineBasicBlock::iterator MBBI = Entry->begin();
1913 for (const MCPhysReg *I = IStart; *I; ++I) {
1914 const TargetRegisterClass *RC = nullptr;
1915 if (AMDGPU::SReg_64RegClass.contains(*I))
1916 RC = &AMDGPU::SGPR_64RegClass;
1917 else if (AMDGPU::SReg_32RegClass.contains(*I))
1918 RC = &AMDGPU::SGPR_32RegClass;
1919 else
1920 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1921
1922 unsigned NewVR = MRI->createVirtualRegister(RC);
1923 // Create copy from CSR to a virtual register.
1924 Entry->addLiveIn(*I);
1925 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1926 .addReg(*I);
1927
1928 // Insert the copy-back instructions right before the terminator.
1929 for (auto *Exit : Exits)
1930 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1931 TII->get(TargetOpcode::COPY), *I)
1932 .addReg(NewVR);
1933 }
1934}
1935
Christian Konig2c8f6d52013-03-07 09:03:52 +00001936SDValue SITargetLowering::LowerFormalArguments(
Eric Christopher7792e322015-01-30 23:24:40 +00001937 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001938 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1939 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001940 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001941
1942 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaultceafc552018-05-29 17:42:50 +00001943 const Function &Fn = MF.getFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00001944 FunctionType *FType = MF.getFunction().getFunctionType();
Christian Konig99ee0f42013-03-07 09:04:14 +00001945 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001946
Nicolai Haehnledf3a20c2016-04-06 19:40:20 +00001947 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001948 DiagnosticInfoUnsupported NoGraphicsHSA(
Matthias Braunf1caa282017-12-15 22:22:58 +00001949 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
Matt Arsenaultd48da142015-11-02 23:23:02 +00001950 DAG.getContext()->diagnose(NoGraphicsHSA);
Diana Picus81bc3172016-05-26 15:24:55 +00001951 return DAG.getEntryNode();
Matt Arsenaultd48da142015-11-02 23:23:02 +00001952 }
1953
Christian Konig2c8f6d52013-03-07 09:03:52 +00001954 SmallVector<ISD::InputArg, 16> Splits;
Christian Konig2c8f6d52013-03-07 09:03:52 +00001955 SmallVector<CCValAssign, 16> ArgLocs;
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001956 BitVector Skipped(Ins.size());
Eric Christopherb5217502014-08-06 18:45:26 +00001957 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1958 *DAG.getContext());
Christian Konig2c8f6d52013-03-07 09:03:52 +00001959
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001960 bool IsShader = AMDGPU::isShader(CallConv);
Matt Arsenaultefa9f4b2017-04-11 22:29:28 +00001961 bool IsKernel = AMDGPU::isKernel(CallConv);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001962 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
Christian Konig99ee0f42013-03-07 09:04:14 +00001963
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001964 if (IsShader) {
1965 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1966
1967 // At least one interpolation mode must be enabled or else the GPU will
1968 // hang.
1969 //
1970 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1971 // set PSInputAddr, the user wants to enable some bits after the compilation
1972 // based on run-time states. Since we can't know what the final PSInputEna
1973 // will look like, so we shouldn't do anything here and the user should take
1974 // responsibility for the correct programming.
1975 //
1976 // Otherwise, the following restrictions apply:
1977 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1978 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1979 // enabled too.
Tim Renoufc8ffffe2017-10-12 16:16:41 +00001980 if (CallConv == CallingConv::AMDGPU_PS) {
1981 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1982 ((Info->getPSInputAddr() & 0xF) == 0 &&
1983 Info->isPSInputAllocated(11))) {
1984 CCInfo.AllocateReg(AMDGPU::VGPR0);
1985 CCInfo.AllocateReg(AMDGPU::VGPR1);
1986 Info->markPSInputAllocated(0);
1987 Info->markPSInputEnabled(0);
1988 }
1989 if (Subtarget->isAmdPalOS()) {
1990 // For isAmdPalOS, the user does not enable some bits after compilation
1991 // based on run-time states; the register values being generated here are
1992 // the final ones set in hardware. Therefore we need to apply the
1993 // workaround to PSInputAddr and PSInputEnable together. (The case where
1994 // a bit is set in PSInputAddr but not PSInputEnable is where the
1995 // frontend set up an input arg for a particular interpolation mode, but
1996 // nothing uses that input arg. Really we should have an earlier pass
1997 // that removes such an arg.)
1998 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1999 if ((PsInputBits & 0x7F) == 0 ||
2000 ((PsInputBits & 0xF) == 0 &&
2001 (PsInputBits >> 11 & 1)))
2002 Info->markPSInputEnabled(
2003 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2004 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002005 }
2006
Tom Stellard2f3f9852017-01-25 01:25:13 +00002007 assert(!Info->hasDispatchPtr() &&
Tom Stellardf110f8f2016-04-14 16:27:03 +00002008 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
2009 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2010 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2011 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2012 !Info->hasWorkItemIDZ());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002013 } else if (IsKernel) {
2014 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002015 } else {
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002016 Splits.append(Ins.begin(), Ins.end());
Tom Stellardaf775432013-10-23 00:44:32 +00002017 }
2018
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002019 if (IsEntryFunc) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002020 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002021 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
Tom Stellard2f3f9852017-01-25 01:25:13 +00002022 }
2023
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002024 if (IsKernel) {
Tom Stellardbbeb45a2016-09-16 21:53:00 +00002025 analyzeFormalArgumentsCompute(CCInfo, Ins);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002026 } else {
2027 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2028 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2029 }
Christian Konig2c8f6d52013-03-07 09:03:52 +00002030
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002031 SmallVector<SDValue, 16> Chains;
2032
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002033 // FIXME: This is the minimum kernel argument alignment. We should improve
2034 // this to the maximum alignment of the arguments.
2035 //
 2036 // FIXME: Alignment of explicit arguments is totally broken with a non-0
 2037 // explicit kern arg offset.
2038 const unsigned KernelArgBaseAlign = 16;
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002039
2040 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002041 const ISD::InputArg &Arg = Ins[i];
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00002042 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002043 InVals.push_back(DAG.getUNDEF(Arg.VT));
Christian Konig99ee0f42013-03-07 09:04:14 +00002044 continue;
2045 }
2046
Christian Konig2c8f6d52013-03-07 09:03:52 +00002047 CCValAssign &VA = ArgLocs[ArgIdx++];
Craig Topper7f416c82014-11-16 21:17:18 +00002048 MVT VT = VA.getLocVT();
Tom Stellarded882c22013-06-03 17:40:11 +00002049
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002050 if (IsEntryFunc && VA.isMemLoc()) {
Tom Stellardaf775432013-10-23 00:44:32 +00002051 VT = Ins[i].VT;
Tom Stellardbbeb45a2016-09-16 21:53:00 +00002052 EVT MemVT = VA.getLocVT();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002053
Matt Arsenault4bec7d42018-07-20 09:05:08 +00002054 const uint64_t Offset = VA.getLocMemOffset();
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002055 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002056
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002057 SDValue Arg = lowerKernargMemParameter(
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002058 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002059 Chains.push_back(Arg.getValue(1));
Tom Stellardca7ecf32014-08-22 18:49:31 +00002060
Craig Toppere3dcce92015-08-01 22:20:21 +00002061 auto *ParamTy =
Andrew Trick05938a52015-02-16 18:10:47 +00002062 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
Tom Stellard5bfbae52018-07-11 20:59:01 +00002063 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenaultcdd191d2019-01-28 20:14:49 +00002064 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2065 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
Tom Stellardca7ecf32014-08-22 18:49:31 +00002066 // On SI local pointers are just offsets into LDS, so they are always
 2067 // less than 16 bits. On CI and newer they could potentially be
2068 // real pointers, so we can't guarantee their size.
2069 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2070 DAG.getValueType(MVT::i16));
2071 }
2072
Tom Stellarded882c22013-06-03 17:40:11 +00002073 InVals.push_back(Arg);
2074 continue;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002075 } else if (!IsEntryFunc && VA.isMemLoc()) {
2076 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2077 InVals.push_back(Val);
2078 if (!Arg.Flags.isByVal())
2079 Chains.push_back(Val.getValue(1));
2080 continue;
Tom Stellarded882c22013-06-03 17:40:11 +00002081 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002082
Christian Konig2c8f6d52013-03-07 09:03:52 +00002083 assert(VA.isRegLoc() && "Parameter must be in a register!");
2084
2085 unsigned Reg = VA.getLocReg();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002086 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
Matt Arsenaultb3463552017-07-15 05:52:59 +00002087 EVT ValVT = VA.getValVT();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002088
2089 Reg = MF.addLiveIn(Reg, RC);
2090 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2091
Matt Arsenault5c714cb2019-05-23 19:38:14 +00002092 if (Arg.Flags.isSRet()) {
Matt Arsenault45b98182017-11-15 00:45:43 +00002093 // The return object should be reasonably addressable.
2094
 2095 // FIXME: This helps when the return is a real sret. If it is an
2096 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2097 // extra copy is inserted in SelectionDAGBuilder which obscures this.
Matt Arsenault5c714cb2019-05-23 19:38:14 +00002098 unsigned NumBits
2099 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Matt Arsenault45b98182017-11-15 00:45:43 +00002100 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2101 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2102 }
2103
Matt Arsenaultb3463552017-07-15 05:52:59 +00002104 // If this is an 8 or 16-bit value, it is really passed promoted
2105 // to 32 bits. Insert an assert[sz]ext to capture this, then
2106 // truncate to the right size.
2107 switch (VA.getLocInfo()) {
2108 case CCValAssign::Full:
2109 break;
2110 case CCValAssign::BCvt:
2111 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2112 break;
2113 case CCValAssign::SExt:
2114 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2115 DAG.getValueType(ValVT));
2116 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2117 break;
2118 case CCValAssign::ZExt:
2119 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2120 DAG.getValueType(ValVT));
2121 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2122 break;
2123 case CCValAssign::AExt:
2124 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2125 break;
2126 default:
2127 llvm_unreachable("Unknown loc info!");
2128 }
2129
Christian Konig2c8f6d52013-03-07 09:03:52 +00002130 InVals.push_back(Val);
2131 }
Tom Stellarde99fb652015-01-20 19:33:04 +00002132
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002133 if (!IsEntryFunc) {
2134 // Special inputs come after user arguments.
2135 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2136 }
2137
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002138 // Start adding system SGPRs.
2139 if (IsEntryFunc) {
2140 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002141 } else {
2142 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2143 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2144 CCInfo.AllocateReg(Info->getFrameOffsetReg());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002145 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002146 }
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002147
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002148 auto &ArgUsageInfo =
2149 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
Matt Arsenaultceafc552018-05-29 17:42:50 +00002150 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002151
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002152 unsigned StackArgSize = CCInfo.getNextStackOffset();
2153 Info->setBytesInStackArgArea(StackArgSize);
2154
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002155 return Chains.empty() ? Chain :
2156 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Christian Konig2c8f6d52013-03-07 09:03:52 +00002157}
2158
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002159// TODO: If return values can't fit in registers, we should return as many as
2160// possible in registers before passing on stack.
2161bool SITargetLowering::CanLowerReturn(
2162 CallingConv::ID CallConv,
2163 MachineFunction &MF, bool IsVarArg,
2164 const SmallVectorImpl<ISD::OutputArg> &Outs,
2165 LLVMContext &Context) const {
2166 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2167 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2168 // for shaders. Vector types should be explicitly handled by CC.
2169 if (AMDGPU::isEntryFunctionCC(CallConv))
2170 return true;
2171
2172 SmallVector<CCValAssign, 16> RVLocs;
2173 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2174 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2175}
2176
Benjamin Kramerbdc49562016-06-12 15:39:02 +00002177SDValue
2178SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2179 bool isVarArg,
2180 const SmallVectorImpl<ISD::OutputArg> &Outs,
2181 const SmallVectorImpl<SDValue> &OutVals,
2182 const SDLoc &DL, SelectionDAG &DAG) const {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002183 MachineFunction &MF = DAG.getMachineFunction();
2184 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2185
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002186 if (AMDGPU::isKernel(CallConv)) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002187 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2188 OutVals, DL, DAG);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002189 }
2190
2191 bool IsShader = AMDGPU::isShader(CallConv);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002192
Matt Arsenault55ab9212018-08-01 19:57:34 +00002193 Info->setIfReturnsVoid(Outs.empty());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002194 bool IsWaveEnd = Info->returnsVoid() && IsShader;
Marek Olsak8e9cc632016-01-13 17:23:09 +00002195
Marek Olsak8a0f3352016-01-13 17:23:04 +00002196 // CCValAssign - represent the assignment of the return value to a location.
2197 SmallVector<CCValAssign, 48> RVLocs;
Matt Arsenault55ab9212018-08-01 19:57:34 +00002198 SmallVector<ISD::OutputArg, 48> Splits;
Marek Olsak8a0f3352016-01-13 17:23:04 +00002199
2200 // CCState - Info about the registers and stack slots.
2201 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2202 *DAG.getContext());
2203
2204 // Analyze outgoing return values.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002205 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
Marek Olsak8a0f3352016-01-13 17:23:04 +00002206
2207 SDValue Flag;
2208 SmallVector<SDValue, 48> RetOps;
2209 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2210
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002211 // Add return address for callable functions.
2212 if (!Info->isEntryFunction()) {
2213 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2214 SDValue ReturnAddrReg = CreateLiveInRegister(
2215 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2216
2217 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2218 // from being allocated to a CSR.
2219
2220 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2221 MVT::i64);
2222
2223 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2224 Flag = Chain.getValue(1);
2225
2226 RetOps.push_back(PhysReturnAddrReg);
2227 }
2228
Marek Olsak8a0f3352016-01-13 17:23:04 +00002229 // Copy the result values into the output registers.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002230 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2231 ++I, ++RealRVLocIdx) {
2232 CCValAssign &VA = RVLocs[I];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002233 assert(VA.isRegLoc() && "Can only return in registers!");
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002234 // TODO: Partially return in registers if return values don't fit.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002235 SDValue Arg = OutVals[RealRVLocIdx];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002236
2237 // Copied from other backends.
2238 switch (VA.getLocInfo()) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002239 case CCValAssign::Full:
2240 break;
2241 case CCValAssign::BCvt:
2242 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2243 break;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002244 case CCValAssign::SExt:
2245 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2246 break;
2247 case CCValAssign::ZExt:
2248 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2249 break;
2250 case CCValAssign::AExt:
2251 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2252 break;
2253 default:
2254 llvm_unreachable("Unknown loc info!");
Marek Olsak8a0f3352016-01-13 17:23:04 +00002255 }
2256
2257 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2258 Flag = Chain.getValue(1);
2259 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2260 }
2261
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002262 // FIXME: Does sret work properly?
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002263 if (!Info->isEntryFunction()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00002264 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002265 const MCPhysReg *I =
2266 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2267 if (I) {
2268 for (; *I; ++I) {
2269 if (AMDGPU::SReg_64RegClass.contains(*I))
2270 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2271 else if (AMDGPU::SReg_32RegClass.contains(*I))
2272 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2273 else
2274 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2275 }
2276 }
2277 }
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002278
Marek Olsak8a0f3352016-01-13 17:23:04 +00002279 // Update chain and glue.
2280 RetOps[0] = Chain;
2281 if (Flag.getNode())
2282 RetOps.push_back(Flag);
2283
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002284 unsigned Opc = AMDGPUISD::ENDPGM;
2285 if (!IsWaveEnd)
2286 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
Matt Arsenault9babdf42016-06-22 20:15:28 +00002287 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002288}
2289
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002290SDValue SITargetLowering::LowerCallResult(
2291 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2292 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2293 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2294 SDValue ThisVal) const {
2295 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2296
2297 // Assign locations to each value returned by this call.
2298 SmallVector<CCValAssign, 16> RVLocs;
2299 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2300 *DAG.getContext());
2301 CCInfo.AnalyzeCallResult(Ins, RetCC);
2302
2303 // Copy all of the result registers out of their specified physreg.
2304 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2305 CCValAssign VA = RVLocs[i];
2306 SDValue Val;
2307
2308 if (VA.isRegLoc()) {
2309 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2310 Chain = Val.getValue(1);
2311 InFlag = Val.getValue(2);
2312 } else if (VA.isMemLoc()) {
2313 report_fatal_error("TODO: return values in memory");
2314 } else
2315 llvm_unreachable("unknown argument location type");
2316
2317 switch (VA.getLocInfo()) {
2318 case CCValAssign::Full:
2319 break;
2320 case CCValAssign::BCvt:
2321 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2322 break;
2323 case CCValAssign::ZExt:
2324 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2325 DAG.getValueType(VA.getValVT()));
2326 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2327 break;
2328 case CCValAssign::SExt:
2329 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2330 DAG.getValueType(VA.getValVT()));
2331 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2332 break;
2333 case CCValAssign::AExt:
2334 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2335 break;
2336 default:
2337 llvm_unreachable("Unknown loc info!");
2338 }
2339
2340 InVals.push_back(Val);
2341 }
2342
2343 return Chain;
2344}
2345
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002346// Add code to pass the special inputs required by the features the callee uses,
2347// separate from the explicit user arguments present in the IR.
2348void SITargetLowering::passSpecialInputs(
2349 CallLoweringInfo &CLI,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002350 CCState &CCInfo,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002351 const SIMachineFunctionInfo &Info,
2352 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2353 SmallVectorImpl<SDValue> &MemOpChains,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002354 SDValue Chain) const {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002355 // If we don't have a call site, this was a call inserted by
2356 // legalization. These can never use special inputs.
2357 if (!CLI.CS)
2358 return;
2359
2360 const Function *CalleeFunc = CLI.CS.getCalledFunction();
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002361 assert(CalleeFunc);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002362
2363 SelectionDAG &DAG = CLI.DAG;
2364 const SDLoc &DL = CLI.DL;
2365
Tom Stellardc5a154d2018-06-28 23:47:12 +00002366 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002367
2368 auto &ArgUsageInfo =
2369 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2370 const AMDGPUFunctionArgInfo &CalleeArgInfo
2371 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2372
2373 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2374
2375 // TODO: Unify with private memory register handling. This is complicated by
2376 // the fact that at least in kernels, the input argument is not necessarily
2377 // in the same location as the input.
2378 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2379 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2380 AMDGPUFunctionArgInfo::QUEUE_PTR,
2381 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2382 AMDGPUFunctionArgInfo::DISPATCH_ID,
2383 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2384 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2385 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2386 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2387 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
Matt Arsenault817c2532017-08-03 23:12:44 +00002388 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2389 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002390 };
2391
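  // For each special input the callee needs, forward the caller's value in the
  // callee's expected register, or store it to the stack slot allocated for it
  // if the callee expects it on the stack.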
2392 for (auto InputID : InputRegs) {
2393 const ArgDescriptor *OutgoingArg;
2394 const TargetRegisterClass *ArgRC;
2395
2396 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2397 if (!OutgoingArg)
2398 continue;
2399
2400 const ArgDescriptor *IncomingArg;
2401 const TargetRegisterClass *IncomingArgRC;
2402 std::tie(IncomingArg, IncomingArgRC)
2403 = CallerArgInfo.getPreloadedValue(InputID);
2404 assert(IncomingArgRC == ArgRC);
2405
2406 // All special arguments are ints for now.
2407 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
Matt Arsenault817c2532017-08-03 23:12:44 +00002408 SDValue InputReg;
2409
2410 if (IncomingArg) {
2411 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2412 } else {
2413 // The implicit arg ptr is special because it doesn't have a corresponding
2414 // input for kernels, and is computed from the kernarg segment pointer.
2415 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2416 InputReg = getImplicitArgPtr(DAG, DL);
2417 }
2418
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002419 if (OutgoingArg->isRegister()) {
2420 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2421 } else {
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002422 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2423 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2424 SpecialArgOffset);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002425 MemOpChains.push_back(ArgStore);
2426 }
2427 }
2428}
2429
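// Return true if tail call optimization can be guaranteed for this calling
// convention (currently only the 'fast' calling convention).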
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002430static bool canGuaranteeTCO(CallingConv::ID CC) {
2431 return CC == CallingConv::Fast;
2432}
2433
2434/// Return true if we might ever do TCO for calls with this calling convention.
2435static bool mayTailCallThisCC(CallingConv::ID CC) {
2436 switch (CC) {
2437 case CallingConv::C:
2438 return true;
2439 default:
2440 return canGuaranteeTCO(CC);
2441 }
2442}
2443
2444bool SITargetLowering::isEligibleForTailCallOptimization(
2445 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2446 const SmallVectorImpl<ISD::OutputArg> &Outs,
2447 const SmallVectorImpl<SDValue> &OutVals,
2448 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2449 if (!mayTailCallThisCC(CalleeCC))
2450 return false;
2451
2452 MachineFunction &MF = DAG.getMachineFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00002453 const Function &CallerF = MF.getFunction();
2454 CallingConv::ID CallerCC = CallerF.getCallingConv();
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002455 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2456 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2457
2458 // Kernels aren't callable, and don't have a live-in return address, so it
2459 // doesn't make sense to do a tail call with entry functions.
2460 if (!CallerPreserved)
2461 return false;
2462
2463 bool CCMatch = CallerCC == CalleeCC;
2464
2465 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2466 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2467 return true;
2468 return false;
2469 }
2470
2471 // TODO: Can we handle var args?
2472 if (IsVarArg)
2473 return false;
2474
Matthias Braunf1caa282017-12-15 22:22:58 +00002475 for (const Argument &Arg : CallerF.args()) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002476 if (Arg.hasByValAttr())
2477 return false;
2478 }
2479
2480 LLVMContext &Ctx = *DAG.getContext();
2481
2482 // Check that the call results are passed in the same way.
2483 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2484 CCAssignFnForCall(CalleeCC, IsVarArg),
2485 CCAssignFnForCall(CallerCC, IsVarArg)))
2486 return false;
2487
2488 // The callee has to preserve all registers the caller needs to preserve.
2489 if (!CCMatch) {
2490 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2491 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2492 return false;
2493 }
2494
2495 // Nothing more to check if the callee is taking no arguments.
2496 if (Outs.empty())
2497 return true;
2498
2499 SmallVector<CCValAssign, 16> ArgLocs;
2500 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2501
2502 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2503
2504 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2505 // If the stack arguments for this call do not fit into our own save area then
2506 // the call cannot be made tail.
2507 // TODO: Is this really necessary?
2508 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2509 return false;
2510
2511 const MachineRegisterInfo &MRI = MF.getRegInfo();
2512 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2513}
2514
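// Decide whether a call marked 'tail' in the IR may be emitted as a tail call:
// never from entry functions, and not when 'disable-tail-calls' is set.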
2515bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2516 if (!CI->isTailCall())
2517 return false;
2518
2519 const Function *ParentFn = CI->getParent()->getParent();
2520 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2521 return false;
2522
2523 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2524 return (Attr.getValueAsString() != "true");
2525}
2526
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002527// The wave scratch offset register is used as the global base pointer.
2528SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2529 SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002530 SelectionDAG &DAG = CLI.DAG;
2531 const SDLoc &DL = CLI.DL;
2532 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2533 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2534 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2535 SDValue Chain = CLI.Chain;
2536 SDValue Callee = CLI.Callee;
2537 bool &IsTailCall = CLI.IsTailCall;
2538 CallingConv::ID CallConv = CLI.CallConv;
2539 bool IsVarArg = CLI.IsVarArg;
2540 bool IsSibCall = false;
2541 bool IsThisReturn = false;
2542 MachineFunction &MF = DAG.getMachineFunction();
2543
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002544 if (IsVarArg) {
2545 return lowerUnhandledCall(CLI, InVals,
2546 "unsupported call to variadic function ");
2547 }
2548
Matt Arsenault935f3b72018-08-08 16:58:39 +00002549 if (!CLI.CS.getInstruction())
2550 report_fatal_error("unsupported libcall legalization");
2551
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002552 if (!CLI.CS.getCalledFunction()) {
2553 return lowerUnhandledCall(CLI, InVals,
2554 "unsupported indirect call to function ");
2555 }
2556
2557 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2558 return lowerUnhandledCall(CLI, InVals,
2559 "unsupported required tail call to function ");
2560 }
2561
Matt Arsenault1fb90132018-06-28 10:18:36 +00002562 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2563 // Note the issue is with the CC of the calling function, not of the call
2564 // itself.
2565 return lowerUnhandledCall(CLI, InVals,
2566 "unsupported call from graphics shader of function ");
2567 }
2568
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002569 if (IsTailCall) {
2570 IsTailCall = isEligibleForTailCallOptimization(
2571 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2572 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2573 report_fatal_error("failed to perform tail call elimination on a call "
2574 "site marked musttail");
2575 }
2576
2577 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2578
2579 // A sibling call is one where we're under the usual C ABI and not planning
2580 // to change that but can still do a tail call:
2581 if (!TailCallOpt && IsTailCall)
2582 IsSibCall = true;
2583
2584 if (IsTailCall)
2585 ++NumTailCalls;
2586 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002587
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002588 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2589
2590 // Analyze operands of the call, assigning locations to each operand.
2591 SmallVector<CCValAssign, 16> ArgLocs;
2592 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2593 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002594
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002595 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2596
2597 // Get a count of how many bytes are to be pushed on the stack.
2598 unsigned NumBytes = CCInfo.getNextStackOffset();
2599
2600 if (IsSibCall) {
2601 // Since we're not changing the ABI to make this a tail call, the memory
2602 // operands are already available in the caller's incoming argument space.
2603 NumBytes = 0;
2604 }
2605
2606 // FPDiff is the byte offset of the call's argument area from the callee's.
2607 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2608 // by this amount for a tail call. In a sibling call it must be 0 because the
2609 // caller will deallocate the entire stack and the callee still expects its
2610 // arguments to begin at SP+0. Completely unused for non-tail calls.
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002611 int32_t FPDiff = 0;
2612 MachineFrameInfo &MFI = MF.getFrameInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002613 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2614
Matt Arsenault6efd0822017-09-14 17:14:57 +00002615 SDValue CallerSavedFP;
2616
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002617 // Adjust the stack pointer for the new arguments...
2618 // These operations are automatically eliminated by the prolog/epilog pass
2619 if (!IsSibCall) {
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002620 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002621
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002622 SmallVector<SDValue, 4> CopyFromChains;
2623
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002624 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2625
2626 // In the HSA case, this should be an identity copy.
2627 SDValue ScratchRSrcReg
2628 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2629 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002630 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002631
2632 // TODO: Don't hardcode these registers and get from the callee function.
2633 SDValue ScratchWaveOffsetReg
2634 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2635 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002636 CopyFromChains.push_back(ScratchWaveOffsetReg.getValue(1));
Matt Arsenault6efd0822017-09-14 17:14:57 +00002637
2638 if (!Info->isEntryFunction()) {
2639 // Avoid clobbering this function's FP value. In the current convention
2640 // the callee will overwrite it, so do a save/restore around the call site.
2641 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2642 Info->getFrameOffsetReg(), MVT::i32);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002643 CopyFromChains.push_back(CallerSavedFP.getValue(1));
Matt Arsenault6efd0822017-09-14 17:14:57 +00002644 }
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002645
2646 Chain = DAG.getTokenFactor(DL, CopyFromChains);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002647 }
2648
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002649 SmallVector<SDValue, 8> MemOpChains;
2650 MVT PtrVT = MVT::i32;
2651
2652 // Walk the register/memloc assignments, inserting copies/loads.
2653 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2654 ++i, ++realArgIdx) {
2655 CCValAssign &VA = ArgLocs[i];
2656 SDValue Arg = OutVals[realArgIdx];
2657
2658 // Promote the value if needed.
2659 switch (VA.getLocInfo()) {
2660 case CCValAssign::Full:
2661 break;
2662 case CCValAssign::BCvt:
2663 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2664 break;
2665 case CCValAssign::ZExt:
2666 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2667 break;
2668 case CCValAssign::SExt:
2669 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2670 break;
2671 case CCValAssign::AExt:
2672 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2673 break;
2674 case CCValAssign::FPExt:
2675 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2676 break;
2677 default:
2678 llvm_unreachable("Unknown loc info!");
2679 }
2680
2681 if (VA.isRegLoc()) {
2682 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2683 } else {
2684 assert(VA.isMemLoc());
2685
2686 SDValue DstAddr;
2687 MachinePointerInfo DstInfo;
2688
2689 unsigned LocMemOffset = VA.getLocMemOffset();
2690 int32_t Offset = LocMemOffset;
Matt Arsenaultb655fa92017-11-29 01:25:12 +00002691
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002692 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002693 unsigned Align = 0;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002694
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002695 if (IsTailCall) {
2696 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2697 unsigned OpSize = Flags.isByVal() ?
2698 Flags.getByValSize() : VA.getValVT().getStoreSize();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002699
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002700 // FIXME: We can have better than the minimum byval required alignment.
2701 Align = Flags.isByVal() ? Flags.getByValAlign() :
2702 MinAlign(Subtarget->getStackAlignment(), Offset);
2703
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002704 Offset = Offset + FPDiff;
2705 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2706
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002707 DstAddr = DAG.getFrameIndex(FI, PtrVT);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002708 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2709
2710 // Make sure any stack arguments overlapping with where we're storing
2711 // are loaded before this eventual operation. Otherwise they'll be
2712 // clobbered.
2713
2714 // FIXME: Why is this really necessary? This seems to just result in a
2715 // lot of code to copy the stack and write them back to the same
2716 // locations, which are supposed to be immutable?
2717 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2718 } else {
2719 DstAddr = PtrOff;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002720 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002721 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002722 }
2723
2724 if (Outs[i].Flags.isByVal()) {
2725 SDValue SizeNode =
2726 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2727 SDValue Cpy = DAG.getMemcpy(
2728 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2729 /*isVol = */ false, /*AlwaysInline = */ true,
Yaxun Liuc5962262017-11-22 16:13:35 +00002730 /*isTailCall = */ false, DstInfo,
2731 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
Matt Arsenault0da63502018-08-31 05:49:54 +00002732 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002733
2734 MemOpChains.push_back(Cpy);
2735 } else {
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002736 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002737 MemOpChains.push_back(Store);
2738 }
2739 }
2740 }
2741
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002742 // Copy special input registers after user input arguments.
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002743 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002744
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002745 if (!MemOpChains.empty())
2746 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2747
2748 // Build a sequence of copy-to-reg nodes chained together with token chain
2749 // and flag operands which copy the outgoing args into the appropriate regs.
2750 SDValue InFlag;
2751 for (auto &RegToPass : RegsToPass) {
2752 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2753 RegToPass.second, InFlag);
2754 InFlag = Chain.getValue(1);
2755 }
2756
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002757
2758 SDValue PhysReturnAddrReg;
2759 if (IsTailCall) {
2760 // Since the return is being combined with the call, we need to pass on the
2761 // return address.
2762
2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2764 SDValue ReturnAddrReg = CreateLiveInRegister(
2765 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2766
2767 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2768 MVT::i64);
2769 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2770 InFlag = Chain.getValue(1);
2771 }
2772
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002773 // We don't usually want to end the call-sequence here because we would tidy
2774 // the frame up *after* the call. However, in the ABI-changing tail-call case
2775 // we've carefully laid out the parameters so that when sp is reset they'll be
2776 // in the correct location.
2777 if (IsTailCall && !IsSibCall) {
2778 Chain = DAG.getCALLSEQ_END(Chain,
2779 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2780 DAG.getTargetConstant(0, DL, MVT::i32),
2781 InFlag, DL);
2782 InFlag = Chain.getValue(1);
2783 }
2784
2785 std::vector<SDValue> Ops;
2786 Ops.push_back(Chain);
2787 Ops.push_back(Callee);
Scott Linderd19d1972019-02-04 20:00:07 +00002788 // Add a redundant copy of the callee global which will not be legalized, as
2789 // we need direct access to the callee later.
2790 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2791 const GlobalValue *GV = GSD->getGlobal();
2792 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002793
2794 if (IsTailCall) {
2795 // Each tail call may have to adjust the stack by a different amount, so
2796 // this information must travel along with the operation for eventual
2797 // consumption by emitEpilogue.
2798 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002799
2800 Ops.push_back(PhysReturnAddrReg);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002801 }
2802
2803 // Add argument registers to the end of the list so that they are known live
2804 // into the call.
2805 for (auto &RegToPass : RegsToPass) {
2806 Ops.push_back(DAG.getRegister(RegToPass.first,
2807 RegToPass.second.getValueType()));
2808 }
2809
2810 // Add a register mask operand representing the call-preserved registers.
2811
Tom Stellardc5a154d2018-06-28 23:47:12 +00002812 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002813 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2814 assert(Mask && "Missing call preserved mask for calling convention");
2815 Ops.push_back(DAG.getRegisterMask(Mask));
2816
2817 if (InFlag.getNode())
2818 Ops.push_back(InFlag);
2819
2820 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2821
2822 // If we're doing a tail call, use a TC_RETURN here rather than an
2823 // actual call instruction.
2824 if (IsTailCall) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002825 MFI.setHasTailCall();
2826 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002827 }
2828
2829 // Returns a chain and a flag for retval copy to use.
2830 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2831 Chain = Call.getValue(0);
2832 InFlag = Call.getValue(1);
2833
Matt Arsenault6efd0822017-09-14 17:14:57 +00002834 if (CallerSavedFP) {
2835 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2836 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2837 InFlag = Chain.getValue(1);
2838 }
2839
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002840 uint64_t CalleePopBytes = NumBytes;
2841 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002842 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2843 InFlag, DL);
2844 if (!Ins.empty())
2845 InFlag = Chain.getValue(1);
2846
2847 // Handle result values, copying them out of physregs into vregs that we
2848 // return.
2849 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2850 InVals, IsThisReturn,
2851 IsThisReturn ? OutVals[0] : SDValue());
2852}
2853
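// Resolve a register name (e.g. from a named-register read/write intrinsic) to
// a physical register, checking the requested type and subtarget support.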
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002854unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2855 SelectionDAG &DAG) const {
2856 unsigned Reg = StringSwitch<unsigned>(RegName)
2857 .Case("m0", AMDGPU::M0)
2858 .Case("exec", AMDGPU::EXEC)
2859 .Case("exec_lo", AMDGPU::EXEC_LO)
2860 .Case("exec_hi", AMDGPU::EXEC_HI)
2861 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2862 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2863 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2864 .Default(AMDGPU::NoRegister);
2865
2866 if (Reg == AMDGPU::NoRegister) {
2867 report_fatal_error(Twine("invalid register name \""
2868 + StringRef(RegName) + "\"."));
2869
2870 }
2871
Matt Arsenaulte4c2e9b2019-06-19 23:54:58 +00002872 if (!Subtarget->hasFlatScrRegister() &&
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00002873 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002874 report_fatal_error(Twine("invalid register \""
2875 + StringRef(RegName) + "\" for subtarget."));
2876 }
2877
2878 switch (Reg) {
2879 case AMDGPU::M0:
2880 case AMDGPU::EXEC_LO:
2881 case AMDGPU::EXEC_HI:
2882 case AMDGPU::FLAT_SCR_LO:
2883 case AMDGPU::FLAT_SCR_HI:
2884 if (VT.getSizeInBits() == 32)
2885 return Reg;
2886 break;
2887 case AMDGPU::EXEC:
2888 case AMDGPU::FLAT_SCR:
2889 if (VT.getSizeInBits() == 64)
2890 return Reg;
2891 break;
2892 default:
2893 llvm_unreachable("missing register type checking");
2894 }
2895
2896 report_fatal_error(Twine("invalid type for register \""
2897 + StringRef(RegName) + "\"."));
2898}
2899
Matt Arsenault786724a2016-07-12 21:41:32 +00002900// If kill is not the last instruction, split the block so kill is always a
2901// proper terminator.
2902MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2903 MachineBasicBlock *BB) const {
2904 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2905
2906 MachineBasicBlock::iterator SplitPoint(&MI);
2907 ++SplitPoint;
2908
2909 if (SplitPoint == BB->end()) {
2910 // Don't bother with a new block.
Marek Olsakce76ea02017-10-24 10:27:13 +00002911 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002912 return BB;
2913 }
2914
2915 MachineFunction *MF = BB->getParent();
2916 MachineBasicBlock *SplitBB
2917 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2918
Matt Arsenault786724a2016-07-12 21:41:32 +00002919 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2920 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2921
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002922 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
Matt Arsenault786724a2016-07-12 21:41:32 +00002923 BB->addSuccessor(SplitBB);
2924
Marek Olsakce76ea02017-10-24 10:27:13 +00002925 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002926 return SplitBB;
2927}
2928
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00002929// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
2930// \p MI will be the only instruction in the loop body block. Otherwise, it will
2931// be the first instruction in the remainder block.
2932//
2933/// \returns { LoopBody, Remainder }
2934static std::pair<MachineBasicBlock *, MachineBasicBlock *>
2935splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
2936 MachineFunction *MF = MBB.getParent();
2937 MachineBasicBlock::iterator I(&MI);
2938
2939 // To insert the loop we need to split the block. Move everything after this
2940 // point to a new block, and insert a new empty block between the two.
2941 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2942 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2943 MachineFunction::iterator MBBI(MBB);
2944 ++MBBI;
2945
2946 MF->insert(MBBI, LoopBB);
2947 MF->insert(MBBI, RemainderBB);
2948
2949 LoopBB->addSuccessor(LoopBB);
2950 LoopBB->addSuccessor(RemainderBB);
2951
2952 // Move the rest of the block into a new block.
2953 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2954
2955 if (InstInLoop) {
2956 auto Next = std::next(I);
2957
2958 // Move instruction to loop body.
2959 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
2960
2961 // Move the rest of the block.
2962 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
2963 } else {
2964 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2965 }
2966
2967 MBB.addSuccessor(LoopBB);
2968
2969 return std::make_pair(LoopBB, RemainderBB);
2970}
2971
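// Wrap the GWS instruction in a loop: clear TRAP_STS.MEM_VIOL, execute the
// instruction, then re-read MEM_VIOL and repeat while it is still set.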
2972MachineBasicBlock *
2973SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
2974 MachineBasicBlock *BB) const {
2975 const DebugLoc &DL = MI.getDebugLoc();
2976
2977 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2978
2979 MachineBasicBlock *LoopBB;
2980 MachineBasicBlock *RemainderBB;
2981 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2982
2983 MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
2984
2985 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
2986
2987 MachineBasicBlock::iterator I = LoopBB->end();
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00002988 MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00002989
2990 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
2991 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
2992
2993 // Clear TRAP_STS.MEM_VIOL
2994 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
2995 .addImm(0)
2996 .addImm(EncodedReg);
2997
2998 // This is a pain, but we're not allowed to have physical register live-ins
2999 // yet. Insert a pair of copies if the VGPR0 hack is necessary.
Matt Arsenault740322f2019-06-20 21:11:42 +00003000 if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00003001 unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3002 BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
3003 .add(*Src);
3004
3005 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
3006 .addReg(Data0);
3007
3008 MRI.setSimpleHint(Data0, Src->getReg());
3009 }
3010
3011 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
3012 .addImm(0);
3013
3014 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3015
3016 // Load and check TRAP_STS.MEM_VIOL
3017 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3018 .addImm(EncodedReg);
3019
3020 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3021 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3022 .addReg(Reg, RegState::Kill)
3023 .addImm(0);
3024 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3025 .addMBB(LoopBB);
3026
3027 return RemainderBB;
3028}
3029
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003030// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3031// wavefront. If the value is uniform and just happens to be in a VGPR, this
3032// will only do one iteration. In the worst case, this will loop once per lane (64 times on wave64).
3033//
3034// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003035static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
3036 const SIInstrInfo *TII,
3037 MachineRegisterInfo &MRI,
3038 MachineBasicBlock &OrigBB,
3039 MachineBasicBlock &LoopBB,
3040 const DebugLoc &DL,
3041 const MachineOperand &IdxReg,
3042 unsigned InitReg,
3043 unsigned ResultReg,
3044 unsigned PhiReg,
3045 unsigned InitSaveExecReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003046 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003047 bool UseGPRIdxMode,
3048 bool IsIndirectSrc) {
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003049 MachineFunction *MF = OrigBB.getParent();
3050 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003052 MachineBasicBlock::iterator I = LoopBB.begin();
3053
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003054 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3055 unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
3056 unsigned NewExec = MRI.createVirtualRegister(BoolRC);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003057 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003058 unsigned CondReg = MRI.createVirtualRegister(BoolRC);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003059
3060 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3061 .addReg(InitReg)
3062 .addMBB(&OrigBB)
3063 .addReg(ResultReg)
3064 .addMBB(&LoopBB);
3065
3066 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3067 .addReg(InitSaveExecReg)
3068 .addMBB(&OrigBB)
3069 .addReg(NewExec)
3070 .addMBB(&LoopBB);
3071
3072 // Read the next variant <- also loop target.
3073 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3074 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
3075
3076 // Compare the just read M0 value to all possible Idx values.
3077 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3078 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00003079 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003080
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003081 // Update EXEC, save the original EXEC value to VCC.
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003082 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3083 : AMDGPU::S_AND_SAVEEXEC_B64),
3084 NewExec)
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003085 .addReg(CondReg, RegState::Kill);
3086
3087 MRI.setSimpleHint(NewExec, CondReg);
3088
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003089 if (UseGPRIdxMode) {
3090 unsigned IdxReg;
3091 if (Offset == 0) {
3092 IdxReg = CurrentIdxReg;
3093 } else {
3094 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3095 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
3096 .addReg(CurrentIdxReg, RegState::Kill)
3097 .addImm(Offset);
3098 }
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003099 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003100 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003101 MachineInstr *SetOn =
3102 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3103 .addReg(IdxReg, RegState::Kill)
3104 .addImm(IdxMode);
3105 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003106 } else {
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003107 // Move index from VCC into M0
3108 if (Offset == 0) {
3109 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3110 .addReg(CurrentIdxReg, RegState::Kill);
3111 } else {
3112 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3113 .addReg(CurrentIdxReg, RegState::Kill)
3114 .addImm(Offset);
3115 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003116 }
3117
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003118 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003119 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003120 MachineInstr *InsertPt =
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003121 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3122 : AMDGPU::S_XOR_B64_term), Exec)
3123 .addReg(Exec)
3124 .addReg(NewExec);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003125
3126 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3127 // s_cbranch_scc0?
3128
3129 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3130 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3131 .addMBB(&LoopBB);
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003132
3133 return InsertPt->getIterator();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003134}
3135
3136// This has slightly sub-optimal regalloc when the source vector is killed by
3137// the read. The register allocator does not understand that the kill is
3138// per-workitem, so the vector is kept alive for the whole loop and we end up not
3139// re-using a subregister from it, using 1 more VGPR than necessary. That extra VGPR
3140// was avoided when this was expanded after register allocation.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003141static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
3142 MachineBasicBlock &MBB,
3143 MachineInstr &MI,
3144 unsigned InitResultReg,
3145 unsigned PhiReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003146 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003147 bool UseGPRIdxMode,
3148 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003149 MachineFunction *MF = MBB.getParent();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003150 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3151 const SIRegisterInfo *TRI = ST.getRegisterInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003152 MachineRegisterInfo &MRI = MF->getRegInfo();
3153 const DebugLoc &DL = MI.getDebugLoc();
3154 MachineBasicBlock::iterator I(&MI);
3155
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003156 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003157 unsigned DstReg = MI.getOperand(0).getReg();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003158 unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3159 unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3160 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3161 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003162
3163 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3164
3165 // Save the EXEC mask
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003166 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3167 .addReg(Exec);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003168
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00003169 MachineBasicBlock *LoopBB;
3170 MachineBasicBlock *RemainderBB;
3171 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003172
3173 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3174
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003175 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3176 InitResultReg, DstReg, PhiReg, TmpExec,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003177 Offset, UseGPRIdxMode, IsIndirectSrc);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003178
3179 MachineBasicBlock::iterator First = RemainderBB->begin();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003180 BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003181 .addReg(SaveExec);
3182
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003183 return InsPt;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003184}
3185
3186// Returns subreg index, offset
3187static std::pair<unsigned, int>
3188computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3189 const TargetRegisterClass *SuperRC,
3190 unsigned VecReg,
3191 int Offset) {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003192 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003193
3194 // Skip out of bounds offsets, or else we would end up using an undefined
3195 // register.
3196 if (Offset >= NumElts || Offset < 0)
3197 return std::make_pair(AMDGPU::sub0, Offset);
3198
3199 return std::make_pair(AMDGPU::sub0 + Offset, 0);
3200}
3201
3202// Return true if the index is an SGPR and was set.
3203static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3204 MachineRegisterInfo &MRI,
3205 MachineInstr &MI,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003206 int Offset,
3207 bool UseGPRIdxMode,
3208 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003209 MachineBasicBlock *MBB = MI.getParent();
3210 const DebugLoc &DL = MI.getDebugLoc();
3211 MachineBasicBlock::iterator I(&MI);
3212
3213 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3214 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3215
3216 assert(Idx->getReg() != AMDGPU::NoRegister);
3217
3218 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3219 return false;
3220
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003221 if (UseGPRIdxMode) {
3222 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003223 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003224 if (Offset == 0) {
3225 MachineInstr *SetOn =
Diana Picus116bbab2017-01-13 09:58:52 +00003226 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3227 .add(*Idx)
3228 .addImm(IdxMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003229
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003230 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003231 } else {
3232 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3233 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
Diana Picus116bbab2017-01-13 09:58:52 +00003234 .add(*Idx)
3235 .addImm(Offset);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003236 MachineInstr *SetOn =
3237 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3238 .addReg(Tmp, RegState::Kill)
3239 .addImm(IdxMode);
3240
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003241 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003242 }
3243
3244 return true;
3245 }
3246
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003247 if (Offset == 0) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003248 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3249 .add(*Idx);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003250 } else {
3251 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003252 .add(*Idx)
3253 .addImm(Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003254 }
3255
3256 return true;
3257}
3258
3259// Control flow needs to be inserted if indexing with a VGPR.
3260static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3261 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003262 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003263 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003264 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3265 MachineFunction *MF = MBB.getParent();
3266 MachineRegisterInfo &MRI = MF->getRegInfo();
3267
3268 unsigned Dst = MI.getOperand(0).getReg();
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003269 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003270 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3271
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003272 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003273
3274 unsigned SubReg;
3275 std::tie(SubReg, Offset)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003276 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003277
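  // When the subtarget supports VGPR indexing mode, indexing is done with
  // s_set_gpr_idx_on/off and a plain v_mov; otherwise the index is written to
  // M0 for v_movrels.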
Marek Olsake22fdb92017-03-21 17:00:32 +00003278 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003279
3280 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003281 MachineBasicBlock::iterator I(&MI);
3282 const DebugLoc &DL = MI.getDebugLoc();
3283
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003284 if (UseGPRIdxMode) {
3285 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3286 // to avoid interfering with other uses, so probably requires a new
3287 // optimization pass.
3288 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003289 .addReg(SrcReg, RegState::Undef, SubReg)
3290 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003291 .addReg(AMDGPU::M0, RegState::Implicit);
3292 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3293 } else {
3294 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003295 .addReg(SrcReg, RegState::Undef, SubReg)
3296 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003297 }
3298
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003299 MI.eraseFromParent();
3300
3301 return &MBB;
3302 }
3303
3304 const DebugLoc &DL = MI.getDebugLoc();
3305 MachineBasicBlock::iterator I(&MI);
3306
3307 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3308 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3309
3310 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3311
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003312 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3313 Offset, UseGPRIdxMode, true);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003314 MachineBasicBlock *LoopBB = InsPt->getParent();
3315
3316 if (UseGPRIdxMode) {
3317 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003318 .addReg(SrcReg, RegState::Undef, SubReg)
3319 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003320 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003321 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003322 } else {
3323 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003324 .addReg(SrcReg, RegState::Undef, SubReg)
3325 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003326 }
3327
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003328 MI.eraseFromParent();
3329
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003330 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003331}
3332
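// Pick the V_MOVRELD_B32 pseudo whose destination register class matches the
// size of \p VecRC.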
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003333static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3334 const TargetRegisterClass *VecRC) {
3335 switch (TRI.getRegSizeInBits(*VecRC)) {
3336 case 32: // 4 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003337 return AMDGPU::V_MOVRELD_B32_V1;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003338 case 64: // 8 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003339 return AMDGPU::V_MOVRELD_B32_V2;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003340 case 128: // 16 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003341 return AMDGPU::V_MOVRELD_B32_V4;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003342 case 256: // 32 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003343 return AMDGPU::V_MOVRELD_B32_V8;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003344 case 512: // 64 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003345 return AMDGPU::V_MOVRELD_B32_V16;
3346 default:
3347 llvm_unreachable("unsupported size for MOVRELD pseudos");
3348 }
3349}
3350
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003351static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3352 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003353 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003354 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003355 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3356 MachineFunction *MF = MBB.getParent();
3357 MachineRegisterInfo &MRI = MF->getRegInfo();
3358
3359 unsigned Dst = MI.getOperand(0).getReg();
3360 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3361 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3362 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3363 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3364 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3365
3366 // This can be an immediate, but will be folded later.
3367 assert(Val->getReg());
3368
3369 unsigned SubReg;
3370 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3371 SrcVec->getReg(),
3372 Offset);
Marek Olsake22fdb92017-03-21 17:00:32 +00003373 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003374
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003375 if (Idx->getReg() == AMDGPU::NoRegister) {
3376 MachineBasicBlock::iterator I(&MI);
3377 const DebugLoc &DL = MI.getDebugLoc();
3378
3379 assert(Offset == 0);
3380
3381 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
Diana Picus116bbab2017-01-13 09:58:52 +00003382 .add(*SrcVec)
3383 .add(*Val)
3384 .addImm(SubReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003385
3386 MI.eraseFromParent();
3387 return &MBB;
3388 }
3389
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003390 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003391 MachineBasicBlock::iterator I(&MI);
3392 const DebugLoc &DL = MI.getDebugLoc();
3393
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003394 if (UseGPRIdxMode) {
3395 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003396 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3397 .add(*Val)
3398 .addReg(Dst, RegState::ImplicitDefine)
3399 .addReg(SrcVec->getReg(), RegState::Implicit)
3400 .addReg(AMDGPU::M0, RegState::Implicit);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003401
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003402 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3403 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003404 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003405
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003406 BuildMI(MBB, I, DL, MovRelDesc)
3407 .addReg(Dst, RegState::Define)
3408 .addReg(SrcVec->getReg())
Diana Picus116bbab2017-01-13 09:58:52 +00003409 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003410 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003411 }
3412
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003413 MI.eraseFromParent();
3414 return &MBB;
3415 }
3416
3417 if (Val->isReg())
3418 MRI.clearKillFlags(Val->getReg());
3419
3420 const DebugLoc &DL = MI.getDebugLoc();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003421
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003422 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3423
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003424 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003425 Offset, UseGPRIdxMode, false);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003426 MachineBasicBlock *LoopBB = InsPt->getParent();
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003427
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003428 if (UseGPRIdxMode) {
3429 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003430 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3431 .add(*Val) // src0
3432 .addReg(Dst, RegState::ImplicitDefine)
3433 .addReg(PhiReg, RegState::Implicit)
3434 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003435 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003436 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003437 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003438
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003439 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3440 .addReg(Dst, RegState::Define)
3441 .addReg(PhiReg)
Diana Picus116bbab2017-01-13 09:58:52 +00003442 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003443 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003444 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003445
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003446 MI.eraseFromParent();
3447
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003448 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003449}
3450
Matt Arsenault786724a2016-07-12 21:41:32 +00003451MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3452 MachineInstr &MI, MachineBasicBlock *BB) const {
Tom Stellard244891d2016-12-20 15:52:17 +00003453
3454 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3455 MachineFunction *MF = BB->getParent();
3456 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3457
3458 if (TII->isMIMG(MI)) {
Matt Arsenault905f3512017-12-29 17:18:14 +00003459 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3460 report_fatal_error("missing mem operand from MIMG instruction");
3461 }
Tom Stellard244891d2016-12-20 15:52:17 +00003462 // Add a memoperand for mimg instructions so that they aren't assumed to
3463 // be ordered memory instructions.
3464
Tom Stellard244891d2016-12-20 15:52:17 +00003465 return BB;
3466 }
3467
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003468 switch (MI.getOpcode()) {
Matt Arsenault301162c2017-11-15 21:51:43 +00003469 case AMDGPU::S_ADD_U64_PSEUDO:
3470 case AMDGPU::S_SUB_U64_PSEUDO: {
3471 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003472 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3473 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3474 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Matt Arsenault301162c2017-11-15 21:51:43 +00003475 const DebugLoc &DL = MI.getDebugLoc();
3476
3477 MachineOperand &Dest = MI.getOperand(0);
3478 MachineOperand &Src0 = MI.getOperand(1);
3479 MachineOperand &Src1 = MI.getOperand(2);
3480
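    // Split the 64-bit add/sub into 32-bit halves: the low half produces the
    // carry that the high half consumes, and the halves are recombined with a
    // REG_SEQUENCE.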
3481 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3482 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3483
3484 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003485 Src0, BoolRC, AMDGPU::sub0,
Matt Arsenault301162c2017-11-15 21:51:43 +00003486 &AMDGPU::SReg_32_XM0RegClass);
3487 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003488 Src0, BoolRC, AMDGPU::sub1,
Matt Arsenault301162c2017-11-15 21:51:43 +00003489 &AMDGPU::SReg_32_XM0RegClass);
3490
3491 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003492 Src1, BoolRC, AMDGPU::sub0,
Matt Arsenault301162c2017-11-15 21:51:43 +00003493 &AMDGPU::SReg_32_XM0RegClass);
3494 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003495 Src1, BoolRC, AMDGPU::sub1,
Matt Arsenault301162c2017-11-15 21:51:43 +00003496 &AMDGPU::SReg_32_XM0RegClass);
3497
3498 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3499
3500 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3501 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3502 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3503 .add(Src0Sub0)
3504 .add(Src1Sub0);
3505 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3506 .add(Src0Sub1)
3507 .add(Src1Sub1);
3508 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3509 .addReg(DestSub0)
3510 .addImm(AMDGPU::sub0)
3511 .addReg(DestSub1)
3512 .addImm(AMDGPU::sub1);
3513 MI.eraseFromParent();
3514 return BB;
3515 }
3516 case AMDGPU::SI_INIT_M0: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003517 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00003518 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Diana Picus116bbab2017-01-13 09:58:52 +00003519 .add(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003520 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00003521 return BB;
Matt Arsenault301162c2017-11-15 21:51:43 +00003522 }
Marek Olsak2d825902017-04-28 20:21:58 +00003523 case AMDGPU::SI_INIT_EXEC:
3524 // This should be before all vector instructions.
3525 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3526 AMDGPU::EXEC)
3527 .addImm(MI.getOperand(0).getImm());
3528 MI.eraseFromParent();
3529 return BB;
3530
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003531 case AMDGPU::SI_INIT_EXEC_LO:
3532 // This should be before all vector instructions.
3533 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
3534 AMDGPU::EXEC_LO)
3535 .addImm(MI.getOperand(0).getImm());
3536 MI.eraseFromParent();
3537 return BB;
3538
Marek Olsak2d825902017-04-28 20:21:58 +00003539 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3540 // Extract the thread count from an SGPR input and set EXEC accordingly.
3541 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3542 //
3543 // S_BFE_U32 count, input, {shift, 7}
3544 // S_BFM_B64 exec, count, 0
3545 // S_CMP_EQ_U32 count, 64
3546 // S_CMOV_B64 exec, -1
3547 MachineInstr *FirstMI = &*BB->begin();
3548 MachineRegisterInfo &MRI = MF->getRegInfo();
3549 unsigned InputReg = MI.getOperand(0).getReg();
3550 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3551 bool Found = false;
3552
3553 // Move the COPY of the input reg to the beginning, so that we can use it.
3554 for (auto I = BB->begin(); I != &MI; I++) {
3555 if (I->getOpcode() != TargetOpcode::COPY ||
3556 I->getOperand(0).getReg() != InputReg)
3557 continue;
3558
3559 if (I == FirstMI) {
3560 FirstMI = &*++BB->begin();
3561 } else {
3562 I->removeFromParent();
3563 BB->insert(FirstMI, &*I);
3564 }
3565 Found = true;
3566 break;
3567 }
3568 assert(Found);
Davide Italiano0dcc0152017-05-11 19:58:52 +00003569 (void)Found;
Marek Olsak2d825902017-04-28 20:21:58 +00003570
3571 // This should be before all vector instructions.
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003572 unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
3573 bool isWave32 = getSubtarget()->isWave32();
3574 unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Marek Olsak2d825902017-04-28 20:21:58 +00003575 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3576 .addReg(InputReg)
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003577 .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
3578 BuildMI(*BB, FirstMI, DebugLoc(),
3579 TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
3580 Exec)
Marek Olsak2d825902017-04-28 20:21:58 +00003581 .addReg(CountReg)
3582 .addImm(0);
3583 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3584 .addReg(CountReg, RegState::Kill)
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003585 .addImm(getSubtarget()->getWavefrontSize());
3586 BuildMI(*BB, FirstMI, DebugLoc(),
3587 TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
3588 Exec)
Marek Olsak2d825902017-04-28 20:21:58 +00003589 .addImm(-1);
3590 MI.eraseFromParent();
3591 return BB;
3592 }
3593
Changpeng Fang01f60622016-03-15 17:28:44 +00003594 case AMDGPU::GET_GROUPSTATICSIZE: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003595 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00003596 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
Diana Picus116bbab2017-01-13 09:58:52 +00003597 .add(MI.getOperand(0))
3598 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003599 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00003600 return BB;
3601 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003602 case AMDGPU::SI_INDIRECT_SRC_V1:
3603 case AMDGPU::SI_INDIRECT_SRC_V2:
3604 case AMDGPU::SI_INDIRECT_SRC_V4:
3605 case AMDGPU::SI_INDIRECT_SRC_V8:
3606 case AMDGPU::SI_INDIRECT_SRC_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003607 return emitIndirectSrc(MI, *BB, *getSubtarget());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003608 case AMDGPU::SI_INDIRECT_DST_V1:
3609 case AMDGPU::SI_INDIRECT_DST_V2:
3610 case AMDGPU::SI_INDIRECT_DST_V4:
3611 case AMDGPU::SI_INDIRECT_DST_V8:
3612 case AMDGPU::SI_INDIRECT_DST_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003613 return emitIndirectDst(MI, *BB, *getSubtarget());
Marek Olsakce76ea02017-10-24 10:27:13 +00003614 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3615 case AMDGPU::SI_KILL_I1_PSEUDO:
Matt Arsenault786724a2016-07-12 21:41:32 +00003616 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00003617 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3618 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003619 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3620 const SIRegisterInfo *TRI = ST.getRegisterInfo();
Matt Arsenault22e41792016-08-27 01:00:37 +00003621
3622 unsigned Dst = MI.getOperand(0).getReg();
3623 unsigned Src0 = MI.getOperand(1).getReg();
3624 unsigned Src1 = MI.getOperand(2).getReg();
3625 const DebugLoc &DL = MI.getDebugLoc();
3626 unsigned SrcCond = MI.getOperand(3).getReg();
3627
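    // Select each 32-bit half with V_CNDMASK_B32_e64 on the shared condition
    // copy, then recombine the halves with a REG_SEQUENCE.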
3628 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3629 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003630 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3631 unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
Matt Arsenault22e41792016-08-27 01:00:37 +00003632
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003633 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3634 .addReg(SrcCond);
Matt Arsenault22e41792016-08-27 01:00:37 +00003635 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003636 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003637 .addReg(Src0, 0, AMDGPU::sub0)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003638 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003639 .addReg(Src1, 0, AMDGPU::sub0)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003640 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003641 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003642 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003643 .addReg(Src0, 0, AMDGPU::sub1)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003644 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003645 .addReg(Src1, 0, AMDGPU::sub1)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003646 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003647
3648 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3649 .addReg(DstLo)
3650 .addImm(AMDGPU::sub0)
3651 .addReg(DstHi)
3652 .addImm(AMDGPU::sub1);
3653 MI.eraseFromParent();
3654 return BB;
3655 }
Matt Arsenault327188a2016-12-15 21:57:11 +00003656 case AMDGPU::SI_BR_UNDEF: {
3657 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3658 const DebugLoc &DL = MI.getDebugLoc();
3659 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
Diana Picus116bbab2017-01-13 09:58:52 +00003660 .add(MI.getOperand(0));
Matt Arsenault327188a2016-12-15 21:57:11 +00003661 Br->getOperand(1).setIsUndef(true); // read undef SCC
3662 MI.eraseFromParent();
3663 return BB;
3664 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003665 case AMDGPU::ADJCALLSTACKUP:
3666 case AMDGPU::ADJCALLSTACKDOWN: {
3667 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3668 MachineInstrBuilder MIB(*MF, &MI);
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003669
3670 // Add an implicit use of the frame offset reg to prevent the restore copy
3671 // inserted after the call from being reordered after stack operations in
3672 // the caller's frame.
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003673 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003674 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3675 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003676 return BB;
3677 }
Scott Linderd19d1972019-02-04 20:00:07 +00003678 case AMDGPU::SI_CALL_ISEL: {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003679 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3680 const DebugLoc &DL = MI.getDebugLoc();
Scott Linderd19d1972019-02-04 20:00:07 +00003681
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003682 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003683
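    // Rewrite the pseudo into a real SI_CALL defining the return address
    // register, carrying over all operands and memory references.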
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003684 MachineInstrBuilder MIB;
Scott Linderd19d1972019-02-04 20:00:07 +00003685 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003686
Scott Linderd19d1972019-02-04 20:00:07 +00003687 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003688 MIB.add(MI.getOperand(I));
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003689
Chandler Carruthc73c0302018-08-16 21:30:05 +00003690 MIB.cloneMemRefs(MI);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003691 MI.eraseFromParent();
3692 return BB;
3693 }
Stanislav Mekhanoshin64399da2019-05-02 04:26:35 +00003694 case AMDGPU::V_ADD_I32_e32:
3695 case AMDGPU::V_SUB_I32_e32:
3696 case AMDGPU::V_SUBREV_I32_e32: {
3697 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3698 const DebugLoc &DL = MI.getDebugLoc();
3699 unsigned Opc = MI.getOpcode();
3700
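    // If the VOP2 form has no valid MC opcode on this subtarget, use the VOP3
    // (e64) encoding, which takes an explicit carry-out register and a clamp
    // operand.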
3701 bool NeedClampOperand = false;
3702 if (TII->pseudoToMCOpcode(Opc) == -1) {
3703 Opc = AMDGPU::getVOPe64(Opc);
3704 NeedClampOperand = true;
3705 }
3706
3707 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
3708 if (TII->isVOP3(*I)) {
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +00003709 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3710 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3711 I.addReg(TRI->getVCC(), RegState::Define);
Stanislav Mekhanoshin64399da2019-05-02 04:26:35 +00003712 }
3713 I.add(MI.getOperand(1))
3714 .add(MI.getOperand(2));
3715 if (NeedClampOperand)
3716 I.addImm(0); // clamp bit for e64 encoding
3717
3718 TII->legalizeOperands(*I);
3719
3720 MI.eraseFromParent();
3721 return BB;
3722 }
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00003723 case AMDGPU::DS_GWS_INIT:
3724 case AMDGPU::DS_GWS_SEMA_V:
3725 case AMDGPU::DS_GWS_SEMA_BR:
3726 case AMDGPU::DS_GWS_SEMA_P:
Matt Arsenault740322f2019-06-20 21:11:42 +00003727 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
Matt Arsenault8ad1dec2019-06-20 20:54:32 +00003728 case AMDGPU::DS_GWS_BARRIER:
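    // Without GWS auto replay, emit the memory-violation test loop so the
    // operation is retried if a memory violation interrupted it.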
3729 if (getSubtarget()->hasGWSAutoReplay())
3730 return BB;
3731 return emitGWSMemViolTestLoop(MI, BB);
Changpeng Fang01f60622016-03-15 17:28:44 +00003732 default:
3733 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00003734 }
Tom Stellard75aadc22012-12-11 21:25:42 +00003735}
3736
Matt Arsenaulte11d8ac2017-10-13 21:10:22 +00003737bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3738 return isTypeLegal(VT.getScalarType());
3739}
3740
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003741bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3742 // This currently forces unfolding various combinations of fsub into fma with
3743 // free fneg'd operands. As long as we have fast FMA (controlled by
3744 // isFMAFasterThanFMulAndFAdd), we should perform these.
3745
3746 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3747 // most of these combines appear to be cycle neutral but save on instruction
3748 // count / code size.
3749 return true;
3750}
3751
Mehdi Amini44ede332015-07-09 02:09:04 +00003752EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3753 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00003754 if (!VT.isVector()) {
3755 return MVT::i1;
3756 }
Matt Arsenault8596f712014-11-28 22:51:38 +00003757 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00003758}
3759
Matt Arsenault94163282016-12-22 16:36:25 +00003760MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3761 // TODO: Should i16 be used always if legal? For now it would force VALU
3762 // shifts.
3763 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
Christian Konig082a14a2013-03-18 11:34:05 +00003764}
3765
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003766 // Answering this is somewhat tricky and depends on the specific device, since
3767// different devices have different rates for fma or for all f64 operations.
3768//
3769// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3770// regardless of which device (although the number of cycles differs between
3771// devices), so it is always profitable for f64.
3772//
3773// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3774// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3775// which we can always do even without fused FP ops since it returns the same
3776// result as the separate operations and since it is always full
3777// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3778// however does not support denormals, so we do report fma as faster if we have
3779// a fast fma device and require denormals.
3780//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003781bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3782 VT = VT.getScalarType();
3783
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003784 switch (VT.getSimpleVT().SimpleTy) {
Matt Arsenault0084adc2018-04-30 19:08:16 +00003785 case MVT::f32: {
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003786 // This is as fast on some subtargets. However, we always have full rate f32
3787 // mad available, which returns the same result as the separate operations
Matt Arsenault8d630032015-02-20 22:10:41 +00003788 // and which we should prefer over fma. We can't use this if we want to support
3789 // denormals, so only report this in these cases.
Matt Arsenault0084adc2018-04-30 19:08:16 +00003790 if (Subtarget->hasFP32Denormals())
3791 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3792
3793 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3794 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3795 }
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003796 case MVT::f64:
3797 return true;
Matt Arsenault9e22bc22016-12-22 03:21:48 +00003798 case MVT::f16:
3799 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003800 default:
3801 break;
3802 }
3803
3804 return false;
3805}
3806
Tom Stellard75aadc22012-12-11 21:25:42 +00003807//===----------------------------------------------------------------------===//
3808// Custom DAG Lowering Operations
3809//===----------------------------------------------------------------------===//
3810
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003811// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3812// wider vector type is legal.
3813SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3814 SelectionDAG &DAG) const {
3815 unsigned Opc = Op.getOpcode();
3816 EVT VT = Op.getValueType();
3817 assert(VT == MVT::v4f16);
3818
3819 SDValue Lo, Hi;
3820 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3821
3822 SDLoc SL(Op);
3823 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3824 Op->getFlags());
3825 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3826 Op->getFlags());
3827
3828 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3829}
3830
3831// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3832// wider vector type is legal.
3833SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3834 SelectionDAG &DAG) const {
3835 unsigned Opc = Op.getOpcode();
3836 EVT VT = Op.getValueType();
3837 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3838
3839 SDValue Lo0, Hi0;
3840 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3841 SDValue Lo1, Hi1;
3842 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3843
3844 SDLoc SL(Op);
3845
3846 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3847 Op->getFlags());
3848 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3849 Op->getFlags());
3850
3851 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3852}
3853
Tom Stellard75aadc22012-12-11 21:25:42 +00003854SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3855 switch (Op.getOpcode()) {
3856 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00003857 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Aakanksha Patild5443f82019-05-29 18:20:11 +00003858 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00003859 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00003860 SDValue Result = LowerLOAD(Op, DAG);
3861 assert((!Result.getNode() ||
3862 Result.getNode()->getNumValues() == 2) &&
3863 "Load should return a value and a chain");
3864 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00003865 }
Tom Stellardaf775432013-10-23 00:44:32 +00003866
Matt Arsenaultad14ce82014-07-19 18:44:39 +00003867 case ISD::FSIN:
3868 case ISD::FCOS:
3869 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00003870 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00003871 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00003872 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00003873 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003874 case ISD::GlobalAddress: {
3875 MachineFunction &MF = DAG.getMachineFunction();
3876 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3877 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00003878 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003879 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003880 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003881 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00003882 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault3aef8092017-01-23 23:09:58 +00003883 case ISD::INSERT_VECTOR_ELT:
3884 return lowerINSERT_VECTOR_ELT(Op, DAG);
3885 case ISD::EXTRACT_VECTOR_ELT:
3886 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
Matt Arsenault67a98152018-05-16 11:47:30 +00003887 case ISD::BUILD_VECTOR:
3888 return lowerBUILD_VECTOR(Op, DAG);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00003889 case ISD::FP_ROUND:
3890 return lowerFP_ROUND(Op, DAG);
Matt Arsenault3e025382017-04-24 17:49:13 +00003891 case ISD::TRAP:
Matt Arsenault3e025382017-04-24 17:49:13 +00003892 return lowerTRAP(Op, DAG);
Tony Tye43259df2018-05-16 16:19:34 +00003893 case ISD::DEBUGTRAP:
3894 return lowerDEBUGTRAP(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003895 case ISD::FABS:
3896 case ISD::FNEG:
Matt Arsenault36cdcfa2018-08-02 13:43:42 +00003897 case ISD::FCANONICALIZE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003898 return splitUnaryVectorOp(Op, DAG);
Matt Arsenault687ec752018-10-22 16:27:27 +00003899 case ISD::FMINNUM:
3900 case ISD::FMAXNUM:
3901 return lowerFMINNUM_FMAXNUM(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003902 case ISD::SHL:
3903 case ISD::SRA:
3904 case ISD::SRL:
3905 case ISD::ADD:
3906 case ISD::SUB:
3907 case ISD::MUL:
3908 case ISD::SMIN:
3909 case ISD::SMAX:
3910 case ISD::UMIN:
3911 case ISD::UMAX:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003912 case ISD::FADD:
3913 case ISD::FMUL:
Matt Arsenault687ec752018-10-22 16:27:27 +00003914 case ISD::FMINNUM_IEEE:
3915 case ISD::FMAXNUM_IEEE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003916 return splitBinaryVectorOp(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00003917 }
3918 return SDValue();
3919}
3920
Matt Arsenault1349a042018-05-22 06:32:10 +00003921static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3922 const SDLoc &DL,
3923 SelectionDAG &DAG, bool Unpacked) {
3924 if (!LoadVT.isVector())
3925 return Result;
3926
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003927 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3928 // Truncate to v2i16/v4i16.
3929 EVT IntLoadVT = LoadVT.changeTypeToInteger();
Matt Arsenault1349a042018-05-22 06:32:10 +00003930
3931 // Workaround legalizer not scalarizing truncate after vector op
3932 // legalization byt not creating intermediate vector trunc.
3933 SmallVector<SDValue, 4> Elts;
3934 DAG.ExtractVectorElements(Result, Elts);
3935 for (SDValue &Elt : Elts)
3936 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3937
3938 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3939
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003940 // Bitcast to original type (v2f16/v4f16).
Matt Arsenault1349a042018-05-22 06:32:10 +00003941 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003942 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003943
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003944 // Cast back to the original packed type.
3945 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3946}
3947
Matt Arsenault1349a042018-05-22 06:32:10 +00003948SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3949 MemSDNode *M,
3950 SelectionDAG &DAG,
Tim Renouf366a49d2018-08-02 23:33:01 +00003951 ArrayRef<SDValue> Ops,
Matt Arsenault1349a042018-05-22 06:32:10 +00003952 bool IsIntrinsic) const {
3953 SDLoc DL(M);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003954
3955 bool Unpacked = Subtarget->hasUnpackedD16VMem();
Matt Arsenault1349a042018-05-22 06:32:10 +00003956 EVT LoadVT = M->getValueType(0);
3957
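  // With unpacked D16 VMEM, each 16-bit element of the result occupies a full
  // 32-bit register, so perform the load with an i32 vector type and repack it
  // below.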
Matt Arsenault1349a042018-05-22 06:32:10 +00003958 EVT EquivLoadVT = LoadVT;
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003959 if (Unpacked && LoadVT.isVector()) {
3960 EquivLoadVT = LoadVT.isVector() ?
3961 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3962 LoadVT.getVectorNumElements()) : LoadVT;
Matt Arsenault1349a042018-05-22 06:32:10 +00003963 }
3964
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003965 // Change from v4f16/v2f16 to EquivLoadVT.
3966 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3967
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003968 SDValue Load
3969 = DAG.getMemIntrinsicNode(
3970 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3971 VTList, Ops, M->getMemoryVT(),
3972 M->getMemOperand());
3973 if (!Unpacked) // Just adjusted the opcode.
3974 return Load;
Changpeng Fang4737e892018-01-18 22:08:53 +00003975
Matt Arsenault1349a042018-05-22 06:32:10 +00003976 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
Changpeng Fang4737e892018-01-18 22:08:53 +00003977
Matt Arsenault1349a042018-05-22 06:32:10 +00003978 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003979}
3980
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003981static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3982 SDNode *N, SelectionDAG &DAG) {
3983 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003984 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003985 int CondCode = CD->getSExtValue();
3986 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3987 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3988 return DAG.getUNDEF(VT);
3989
3990 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3991
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003992 SDValue LHS = N->getOperand(1);
3993 SDValue RHS = N->getOperand(2);
3994
3995 SDLoc DL(N);
3996
3997 EVT CmpVT = LHS.getValueType();
3998 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3999 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
4000 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4001 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
4002 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
4003 }
4004
4005 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4006
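  // The comparison produces a wave-wide lane mask: build it in an integer type
  // as wide as the wavefront and zero-extend or truncate it to the requested
  // result type.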
Stanislav Mekhanoshin68a2fef2019-06-13 23:47:36 +00004007 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4008 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4009
4010 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
4011 DAG.getCondCode(CCOpcode));
4012 if (VT.bitsEq(CCVT))
4013 return SetCC;
4014 return DAG.getZExtOrTrunc(SetCC, DL, VT);
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00004015}
4016
4017static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
4018 SDNode *N, SelectionDAG &DAG) {
4019 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004020 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00004021
4022 int CondCode = CD->getSExtValue();
4023 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
4024 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
4025 return DAG.getUNDEF(VT);
4026 }
4027
4028 SDValue Src0 = N->getOperand(1);
4029 SDValue Src1 = N->getOperand(2);
4030 EVT CmpVT = Src0.getValueType();
4031 SDLoc SL(N);
4032
4033 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
4034 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4035 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4036 }
4037
4038 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4039 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
Stanislav Mekhanoshin68a2fef2019-06-13 23:47:36 +00004040 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4041 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4042 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
4043 Src1, DAG.getCondCode(CCOpcode));
4044 if (VT.bitsEq(CCVT))
4045 return SetCC;
4046 return DAG.getZExtOrTrunc(SetCC, SL, VT);
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00004047}
4048
Matt Arsenault3aef8092017-01-23 23:09:58 +00004049void SITargetLowering::ReplaceNodeResults(SDNode *N,
4050 SmallVectorImpl<SDValue> &Results,
4051 SelectionDAG &DAG) const {
4052 switch (N->getOpcode()) {
4053 case ISD::INSERT_VECTOR_ELT: {
4054 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
4055 Results.push_back(Res);
4056 return;
4057 }
4058 case ISD::EXTRACT_VECTOR_ELT: {
4059 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
4060 Results.push_back(Res);
4061 return;
4062 }
Matt Arsenault1f17c662017-02-22 00:27:34 +00004063 case ISD::INTRINSIC_WO_CHAIN: {
4064 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Marek Olsak13e47412018-01-31 20:18:04 +00004065 switch (IID) {
4066 case Intrinsic::amdgcn_cvt_pkrtz: {
Matt Arsenault1f17c662017-02-22 00:27:34 +00004067 SDValue Src0 = N->getOperand(1);
4068 SDValue Src1 = N->getOperand(2);
4069 SDLoc SL(N);
4070 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
4071 Src0, Src1);
Matt Arsenault1f17c662017-02-22 00:27:34 +00004072 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
4073 return;
4074 }
Marek Olsak13e47412018-01-31 20:18:04 +00004075 case Intrinsic::amdgcn_cvt_pknorm_i16:
4076 case Intrinsic::amdgcn_cvt_pknorm_u16:
4077 case Intrinsic::amdgcn_cvt_pk_i16:
4078 case Intrinsic::amdgcn_cvt_pk_u16: {
4079 SDValue Src0 = N->getOperand(1);
4080 SDValue Src1 = N->getOperand(2);
4081 SDLoc SL(N);
4082 unsigned Opcode;
4083
4084 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
4085 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
4086 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
4087 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
4088 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
4089 Opcode = AMDGPUISD::CVT_PK_I16_I32;
4090 else
4091 Opcode = AMDGPUISD::CVT_PK_U16_U32;
4092
Matt Arsenault709374d2018-08-01 20:13:58 +00004093 EVT VT = N->getValueType(0);
4094 if (isTypeLegal(VT))
4095 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
4096 else {
4097 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
4098 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
4099 }
Marek Olsak13e47412018-01-31 20:18:04 +00004100 return;
4101 }
4102 }
Simon Pilgrimd362d272017-07-08 19:50:03 +00004103 break;
Matt Arsenault1f17c662017-02-22 00:27:34 +00004104 }
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00004105 case ISD::INTRINSIC_W_CHAIN: {
Matt Arsenault1349a042018-05-22 06:32:10 +00004106 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00004107 Results.push_back(Res);
Matt Arsenault1349a042018-05-22 06:32:10 +00004108 Results.push_back(Res.getValue(1));
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00004109 return;
4110 }
Matt Arsenault1349a042018-05-22 06:32:10 +00004111
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00004112 break;
4113 }
Matt Arsenault4a486232017-04-19 20:53:07 +00004114 case ISD::SELECT: {
4115 SDLoc SL(N);
4116 EVT VT = N->getValueType(0);
4117 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4118 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
4119 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
4120
4121 EVT SelectVT = NewVT;
4122 if (NewVT.bitsLT(MVT::i32)) {
4123 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
4124 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
4125 SelectVT = MVT::i32;
4126 }
4127
4128 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
4129 N->getOperand(0), LHS, RHS);
4130
4131 if (NewVT != SelectVT)
4132 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
4133 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
4134 return;
4135 }
Matt Arsenaulte9524f12018-06-06 21:28:11 +00004136 case ISD::FNEG: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004137 if (N->getValueType(0) != MVT::v2f16)
4138 break;
4139
Matt Arsenaulte9524f12018-06-06 21:28:11 +00004140 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00004141 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
4142
4143 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
4144 BC,
4145 DAG.getConstant(0x80008000, SL, MVT::i32));
4146 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
4147 return;
4148 }
4149 case ISD::FABS: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004150 if (N->getValueType(0) != MVT::v2f16)
4151 break;
4152
Matt Arsenaulte9524f12018-06-06 21:28:11 +00004153 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00004154 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
4155
4156 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
4157 BC,
4158 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
4159 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
4160 return;
4161 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004162 default:
4163 break;
4164 }
4165}
4166
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00004167/// Helper function for LowerBRCOND
Tom Stellardf8794352012-12-19 22:10:31 +00004168static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00004169
Tom Stellardf8794352012-12-19 22:10:31 +00004170 SDNode *Parent = Value.getNode();
4171 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
4172 I != E; ++I) {
4173
4174 if (I.getUse().get() != Value)
4175 continue;
4176
4177 if (I->getOpcode() == Opcode)
4178 return *I;
4179 }
Craig Topper062a2ba2014-04-25 05:30:21 +00004180 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004181}
4182
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004183unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault6408c912016-09-16 22:11:18 +00004184 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
4185 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004186 case Intrinsic::amdgcn_if:
4187 return AMDGPUISD::IF;
4188 case Intrinsic::amdgcn_else:
4189 return AMDGPUISD::ELSE;
4190 case Intrinsic::amdgcn_loop:
4191 return AMDGPUISD::LOOP;
4192 case Intrinsic::amdgcn_end_cf:
4193 llvm_unreachable("should not occur");
Matt Arsenault6408c912016-09-16 22:11:18 +00004194 default:
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004195 return 0;
Matt Arsenault6408c912016-09-16 22:11:18 +00004196 }
Tom Stellardbc4497b2016-02-12 23:45:29 +00004197 }
Matt Arsenault6408c912016-09-16 22:11:18 +00004198
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004199 // break, if_break, else_break are all only used as inputs to loop, not
4200 // directly as branch conditions.
4201 return 0;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004202}
4203
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004204bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
4205 const Triple &TT = getTargetMachine().getTargetTriple();
Matt Arsenault0da63502018-08-31 05:49:54 +00004206 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4207 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004208 AMDGPU::shouldEmitConstantsToTextSection(TT);
4209}
4210
4211bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
Scott Linderd19d1972019-02-04 20:00:07 +00004212 // FIXME: Either avoid relying on address space here or change the default
4213 // address space for functions to avoid the explicit check.
4214 return (GV->getValueType()->isFunctionTy() ||
4215 GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004216 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4217 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004218 !shouldEmitFixup(GV) &&
4219 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4220}
4221
4222bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4223 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4224}
4225
Tom Stellardf8794352012-12-19 22:10:31 +00004226/// This transforms the control flow intrinsics to get the branch destination as
4227 /// the last parameter; it also switches the branch target with BR if the need arises.
4228SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4229 SelectionDAG &DAG) const {
Andrew Trickef9de2a2013-05-25 02:42:55 +00004230 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00004231
4232 SDNode *Intr = BRCOND.getOperand(1).getNode();
4233 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00004234 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004235 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004236
4237 if (Intr->getOpcode() == ISD::SETCC) {
4238 // As long as we negate the condition, everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00004239 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00004240 Intr = SetCC->getOperand(0).getNode();
4241
4242 } else {
4243 // Get the target from BR if we don't negate the condition
4244 BR = findUser(BRCOND, ISD::BR);
4245 Target = BR->getOperand(1);
4246 }
4247
Matt Arsenault6408c912016-09-16 22:11:18 +00004248 // FIXME: This changes the types of the intrinsics instead of introducing new
4249 // nodes with the correct types.
4250 // e.g. llvm.amdgcn.loop
4251
4252 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4253 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4254
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004255 unsigned CFNode = isCFIntrinsic(Intr);
4256 if (CFNode == 0) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00004257 // This is a uniform branch so we don't need to legalize.
4258 return BRCOND;
4259 }
4260
Matt Arsenault6408c912016-09-16 22:11:18 +00004261 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4262 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4263
Tom Stellardbc4497b2016-02-12 23:45:29 +00004264 assert(!SetCC ||
4265 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00004266 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4267 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00004268
Tom Stellardf8794352012-12-19 22:10:31 +00004269 // operands of the new intrinsic call
4270 SmallVector<SDValue, 4> Ops;
Matt Arsenault6408c912016-09-16 22:11:18 +00004271 if (HaveChain)
4272 Ops.push_back(BRCOND.getOperand(0));
4273
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004274 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00004275 Ops.push_back(Target);
4276
Matt Arsenault6408c912016-09-16 22:11:18 +00004277 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4278
Tom Stellardf8794352012-12-19 22:10:31 +00004279 // build the new intrinsic call
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004280 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004281
Matt Arsenault6408c912016-09-16 22:11:18 +00004282 if (!HaveChain) {
4283 SDValue Ops[] = {
4284 SDValue(Result, 0),
4285 BRCOND.getOperand(0)
4286 };
4287
4288 Result = DAG.getMergeValues(Ops, DL).getNode();
4289 }
4290
Tom Stellardf8794352012-12-19 22:10:31 +00004291 if (BR) {
4292 // Give the branch instruction our target
4293 SDValue Ops[] = {
4294 BR->getOperand(0),
4295 BRCOND.getOperand(2)
4296 };
Chandler Carruth356665a2014-08-01 22:09:43 +00004297 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4298 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4299 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004300 }
4301
4302 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4303
4304 // Copy the intrinsic results to registers
4305 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4306 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4307 if (!CopyToReg)
4308 continue;
4309
4310 Chain = DAG.getCopyToReg(
4311 Chain, DL,
4312 CopyToReg->getOperand(1),
4313 SDValue(Result, i - 1),
4314 SDValue());
4315
4316 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4317 }
4318
4319 // Remove the old intrinsic from the chain
4320 DAG.ReplaceAllUsesOfValueWith(
4321 SDValue(Intr, Intr->getNumValues() - 1),
4322 Intr->getOperand(0));
4323
4324 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00004325}
4326
Aakanksha Patild5443f82019-05-29 18:20:11 +00004327SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
4328 SelectionDAG &DAG) const {
4329 MVT VT = Op.getSimpleValueType();
4330 SDLoc DL(Op);
4331 // Checking the depth
4332 if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
4333 return DAG.getConstant(0, DL, VT);
4334
4335 MachineFunction &MF = DAG.getMachineFunction();
4336 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4337 // Check for kernel and shader functions
4338 if (Info->isEntryFunction())
4339 return DAG.getConstant(0, DL, VT);
4340
4341 MachineFrameInfo &MFI = MF.getFrameInfo();
4342 // There is a call to @llvm.returnaddress in this function
4343 MFI.setReturnAddressIsTaken(true);
4344
4345 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4346 // Get the return address reg and mark it as an implicit live-in
4347 unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
4348
4349 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
4350}
4351
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00004352SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4353 SDValue Op,
4354 const SDLoc &DL,
4355 EVT VT) const {
4356 return Op.getValueType().bitsLE(VT) ?
4357 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4358 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4359}
4360
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004361SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004362 assert(Op.getValueType() == MVT::f16 &&
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004363 "Do not know how to custom lower FP_ROUND for non-f16 type");
4364
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004365 SDValue Src = Op.getOperand(0);
4366 EVT SrcVT = Src.getValueType();
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004367 if (SrcVT != MVT::f64)
4368 return Op;
4369
4370 SDLoc DL(Op);
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004371
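  // Lower f64 -> f16 by materializing the f16 bits with FP_TO_FP16 and
  // reinterpreting the low 16 bits of the result.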
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004372 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4373 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
Mandeep Singh Grang5e1697e2017-06-06 05:08:36 +00004374 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004375}
4376
Matt Arsenault687ec752018-10-22 16:27:27 +00004377SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4378 SelectionDAG &DAG) const {
4379 EVT VT = Op.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00004380 const MachineFunction &MF = DAG.getMachineFunction();
4381 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4382 bool IsIEEEMode = Info->getMode().IEEE;
Matt Arsenault687ec752018-10-22 16:27:27 +00004383
4384 // FIXME: Assert during selection that this is only selected for
4385 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4386 // mode functions, but this happens to be OK since it's only done in cases
4387 // where it is known there are no sNaNs.
4388 if (IsIEEEMode)
4389 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4390
4391 if (VT == MVT::v4f16)
4392 return splitBinaryVectorOp(Op, DAG);
4393 return Op;
4394}
4395
Matt Arsenault3e025382017-04-24 17:49:13 +00004396SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4397 SDLoc SL(Op);
Matt Arsenault3e025382017-04-24 17:49:13 +00004398 SDValue Chain = Op.getOperand(0);
4399
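  // Without an enabled HSA trap handler, llvm.trap just ends the program.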
Tom Stellard5bfbae52018-07-11 20:59:01 +00004400 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004401 !Subtarget->isTrapHandlerEnabled())
Matt Arsenault3e025382017-04-24 17:49:13 +00004402 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
Tony Tye43259df2018-05-16 16:19:34 +00004403
4404 MachineFunction &MF = DAG.getMachineFunction();
4405 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4406 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4407 assert(UserSGPR != AMDGPU::NoRegister);
4408 SDValue QueuePtr = CreateLiveInRegister(
4409 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4410 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4411 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4412 QueuePtr, SDValue());
4413 SDValue Ops[] = {
4414 ToReg,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004415 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
Tony Tye43259df2018-05-16 16:19:34 +00004416 SGPR01,
4417 ToReg.getValue(1)
4418 };
4419 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4420}
4421
4422SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4423 SDLoc SL(Op);
4424 SDValue Chain = Op.getOperand(0);
4425 MachineFunction &MF = DAG.getMachineFunction();
4426
Tom Stellard5bfbae52018-07-11 20:59:01 +00004427 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004428 !Subtarget->isTrapHandlerEnabled()) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004429 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
Matt Arsenault3e025382017-04-24 17:49:13 +00004430 "debugtrap handler not supported",
4431 Op.getDebugLoc(),
4432 DS_Warning);
Matthias Braunf1caa282017-12-15 22:22:58 +00004433 LLVMContext &Ctx = MF.getFunction().getContext();
Matt Arsenault3e025382017-04-24 17:49:13 +00004434 Ctx.diagnose(NoTrap);
4435 return Chain;
4436 }
Matt Arsenault3e025382017-04-24 17:49:13 +00004437
Tony Tye43259df2018-05-16 16:19:34 +00004438 SDValue Ops[] = {
4439 Chain,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004440 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
Tony Tye43259df2018-05-16 16:19:34 +00004441 };
4442 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
Matt Arsenault3e025382017-04-24 17:49:13 +00004443}
4444
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004445SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
Matt Arsenault99c14522016-04-25 19:27:24 +00004446 SelectionDAG &DAG) const {
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004447 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4448 if (Subtarget->hasApertureRegs()) {
Matt Arsenault0da63502018-08-31 05:49:54 +00004449 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004450 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4451 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
Matt Arsenault0da63502018-08-31 05:49:54 +00004452 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004453 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4454 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4455 unsigned Encoding =
4456 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4457 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4458 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Matt Arsenaulte823d922017-02-18 18:29:53 +00004459
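    // Read the aperture base out of the MEM_BASES hardware register with
    // S_GETREG_B32 and shift the extracted field into its final position.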
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004460 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4461 SDValue ApertureReg = SDValue(
4462 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4463 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4464 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
Matt Arsenaulte823d922017-02-18 18:29:53 +00004465 }
4466
Matt Arsenault99c14522016-04-25 19:27:24 +00004467 MachineFunction &MF = DAG.getMachineFunction();
4468 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004469 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4470 assert(UserSGPR != AMDGPU::NoRegister);
4471
Matt Arsenault99c14522016-04-25 19:27:24 +00004472 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004473 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00004474
4475 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4476 // private_segment_aperture_base_hi.
Matt Arsenault0da63502018-08-31 05:49:54 +00004477 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
Matt Arsenault99c14522016-04-25 19:27:24 +00004478
Matt Arsenaultb655fa92017-11-29 01:25:12 +00004479 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
Matt Arsenault99c14522016-04-25 19:27:24 +00004480
4481 // TODO: Use custom target PseudoSourceValue.
4482 // TODO: We should use the value from the IR intrinsic call, but it might not
4483 // be available, and it is not clear how to get it here.
4484 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
Matt Arsenault0da63502018-08-31 05:49:54 +00004485 AMDGPUAS::CONSTANT_ADDRESS));
Matt Arsenault99c14522016-04-25 19:27:24 +00004486
4487 MachinePointerInfo PtrInfo(V, StructOffset);
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004488 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
Justin Lebar9c375812016-07-15 18:27:10 +00004489 MinAlign(64, StructOffset),
Justin Lebaradbf09e2016-09-11 01:38:58 +00004490 MachineMemOperand::MODereferenceable |
4491 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00004492}
4493
4494SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4495 SelectionDAG &DAG) const {
4496 SDLoc SL(Op);
4497 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4498
4499 SDValue Src = ASC->getOperand(0);
Matt Arsenault99c14522016-04-25 19:27:24 +00004500 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4501
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004502 const AMDGPUTargetMachine &TM =
4503 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4504
Matt Arsenault99c14522016-04-25 19:27:24 +00004505 // flat -> local/private
Matt Arsenault0da63502018-08-31 05:49:54 +00004506 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004507 unsigned DestAS = ASC->getDestAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004508
Matt Arsenault0da63502018-08-31 05:49:54 +00004509 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4510 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004511 unsigned NullVal = TM.getNullPointerValue(DestAS);
4512 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault99c14522016-04-25 19:27:24 +00004513 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4514 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4515
4516 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4517 NonNull, Ptr, SegmentNullPtr);
4518 }
4519 }
4520
4521 // local/private -> flat
Matt Arsenault0da63502018-08-31 05:49:54 +00004522 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004523 unsigned SrcAS = ASC->getSrcAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004524
Matt Arsenault0da63502018-08-31 05:49:54 +00004525 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4526 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004527 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4528 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault971c85e2017-03-13 19:47:31 +00004529
Matt Arsenault99c14522016-04-25 19:27:24 +00004530 SDValue NonNull
4531 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4532
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004533 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00004534 SDValue CvtPtr
4535 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4536
4537 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4538 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4539 FlatNullPtr);
4540 }
4541 }
4542
4543 // global <-> flat are no-ops and never emitted.
4544
4545 const MachineFunction &MF = DAG.getMachineFunction();
4546 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Matthias Braunf1caa282017-12-15 22:22:58 +00004547 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
Matt Arsenault99c14522016-04-25 19:27:24 +00004548 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4549
4550 return DAG.getUNDEF(ASC->getValueType(0));
4551}
4552
Matt Arsenault3aef8092017-01-23 23:09:58 +00004553SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4554 SelectionDAG &DAG) const {
Matt Arsenault67a98152018-05-16 11:47:30 +00004555 SDValue Vec = Op.getOperand(0);
4556 SDValue InsVal = Op.getOperand(1);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004557 SDValue Idx = Op.getOperand(2);
Matt Arsenault67a98152018-05-16 11:47:30 +00004558 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004559 EVT EltVT = VecVT.getVectorElementType();
4560 unsigned VecSize = VecVT.getSizeInBits();
4561 unsigned EltSize = EltVT.getSizeInBits();
Matt Arsenault67a98152018-05-16 11:47:30 +00004562
Matt Arsenault9224c002018-06-05 19:52:46 +00004563
4564 assert(VecSize <= 64);
Matt Arsenault67a98152018-05-16 11:47:30 +00004565
4566 unsigned NumElts = VecVT.getVectorNumElements();
4567 SDLoc SL(Op);
4568 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4569
Matt Arsenault9224c002018-06-05 19:52:46 +00004570 if (NumElts == 4 && EltSize == 16 && KIdx) {
Matt Arsenault67a98152018-05-16 11:47:30 +00004571 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4572
4573 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4574 DAG.getConstant(0, SL, MVT::i32));
4575 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4576 DAG.getConstant(1, SL, MVT::i32));
4577
4578 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4579 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4580
4581 unsigned Idx = KIdx->getZExtValue();
4582 bool InsertLo = Idx < 2;
4583 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4584 InsertLo ? LoVec : HiVec,
4585 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4586 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4587
4588 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4589
4590 SDValue Concat = InsertLo ?
4591 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4592 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4593
4594 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4595 }
4596
Matt Arsenault3aef8092017-01-23 23:09:58 +00004597 if (isa<ConstantSDNode>(Idx))
4598 return SDValue();
4599
Matt Arsenault9224c002018-06-05 19:52:46 +00004600 MVT IntVT = MVT::getIntegerVT(VecSize);
Matt Arsenault67a98152018-05-16 11:47:30 +00004601
Matt Arsenault3aef8092017-01-23 23:09:58 +00004602 // Avoid stack access for dynamic indexing.
Matt Arsenault3aef8092017-01-23 23:09:58 +00004603 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
Tim Corringhamfa3e4e52019-02-01 16:51:09 +00004604
4605 // Create a congruent vector with the target value in each element so that
4606 // the required element can be masked and ORed into the target vector.
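  // Illustrative sketch, assuming 16-bit elements (as the 0xffff mask width
  // used below implies): for a dynamic index Idx this builds
  //   Mask   = 0xffff << (Idx * 16)
  //   Result = (Splat(InsVal) & Mask) | (Vec & ~Mask)
  // which is the v_bfm_b32/v_bfi_b32 sequence sketched above.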
4607 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4608 DAG.getSplatBuildVector(VecVT, SL, InsVal));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004609
Matt Arsenault9224c002018-06-05 19:52:46 +00004610 assert(isPowerOf2_32(EltSize));
4611 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4612
Matt Arsenault3aef8092017-01-23 23:09:58 +00004613 // Convert vector index to bit-index.
Matt Arsenault9224c002018-06-05 19:52:46 +00004614 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004615
Matt Arsenault67a98152018-05-16 11:47:30 +00004616 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4617 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4618 DAG.getConstant(0xffff, SL, IntVT),
Matt Arsenault3aef8092017-01-23 23:09:58 +00004619 ScaledIdx);
4620
Matt Arsenault67a98152018-05-16 11:47:30 +00004621 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4622 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4623 DAG.getNOT(SL, BFM, IntVT), BCVec);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004624
Matt Arsenault67a98152018-05-16 11:47:30 +00004625 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4626 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004627}
4628
4629SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4630 SelectionDAG &DAG) const {
4631 SDLoc SL(Op);
4632
4633 EVT ResultVT = Op.getValueType();
4634 SDValue Vec = Op.getOperand(0);
4635 SDValue Idx = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004636 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004637 unsigned VecSize = VecVT.getSizeInBits();
4638 EVT EltVT = VecVT.getVectorElementType();
4639 assert(VecSize <= 64);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004640
Matt Arsenault98f29462017-05-17 20:30:58 +00004641 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4642
Hiroshi Inoue372ffa12018-04-13 11:37:06 +00004643 // Make sure we do any optimizations that will make it easier to fold
Matt Arsenault98f29462017-05-17 20:30:58 +00004644 // source modifiers before the value is obscured with bit operations.
4645
4646 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4647 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4648 return Combined;
4649
Matt Arsenault9224c002018-06-05 19:52:46 +00004650 unsigned EltSize = EltVT.getSizeInBits();
4651 assert(isPowerOf2_32(EltSize));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004652
Matt Arsenault9224c002018-06-05 19:52:46 +00004653 MVT IntVT = MVT::getIntegerVT(VecSize);
4654 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4655
4656 // Convert vector index to bit-index (* EltSize)
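  // For example, extracting element Idx from a v4i16 becomes: bitcast the
  // vector to i64, shift right by Idx * 16, then truncate to the element type.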
4657 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004658
Matt Arsenault67a98152018-05-16 11:47:30 +00004659 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4660 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004661
Matt Arsenault67a98152018-05-16 11:47:30 +00004662 if (ResultVT == MVT::f16) {
4663 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4664 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4665 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004666
Matt Arsenault67a98152018-05-16 11:47:30 +00004667 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4668}
4669
4670SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4671 SelectionDAG &DAG) const {
4672 SDLoc SL(Op);
4673 EVT VT = Op.getValueType();
Matt Arsenault67a98152018-05-16 11:47:30 +00004674
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004675 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4676 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4677
4678 // Turn into pair of packed build_vectors.
4679 // TODO: Special case for constants that can be materialized with s_mov_b64.
4680 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4681 { Op.getOperand(0), Op.getOperand(1) });
4682 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4683 { Op.getOperand(2), Op.getOperand(3) });
4684
4685 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4686 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4687
4688 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4689 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4690 }
4691
Matt Arsenault1349a042018-05-22 06:32:10 +00004692 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004693 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
Matt Arsenault67a98152018-05-16 11:47:30 +00004694
Matt Arsenault1349a042018-05-22 06:32:10 +00004695 SDValue Lo = Op.getOperand(0);
4696 SDValue Hi = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004697
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004698 // If the high half is undef, use an any_extend to avoid adding defined bits.
4699 if (Hi.isUndef()) {
4700 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4701 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4702 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4703 }
Matt Arsenault67a98152018-05-16 11:47:30 +00004704
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004705 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004706 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4707
4708 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4709 DAG.getConstant(16, SL, MVT::i32));
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004710 if (Lo.isUndef())
4711 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4712
4713 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4714 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Matt Arsenault1349a042018-05-22 06:32:10 +00004715
4716 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004717 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004718}
4719
Tom Stellard418beb72016-07-13 14:23:33 +00004720bool
4721SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4722 // We can fold offsets for anything that doesn't require a GOT relocation.
Matt Arsenault0da63502018-08-31 05:49:54 +00004723 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4724 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4725 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004726 !shouldEmitGOTReloc(GA->getGlobal());
Tom Stellard418beb72016-07-13 14:23:33 +00004727}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004728
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004729static SDValue
4730buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4731 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4732 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004733 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4734 // lowered to the following code sequence:
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004735 //
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004736 // For constant address space:
4737 // s_getpc_b64 s[0:1]
4738 // s_add_u32 s0, s0, $symbol
4739 // s_addc_u32 s1, s1, 0
4740 //
4741 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4742 // a fixup or relocation is emitted to replace $symbol with a literal
4743 // constant, which is a pc-relative offset from the encoding of the $symbol
4744 // operand to the global variable.
4745 //
4746 // For global address space:
4747 // s_getpc_b64 s[0:1]
4748 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4749 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4750 //
4751 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4752 // fixups or relocations are emitted to replace $symbol@*@lo and
4753 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4754 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4755 // operand to the global variable.
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004756 //
4757 // What we want here is an offset from the value returned by s_getpc
4758 // (which is the address of the s_add_u32 instruction) to the global
4759 // variable, but since the encoding of $symbol starts 4 bytes after the start
4760 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4761 // small. This requires us to add 4 to the global variable offset in order to
4762 // compute the correct address.
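  // To make the arithmetic concrete: if the s_add_u32 is at address P, the
  // $symbol literal is encoded at P + 4, so the pc-relative fixup there
  // resolves to (GV + Offset + 4) - (P + 4) = GV + Offset - P. Adding that to
  // the s_getpc_b64 result (P) produces GV + Offset, as intended.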
Nicolai Haehnle6d71be42019-06-16 17:32:01 +00004763 unsigned LoFlags = GAFlags;
4764 if (LoFlags == SIInstrInfo::MO_NONE)
4765 LoFlags = SIInstrInfo::MO_REL32;
4766 SDValue PtrLo =
4767 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
4768 SDValue PtrHi;
4769 if (GAFlags == SIInstrInfo::MO_NONE) {
4770 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
4771 } else {
4772 PtrHi =
4773 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
4774 }
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004775 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004776}
4777
Tom Stellard418beb72016-07-13 14:23:33 +00004778SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4779 SDValue Op,
4780 SelectionDAG &DAG) const {
4781 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00004782 const GlobalValue *GV = GSD->getGlobal();
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004783 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4784 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4785 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
Tom Stellard418beb72016-07-13 14:23:33 +00004786 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4787
4788 SDLoc DL(GSD);
Tom Stellard418beb72016-07-13 14:23:33 +00004789 EVT PtrVT = Op.getValueType();
4790
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004791 // FIXME: Should not make address space based decisions here.
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004792 if (shouldEmitFixup(GV))
Tom Stellard418beb72016-07-13 14:23:33 +00004793 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004794 else if (shouldEmitPCReloc(GV))
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004795 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4796 SIInstrInfo::MO_REL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004797
4798 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004799 SIInstrInfo::MO_GOTPCREL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004800
4801 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00004802 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Tom Stellard418beb72016-07-13 14:23:33 +00004803 const DataLayout &DataLayout = DAG.getDataLayout();
4804 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
Matt Arsenaultd77fcc22018-09-10 02:23:39 +00004805 MachinePointerInfo PtrInfo
4806 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
Tom Stellard418beb72016-07-13 14:23:33 +00004807
Justin Lebar9c375812016-07-15 18:27:10 +00004808 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
Justin Lebaradbf09e2016-09-11 01:38:58 +00004809 MachineMemOperand::MODereferenceable |
4810 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00004811}
4812
Benjamin Kramerbdc49562016-06-12 15:39:02 +00004813SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4814 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004815 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4816 // the destination register.
4817 //
Tom Stellardfc92e772015-05-12 14:18:14 +00004818 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4819 // so we will end up with redundant moves to m0.
4820 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004821 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4822
4823 // A Null SDValue creates a glue result.
4824 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4825 V, Chain);
4826 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00004827}
4828
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004829SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4830 SDValue Op,
4831 MVT VT,
4832 unsigned Offset) const {
4833 SDLoc SL(Op);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00004834 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00004835 DAG.getEntryNode(), Offset, 4, false);
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004836 // The local size values will have the hi 16-bits as zero.
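  // Loading the full dword and wrapping it in AssertZext lets later combines
  // treat the value as already zero-extended to the requested width.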
4837 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4838 DAG.getValueType(VT));
4839}
4840
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004841static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4842 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004843 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004844 "non-hsa intrinsic with hsa target",
4845 DL.getDebugLoc());
4846 DAG.getContext()->diagnose(BadIntrin);
4847 return DAG.getUNDEF(VT);
4848}
4849
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004850static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4851 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004852 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004853 "intrinsic not supported on subtarget",
4854 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00004855 DAG.getContext()->diagnose(BadIntrin);
4856 return DAG.getUNDEF(VT);
4857}
4858
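// Build a single f32 value or an f32 vector of the next supported width
// (2, 4, 8 or 16 elements) from the given dwords, bitcasting each element to
// f32 and padding the tail with undef.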
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004859static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4860 ArrayRef<SDValue> Elts) {
4861 assert(!Elts.empty());
4862 MVT Type;
4863 unsigned NumElts;
4864
4865 if (Elts.size() == 1) {
4866 Type = MVT::f32;
4867 NumElts = 1;
4868 } else if (Elts.size() == 2) {
4869 Type = MVT::v2f32;
4870 NumElts = 2;
4871 } else if (Elts.size() <= 4) {
4872 Type = MVT::v4f32;
4873 NumElts = 4;
4874 } else if (Elts.size() <= 8) {
4875 Type = MVT::v8f32;
4876 NumElts = 8;
4877 } else {
4878 assert(Elts.size() <= 16);
4879 Type = MVT::v16f32;
4880 NumElts = 16;
4881 }
4882
4883 SmallVector<SDValue, 16> VecElts(NumElts);
4884 for (unsigned i = 0; i < Elts.size(); ++i) {
4885 SDValue Elt = Elts[i];
4886 if (Elt.getValueType() != MVT::f32)
4887 Elt = DAG.getBitcast(MVT::f32, Elt);
4888 VecElts[i] = Elt;
4889 }
4890 for (unsigned i = Elts.size(); i < NumElts; ++i)
4891 VecElts[i] = DAG.getUNDEF(MVT::f32);
4892
4893 if (NumElts == 1)
4894 return VecElts[0];
4895 return DAG.getBuildVector(Type, DL, VecElts);
4896}
4897
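// Extract the GLC (bit 0), SLC (bit 1) and optional DLC (bit 2) fields of a
// cachepolicy immediate as target constants. Returns false if any bits other
// than the requested ones are set.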
4898static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004899 SDValue *GLC, SDValue *SLC, SDValue *DLC) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004900 auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004901
4902 uint64_t Value = CachePolicyConst->getZExtValue();
4903 SDLoc DL(CachePolicy);
4904 if (GLC) {
4905 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4906 Value &= ~(uint64_t)0x1;
4907 }
4908 if (SLC) {
4909 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4910 Value &= ~(uint64_t)0x2;
4911 }
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004912 if (DLC) {
4913 *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
4914 Value &= ~(uint64_t)0x4;
4915 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004916
4917 return Value == 0;
4918}
4919
David Stuttardf77079f2019-01-14 11:55:24 +00004920// Reconstruct the required return value for an image load intrinsic.
 4921// This is more complicated due to the optional use of TexFailCtrl, which means
 4922// the required return type is an aggregate.
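// The merged result is the data value, followed by the TexFail status dword
// when requested, followed by the chain from the original node.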
4923static SDValue constructRetValue(SelectionDAG &DAG,
4924 MachineSDNode *Result,
4925 ArrayRef<EVT> ResultTypes,
4926 bool IsTexFail, bool Unpacked, bool IsD16,
4927 int DMaskPop, int NumVDataDwords,
4928 const SDLoc &DL, LLVMContext &Context) {
 4929  // Determine the required return type. This is the same regardless of the IsTexFail flag
4930 EVT ReqRetVT = ResultTypes[0];
4931 EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4932 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4933 EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4934 EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4935 : AdjEltVT
4936 : ReqRetVT;
4937
4938 // Extract data part of the result
4939 // Bitcast the result to the same type as the required return type
4940 int NumElts;
4941 if (IsD16 && !Unpacked)
4942 NumElts = NumVDataDwords << 1;
4943 else
4944 NumElts = NumVDataDwords;
4945
4946 EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4947 : AdjEltVT;
4948
Tim Renouf6f0191a2019-03-22 15:21:11 +00004949 // Special case for v6f16. Rather than add support for this, use v3i32 to
David Stuttardf77079f2019-01-14 11:55:24 +00004950 // extract the data elements
Tim Renouf6f0191a2019-03-22 15:21:11 +00004951 bool V6F16Special = false;
4952 if (NumElts == 6) {
4953 CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
David Stuttardf77079f2019-01-14 11:55:24 +00004954 DMaskPop >>= 1;
4955 ReqRetNumElts >>= 1;
Tim Renouf6f0191a2019-03-22 15:21:11 +00004956 V6F16Special = true;
David Stuttardf77079f2019-01-14 11:55:24 +00004957 AdjVT = MVT::v2i32;
4958 }
4959
4960 SDValue N = SDValue(Result, 0);
4961 SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4962
4963 // Iterate over the result
4964 SmallVector<SDValue, 4> BVElts;
4965
4966 if (CastVT.isVector()) {
4967 DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4968 } else {
4969 BVElts.push_back(CastRes);
4970 }
4971 int ExtraElts = ReqRetNumElts - DMaskPop;
 4972  while (ExtraElts--)
4973 BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4974
4975 SDValue PreTFCRes;
4976 if (ReqRetNumElts > 1) {
4977 SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4978 if (IsD16 && Unpacked)
4979 PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4980 else
4981 PreTFCRes = NewVec;
4982 } else {
4983 PreTFCRes = BVElts[0];
4984 }
4985
Tim Renouf6f0191a2019-03-22 15:21:11 +00004986 if (V6F16Special)
David Stuttardf77079f2019-01-14 11:55:24 +00004987 PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4988
4989 if (!IsTexFail) {
4990 if (Result->getNumValues() > 1)
4991 return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4992 else
4993 return PreTFCRes;
4994 }
4995
4996 // Extract the TexFail result and insert into aggregate return
4997 SmallVector<SDValue, 1> TFCElt;
4998 DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4999 SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
5000 return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
5001}
5002
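// Decode a texfailctrl immediate: bit 0 requests TFE and bit 1 requests LWE,
// each returned as a target constant. IsTexFail is set for any nonzero value;
// a false return means bits other than TFE/LWE were present.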
5003static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
5004 SDValue *LWE, bool &IsTexFail) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005005 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
David Stuttardf77079f2019-01-14 11:55:24 +00005006
5007 uint64_t Value = TexFailCtrlConst->getZExtValue();
5008 if (Value) {
5009 IsTexFail = true;
5010 }
5011
5012 SDLoc DL(TexFailCtrlConst);
5013 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
5014 Value &= ~(uint64_t)0x1;
5015 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
5016 Value &= ~(uint64_t)0x2;
5017
5018 return Value == 0;
5019}
5020
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005021SDValue SITargetLowering::lowerImage(SDValue Op,
5022 const AMDGPU::ImageDimIntrinsicInfo *Intr,
5023 SelectionDAG &DAG) const {
5024 SDLoc DL(Op);
Ryan Taylor1f334d02018-08-28 15:07:30 +00005025 MachineFunction &MF = DAG.getMachineFunction();
5026 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005027 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5028 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
5029 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00005030 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
5031 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
Piotr Sobczak9b11e932019-06-10 15:58:51 +00005032 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
5033 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00005034 unsigned IntrOpcode = Intr->BaseOpcode;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005035 bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005036
David Stuttardf77079f2019-01-14 11:55:24 +00005037 SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
5038 SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005039 bool IsD16 = false;
Ryan Taylor1f334d02018-08-28 15:07:30 +00005040 bool IsA16 = false;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005041 SDValue VData;
5042 int NumVDataDwords;
David Stuttardf77079f2019-01-14 11:55:24 +00005043 bool AdjustRetType = false;
5044
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005045 unsigned AddrIdx; // Index of first address argument
5046 unsigned DMask;
David Stuttardf77079f2019-01-14 11:55:24 +00005047 unsigned DMaskLanes = 0;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005048
5049 if (BaseOpcode->Atomic) {
5050 VData = Op.getOperand(2);
5051
5052 bool Is64Bit = VData.getValueType() == MVT::i64;
5053 if (BaseOpcode->AtomicX2) {
5054 SDValue VData2 = Op.getOperand(3);
5055 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
5056 {VData, VData2});
5057 if (Is64Bit)
5058 VData = DAG.getBitcast(MVT::v4i32, VData);
5059
5060 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
5061 DMask = Is64Bit ? 0xf : 0x3;
5062 NumVDataDwords = Is64Bit ? 4 : 2;
5063 AddrIdx = 4;
5064 } else {
5065 DMask = Is64Bit ? 0x3 : 0x1;
5066 NumVDataDwords = Is64Bit ? 2 : 1;
5067 AddrIdx = 3;
5068 }
5069 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00005070 unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005071 auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
David Stuttardf77079f2019-01-14 11:55:24 +00005072 DMask = DMaskConst->getZExtValue();
5073 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005074
5075 if (BaseOpcode->Store) {
5076 VData = Op.getOperand(2);
5077
5078 MVT StoreVT = VData.getSimpleValueType();
5079 if (StoreVT.getScalarType() == MVT::f16) {
Matt Arsenaulte4c2e9b2019-06-19 23:54:58 +00005080 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005081 return Op; // D16 is unsupported for this instruction
5082
5083 IsD16 = true;
5084 VData = handleD16VData(VData, DAG);
5085 }
5086
5087 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005088 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00005089 // Work out the number of dwords based on the dmask popcount, the underlying
 5090 // type, and whether packing is supported.
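      // For example, a dmask of 0b1011 selects three lanes (gather4 always
      // uses four); with packed D16 that needs (3 + 1) / 2 = 2 result dwords,
      // with unpacked D16 or 32-bit data it needs 3.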
5091 MVT LoadVT = ResultTypes[0].getSimpleVT();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005092 if (LoadVT.getScalarType() == MVT::f16) {
Matt Arsenaulte4c2e9b2019-06-19 23:54:58 +00005093 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005094 return Op; // D16 is unsupported for this instruction
5095
5096 IsD16 = true;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005097 }
5098
David Stuttardf77079f2019-01-14 11:55:24 +00005099 // Confirm that the return type is large enough for the dmask specified
5100 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
5101 (!LoadVT.isVector() && DMaskLanes > 1))
5102 return Op;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005103
David Stuttardf77079f2019-01-14 11:55:24 +00005104 if (IsD16 && !Subtarget->hasUnpackedD16VMem())
5105 NumVDataDwords = (DMaskLanes + 1) / 2;
5106 else
5107 NumVDataDwords = DMaskLanes;
5108
5109 AdjustRetType = true;
5110 }
David Stuttardc6603862018-11-29 20:14:17 +00005111
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005112 AddrIdx = DMaskIdx + 1;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005113 }
5114
Ryan Taylor1f334d02018-08-28 15:07:30 +00005115 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
5116 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
5117 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
5118 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
5119 NumCoords + NumLCM;
5120 unsigned NumMIVAddrs = NumVAddrs;
5121
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005122 SmallVector<SDValue, 4> VAddrs;
Ryan Taylor894c8fd2018-08-01 12:12:01 +00005123
 5124  // Optimize _L to _LZ when the constant 'lod' argument is zero or negative
5125 if (LZMappingInfo) {
5126 if (auto ConstantLod =
Ryan Taylor1f334d02018-08-28 15:07:30 +00005127 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
Ryan Taylor894c8fd2018-08-01 12:12:01 +00005128 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
5129 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
Ryan Taylor1f334d02018-08-28 15:07:30 +00005130 NumMIVAddrs--; // remove 'lod'
Ryan Taylor894c8fd2018-08-01 12:12:01 +00005131 }
5132 }
5133 }
5134
Piotr Sobczak9b11e932019-06-10 15:58:51 +00005135 // Optimize _mip away when 'lod' is zero
5136 if (MIPMappingInfo) {
5137 if (auto ConstantLod =
5138 dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
5139 if (ConstantLod->isNullValue()) {
5140 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
5141 NumMIVAddrs--; // remove 'lod'
5142 }
5143 }
5144 }
5145
Ryan Taylor1f334d02018-08-28 15:07:30 +00005146 // Check for 16-bit addresses and pack them if so.
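  // When the addresses are 16-bit and the target supports A16, each pair of
  // consecutive coordinate/derivative operands is packed into a single 32-bit
  // register below; the extra (non-coordinate) arguments are passed through
  // unchanged.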
5147 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
5148 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
Neil Henning63718b22018-10-31 10:34:48 +00005149 const MVT VAddrScalarVT = VAddrVT.getScalarType();
5150 if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
Ryan Taylor1f334d02018-08-28 15:07:30 +00005151 ST->hasFeature(AMDGPU::FeatureR128A16)) {
5152 IsA16 = true;
Neil Henning63718b22018-10-31 10:34:48 +00005153 const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
Ryan Taylor1f334d02018-08-28 15:07:30 +00005154 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
5155 SDValue AddrLo, AddrHi;
5156 // Push back extra arguments.
5157 if (i < DimIdx) {
5158 AddrLo = Op.getOperand(i);
5159 } else {
5160 AddrLo = Op.getOperand(i);
5161 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
5162 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
5163 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
Matt Arsenault0da63502018-08-31 05:49:54 +00005164 ((NumGradients / 2) % 2 == 1 &&
5165 (i == DimIdx + (NumGradients / 2) - 1 ||
Ryan Taylor1f334d02018-08-28 15:07:30 +00005166 i == DimIdx + NumGradients - 1))) {
5167 AddrHi = DAG.getUNDEF(MVT::f16);
5168 } else {
5169 AddrHi = Op.getOperand(i + 1);
5170 i++;
5171 }
Neil Henning63718b22018-10-31 10:34:48 +00005172 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
Ryan Taylor1f334d02018-08-28 15:07:30 +00005173 {AddrLo, AddrHi});
5174 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
5175 }
5176 VAddrs.push_back(AddrLo);
5177 }
5178 } else {
5179 for (unsigned i = 0; i < NumMIVAddrs; ++i)
5180 VAddrs.push_back(Op.getOperand(AddrIdx + i));
5181 }
5182
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005183 // If the register allocator cannot place the address registers contiguously
5184 // without introducing moves, then using the non-sequential address encoding
5185 // is always preferable, since it saves VALU instructions and is usually a
5186 // wash in terms of code size or even better.
5187 //
5188 // However, we currently have no way of hinting to the register allocator that
5189 // MIMG addresses should be placed contiguously when it is possible to do so,
5190 // so force non-NSA for the common 2-address case as a heuristic.
5191 //
5192 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5193 // allocation when possible.
5194 bool UseNSA =
5195 ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
5196 SDValue VAddr;
5197 if (!UseNSA)
5198 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005199
5200 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
5201 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
5202 unsigned CtrlIdx; // Index of texfailctrl argument
5203 SDValue Unorm;
5204 if (!BaseOpcode->Sampler) {
5205 Unorm = True;
5206 CtrlIdx = AddrIdx + NumVAddrs + 1;
5207 } else {
5208 auto UnormConst =
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005209 cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005210
5211 Unorm = UnormConst->getZExtValue() ? True : False;
5212 CtrlIdx = AddrIdx + NumVAddrs + 3;
5213 }
5214
David Stuttardf77079f2019-01-14 11:55:24 +00005215 SDValue TFE;
5216 SDValue LWE;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005217 SDValue TexFail = Op.getOperand(CtrlIdx);
David Stuttardf77079f2019-01-14 11:55:24 +00005218 bool IsTexFail = false;
5219 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005220 return Op;
5221
David Stuttardf77079f2019-01-14 11:55:24 +00005222 if (IsTexFail) {
5223 if (!DMaskLanes) {
 5224      // Expecting to get an error flag since TFC is on and dmask is 0.
 5225      // Force dmask to be at least 1; otherwise the instruction will fail.
5226 DMask = 0x1;
5227 DMaskLanes = 1;
5228 NumVDataDwords = 1;
5229 }
5230 NumVDataDwords += 1;
5231 AdjustRetType = true;
5232 }
5233
 5234  // Something earlier has tagged the return type as needing adjustment.
 5235  // This happens if the instruction is a load or has TexFailCtrl flags set.
5236 if (AdjustRetType) {
5237 // NumVDataDwords reflects the true number of dwords required in the return type
5238 if (DMaskLanes == 0 && !BaseOpcode->Store) {
5239 // This is a no-op load. This can be eliminated
5240 SDValue Undef = DAG.getUNDEF(Op.getValueType());
5241 if (isa<MemSDNode>(Op))
5242 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
5243 return Undef;
5244 }
5245
David Stuttardf77079f2019-01-14 11:55:24 +00005246 EVT NewVT = NumVDataDwords > 1 ?
5247 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
5248 : MVT::f32;
5249
5250 ResultTypes[0] = NewVT;
5251 if (ResultTypes.size() == 3) {
 5252      // The original result was an aggregate type used for the TexFailCtrl result.
 5253      // The actual instruction returns as a vector type, which has now been
 5254      // created. Remove the aggregate result.
5255 ResultTypes.erase(&ResultTypes[1]);
5256 }
5257 }
5258
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005259 SDValue GLC;
5260 SDValue SLC;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005261 SDValue DLC;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005262 if (BaseOpcode->Atomic) {
5263 GLC = True; // TODO no-return optimization
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005264 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
5265 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005266 return Op;
5267 } else {
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005268 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
5269 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005270 return Op;
5271 }
5272
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005273 SmallVector<SDValue, 26> Ops;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005274 if (BaseOpcode->Store || BaseOpcode->Atomic)
5275 Ops.push_back(VData); // vdata
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005276 if (UseNSA) {
5277 for (const SDValue &Addr : VAddrs)
5278 Ops.push_back(Addr);
5279 } else {
5280 Ops.push_back(VAddr);
5281 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005282 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
5283 if (BaseOpcode->Sampler)
5284 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
5285 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005286 if (IsGFX10)
5287 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005288 Ops.push_back(Unorm);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005289 if (IsGFX10)
5290 Ops.push_back(DLC);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005291 Ops.push_back(GLC);
5292 Ops.push_back(SLC);
Ryan Taylor1f334d02018-08-28 15:07:30 +00005293 Ops.push_back(IsA16 && // a16 or r128
5294 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
David Stuttardf77079f2019-01-14 11:55:24 +00005295 Ops.push_back(TFE); // tfe
5296 Ops.push_back(LWE); // lwe
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005297 if (!IsGFX10)
5298 Ops.push_back(DimInfo->DA ? True : False);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005299 if (BaseOpcode->HasD16)
5300 Ops.push_back(IsD16 ? True : False);
5301 if (isa<MemSDNode>(Op))
5302 Ops.push_back(Op.getOperand(0)); // chain
5303
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005304 int NumVAddrDwords =
5305 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005306 int Opcode = -1;
5307
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005308 if (IsGFX10) {
5309 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
5310 UseNSA ? AMDGPU::MIMGEncGfx10NSA
5311 : AMDGPU::MIMGEncGfx10Default,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005312 NumVDataDwords, NumVAddrDwords);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005313 } else {
5314 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5315 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
5316 NumVDataDwords, NumVAddrDwords);
5317 if (Opcode == -1)
5318 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
5319 NumVDataDwords, NumVAddrDwords);
5320 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005321 assert(Opcode != -1);
5322
5323 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
5324 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
Chandler Carruth66654b72018-08-14 23:30:32 +00005325 MachineMemOperand *MemRef = MemOp->getMemOperand();
5326 DAG.setNodeMemRefs(NewNode, {MemRef});
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005327 }
5328
5329 if (BaseOpcode->AtomicX2) {
5330 SmallVector<SDValue, 1> Elt;
5331 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5332 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
David Stuttardf77079f2019-01-14 11:55:24 +00005333 } else if (!BaseOpcode->Store) {
5334 return constructRetValue(DAG, NewNode,
5335 OrigResultTypes, IsTexFail,
5336 Subtarget->hasUnpackedD16VMem(), IsD16,
5337 DMaskLanes, NumVDataDwords, DL,
5338 *DAG.getContext());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005339 }
5340
5341 return SDValue(NewNode, 0);
5342}
5343
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005344SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
Nicolai Haehnle490e83c2019-06-16 17:14:12 +00005345 SDValue Offset, SDValue GLC, SDValue DLC,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005346 SelectionDAG &DAG) const {
5347 MachineFunction &MF = DAG.getMachineFunction();
5348 MachineMemOperand *MMO = MF.getMachineMemOperand(
5349 MachinePointerInfo(),
5350 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5351 MachineMemOperand::MOInvariant,
5352 VT.getStoreSize(), VT.getStoreSize());
5353
5354 if (!Offset->isDivergent()) {
5355 SDValue Ops[] = {
5356 Rsrc,
5357 Offset, // Offset
Nicolai Haehnle490e83c2019-06-16 17:14:12 +00005358 GLC,
5359 DLC,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005360 };
5361 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5362 DAG.getVTList(VT), Ops, VT, MMO);
5363 }
5364
5365 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5366 // assume that the buffer is unswizzled.
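  // For wide results this is split into several v4i32 loads at 16-byte
  // offset increments (two for v8i32, four for v16i32) and re-concatenated at
  // the end.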
5367 SmallVector<SDValue, 4> Loads;
5368 unsigned NumLoads = 1;
5369 MVT LoadVT = VT.getSimpleVT();
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005370 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
Simon Pilgrim44dfd812018-12-07 21:44:25 +00005371 assert((LoadVT.getScalarType() == MVT::i32 ||
5372 LoadVT.getScalarType() == MVT::f32) &&
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005373 isPowerOf2_32(NumElts));
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005374
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005375 if (NumElts == 8 || NumElts == 16) {
5376 NumLoads = NumElts == 16 ? 4 : 2;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005377 LoadVT = MVT::v4i32;
5378 }
5379
5380 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5381 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5382 SDValue Ops[] = {
5383 DAG.getEntryNode(), // Chain
5384 Rsrc, // rsrc
5385 DAG.getConstant(0, DL, MVT::i32), // vindex
5386 {}, // voffset
5387 {}, // soffset
5388 {}, // offset
5389 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5390 DAG.getConstant(0, DL, MVT::i1), // idxen
5391 };
5392
5393 // Use the alignment to ensure that the required offsets will fit into the
5394 // immediate offsets.
5395 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5396
5397 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5398 for (unsigned i = 0; i < NumLoads; ++i) {
5399 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5400 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5401 Ops, LoadVT, MMO));
5402 }
5403
5404 if (VT == MVT::v8i32 || VT == MVT::v16i32)
5405 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5406
5407 return Loads[0];
5408}
5409
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005410SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5411 SelectionDAG &DAG) const {
5412 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00005413 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005414
5415 EVT VT = Op.getValueType();
5416 SDLoc DL(Op);
5417 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5418
Sanjay Patela2607012015-09-16 16:31:21 +00005419 // TODO: Should this propagate fast-math-flags?
5420
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005421 switch (IntrinsicID) {
Tom Stellard2f3f9852017-01-25 01:25:13 +00005422 case Intrinsic::amdgcn_implicit_buffer_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005423 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
Matt Arsenault10fc0622017-06-26 03:01:31 +00005424 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005425 return getPreloadedValue(DAG, *MFI, VT,
5426 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
Tom Stellard2f3f9852017-01-25 01:25:13 +00005427 }
Tom Stellard48f29f22015-11-26 00:43:29 +00005428 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00005429 case Intrinsic::amdgcn_queue_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005430 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005431 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005432 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005433 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00005434 DAG.getContext()->diagnose(BadIntrin);
5435 return DAG.getUNDEF(VT);
5436 }
5437
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005438 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5439 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5440 return getPreloadedValue(DAG, *MFI, VT, RegID);
Matt Arsenault48ab5262016-04-25 19:27:18 +00005441 }
Jan Veselyfea814d2016-06-21 20:46:20 +00005442 case Intrinsic::amdgcn_implicitarg_ptr: {
Matt Arsenault9166ce82017-07-28 15:52:08 +00005443 if (MFI->isEntryFunction())
5444 return getImplicitArgPtr(DAG, DL);
Matt Arsenault817c2532017-08-03 23:12:44 +00005445 return getPreloadedValue(DAG, *MFI, VT,
5446 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
Jan Veselyfea814d2016-06-21 20:46:20 +00005447 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005448 case Intrinsic::amdgcn_kernarg_segment_ptr: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005449 return getPreloadedValue(DAG, *MFI, VT,
5450 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005451 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005452 case Intrinsic::amdgcn_dispatch_id: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005453 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005454 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005455 case Intrinsic::amdgcn_rcp:
5456 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5457 case Intrinsic::amdgcn_rsq:
5458 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005459 case Intrinsic::amdgcn_rsq_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005460 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005461 return emitRemovedIntrinsicError(DAG, DL, VT);
5462
5463 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005464 case Intrinsic::amdgcn_rcp_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005465 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault32fc5272016-07-26 16:45:45 +00005466 return emitRemovedIntrinsicError(DAG, DL, VT);
5467 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00005468 case Intrinsic::amdgcn_rsq_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005469 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00005470 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00005471
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005472 Type *Type = VT.getTypeForEVT(*DAG.getContext());
5473 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5474 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5475
5476 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5477 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5478 DAG.getConstantFP(Max, DL, VT));
5479 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5480 DAG.getConstantFP(Min, DL, VT));
5481 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005482 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005483 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005484 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005485
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005486 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005487 SI::KernelInputOffsets::NGROUPS_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005488 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005489 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005490 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005491
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005492 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005493 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005494 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005495 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005496 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005497
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005498 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005499 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005500 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005501 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005502 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005503
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005504 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005505 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005506 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005507 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005508 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005509
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005510 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005511 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005512 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005513 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005514 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005515
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005516 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005517 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005518 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005519 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005520 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005521
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005522 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5523 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005524 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005525 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005526 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005527
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005528 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5529 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005530 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005531 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005532 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005533
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005534 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5535 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00005536 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005537 case Intrinsic::r600_read_tgid_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005538 return getPreloadedValue(DAG, *MFI, VT,
5539 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Matt Arsenault43976df2016-01-30 04:25:19 +00005540 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005541 case Intrinsic::r600_read_tgid_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005542 return getPreloadedValue(DAG, *MFI, VT,
5543 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
Matt Arsenault43976df2016-01-30 04:25:19 +00005544 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005545 case Intrinsic::r600_read_tgid_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005546 return getPreloadedValue(DAG, *MFI, VT,
5547 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Reid Kleckner4dc0b1a2018-11-01 19:54:45 +00005548 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005549 case Intrinsic::r600_read_tidig_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005550 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5551 SDLoc(DAG.getEntryNode()),
5552 MFI->getArgInfo().WorkItemIDX);
Matt Arsenault43976df2016-01-30 04:25:19 +00005553 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005554 case Intrinsic::r600_read_tidig_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005555 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5556 SDLoc(DAG.getEntryNode()),
5557 MFI->getArgInfo().WorkItemIDY);
Matt Arsenault43976df2016-01-30 04:25:19 +00005558 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005559 case Intrinsic::r600_read_tidig_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005560 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5561 SDLoc(DAG.getEntryNode()),
5562 MFI->getArgInfo().WorkItemIDZ);
Stanislav Mekhanoshin68a2fef2019-06-13 23:47:36 +00005563 case Intrinsic::amdgcn_wavefrontsize:
5564 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
5565 SDLoc(Op), MVT::i32);
Tim Renouf904343f2018-08-25 14:53:17 +00005566 case Intrinsic::amdgcn_s_buffer_load: {
Nicolai Haehnle490e83c2019-06-16 17:14:12 +00005567 bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
5568 SDValue GLC;
5569 SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
5570 if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
5571 IsGFX10 ? &DLC : nullptr))
5572 return Op;
5573 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
5574 DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005575 }
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00005576 case Intrinsic::amdgcn_fdiv_fast:
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00005577 return lowerFDIV_FAST(Op, DAG);
Tom Stellard2187bb82016-12-06 23:52:13 +00005578 case Intrinsic::amdgcn_interp_mov: {
5579 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5580 SDValue Glue = M0.getValue(1);
5581 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5582 Op.getOperand(2), Op.getOperand(3), Glue);
5583 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00005584 case Intrinsic::amdgcn_interp_p1: {
5585 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5586 SDValue Glue = M0.getValue(1);
5587 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5588 Op.getOperand(2), Op.getOperand(3), Glue);
5589 }
5590 case Intrinsic::amdgcn_interp_p2: {
5591 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5592 SDValue Glue = SDValue(M0.getNode(), 1);
5593 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5594 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5595 Glue);
5596 }
Tim Corringham824ca3f2019-01-28 13:48:59 +00005597 case Intrinsic::amdgcn_interp_p1_f16: {
5598 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5599 SDValue Glue = M0.getValue(1);
5600 if (getSubtarget()->getLDSBankCount() == 16) {
5601 // 16 bank LDS
5602 SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5603 DAG.getConstant(2, DL, MVT::i32), // P0
5604 Op.getOperand(2), // Attrchan
5605 Op.getOperand(3), // Attr
5606 Glue);
5607 SDValue Ops[] = {
5608 Op.getOperand(1), // Src0
5609 Op.getOperand(2), // Attrchan
5610 Op.getOperand(3), // Attr
5611 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5612 S, // Src2 - holds two f16 values selected by high
5613 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5614 Op.getOperand(4), // high
5615 DAG.getConstant(0, DL, MVT::i1), // $clamp
5616 DAG.getConstant(0, DL, MVT::i32) // $omod
5617 };
5618 return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5619 } else {
5620 // 32 bank LDS
5621 SDValue Ops[] = {
5622 Op.getOperand(1), // Src0
5623 Op.getOperand(2), // Attrchan
5624 Op.getOperand(3), // Attr
5625 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5626 Op.getOperand(4), // high
5627 DAG.getConstant(0, DL, MVT::i1), // $clamp
5628 DAG.getConstant(0, DL, MVT::i32), // $omod
5629 Glue
5630 };
5631 return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5632 }
5633 }
5634 case Intrinsic::amdgcn_interp_p2_f16: {
5635 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5636 SDValue Glue = SDValue(M0.getNode(), 1);
5637 SDValue Ops[] = {
5638 Op.getOperand(2), // Src0
5639 Op.getOperand(3), // Attrchan
5640 Op.getOperand(4), // Attr
5641 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5642 Op.getOperand(1), // Src2
5643 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5644 Op.getOperand(5), // high
5645 DAG.getConstant(0, DL, MVT::i1), // $clamp
5646 Glue
5647 };
5648 return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5649 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005650 case Intrinsic::amdgcn_sin:
5651 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5652
5653 case Intrinsic::amdgcn_cos:
5654 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5655
5656 case Intrinsic::amdgcn_log_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005657 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005658 return SDValue();
5659
5660 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005661 MF.getFunction(), "intrinsic not supported on subtarget",
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005662 DL.getDebugLoc());
5663 DAG.getContext()->diagnose(BadIntrin);
5664 return DAG.getUNDEF(VT);
5665 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005666 case Intrinsic::amdgcn_ldexp:
5667 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5668 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00005669
5670 case Intrinsic::amdgcn_fract:
5671 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5672
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005673 case Intrinsic::amdgcn_class:
5674 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5675 Op.getOperand(1), Op.getOperand(2));
5676 case Intrinsic::amdgcn_div_fmas:
5677 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5678 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5679 Op.getOperand(4));
5680
5681 case Intrinsic::amdgcn_div_fixup:
5682 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5683 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5684
5685 case Intrinsic::amdgcn_trig_preop:
5686 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5687 Op.getOperand(1), Op.getOperand(2));
5688 case Intrinsic::amdgcn_div_scale: {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005689 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005690
5691 // Translate to the operands expected by the machine instruction. The
5692 // first parameter must be the same as the first instruction.
5693 SDValue Numerator = Op.getOperand(1);
5694 SDValue Denominator = Op.getOperand(2);
5695
    // Note this operand order is the opposite of the machine instruction's,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.
5700
5701 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5702
5703 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5704 Denominator, Numerator);
5705 }
Wei Ding07e03712016-07-28 16:42:13 +00005706 case Intrinsic::amdgcn_icmp: {
Marek Olsak33eb4d92019-01-15 02:13:18 +00005707 // There is a Pat that handles this variant, so return it as-is.
5708 if (Op.getOperand(1).getValueType() == MVT::i1 &&
5709 Op.getConstantOperandVal(2) == 0 &&
5710 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5711 return Op;
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005712 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005713 }
5714 case Intrinsic::amdgcn_fcmp: {
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005715 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005716 }
Matt Arsenaultf84e5d92017-01-31 03:07:46 +00005717 case Intrinsic::amdgcn_fmed3:
5718 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5719 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Farhana Aleenc370d7b2018-07-16 18:19:59 +00005720 case Intrinsic::amdgcn_fdot2:
5721 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00005722 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5723 Op.getOperand(4));
Matt Arsenault32fc5272016-07-26 16:45:45 +00005724 case Intrinsic::amdgcn_fmul_legacy:
5725 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5726 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005727 case Intrinsic::amdgcn_sffbh:
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005728 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaultf5262252017-02-22 23:04:58 +00005729 case Intrinsic::amdgcn_sbfe:
5730 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5731 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5732 case Intrinsic::amdgcn_ubfe:
5733 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5734 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Marek Olsak13e47412018-01-31 20:18:04 +00005735 case Intrinsic::amdgcn_cvt_pkrtz:
5736 case Intrinsic::amdgcn_cvt_pknorm_i16:
5737 case Intrinsic::amdgcn_cvt_pknorm_u16:
5738 case Intrinsic::amdgcn_cvt_pk_i16:
5739 case Intrinsic::amdgcn_cvt_pk_u16: {
5740 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
Matt Arsenault1f17c662017-02-22 00:27:34 +00005741 EVT VT = Op.getValueType();
Marek Olsak13e47412018-01-31 20:18:04 +00005742 unsigned Opcode;
5743
5744 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5745 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5746 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5747 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5748 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5749 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5750 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5751 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5752 else
5753 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5754
Matt Arsenault709374d2018-08-01 20:13:58 +00005755 if (isTypeLegal(VT))
5756 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5757
Marek Olsak13e47412018-01-31 20:18:04 +00005758 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Matt Arsenault1f17c662017-02-22 00:27:34 +00005759 Op.getOperand(1), Op.getOperand(2));
5760 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5761 }
Connor Abbott8c217d02017-08-04 18:36:49 +00005762 case Intrinsic::amdgcn_wqm: {
5763 SDValue Src = Op.getOperand(1);
5764 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5765 0);
5766 }
Connor Abbott92638ab2017-08-04 18:36:52 +00005767 case Intrinsic::amdgcn_wwm: {
5768 SDValue Src = Op.getOperand(1);
5769 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5770 0);
5771 }
Stanislav Mekhanoshindacda792018-06-26 20:04:19 +00005772 case Intrinsic::amdgcn_fmad_ftz:
5773 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5774 Op.getOperand(2), Op.getOperand(3));
Stanislav Mekhanoshin68a2fef2019-06-13 23:47:36 +00005775
5776 case Intrinsic::amdgcn_if_break:
5777 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
5778 Op->getOperand(1), Op->getOperand(2)), 0);
5779
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005780 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005781 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5782 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5783 return lowerImage(Op, ImageDimIntr, DAG);
5784
Matt Arsenault754dd3e2017-04-03 18:08:08 +00005785 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005786 }
5787}
5788
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005789SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5790 SelectionDAG &DAG) const {
5791 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Tom Stellard6f9ef142016-12-20 17:19:44 +00005792 SDLoc DL(Op);
David Stuttard70e8bc12017-06-22 16:29:22 +00005793
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005794 switch (IntrID) {
Marek Olsakc5cec5e2019-01-16 15:43:53 +00005795 case Intrinsic::amdgcn_ds_ordered_add:
5796 case Intrinsic::amdgcn_ds_ordered_swap: {
5797 MemSDNode *M = cast<MemSDNode>(Op);
5798 SDValue Chain = M->getOperand(0);
5799 SDValue M0 = M->getOperand(2);
5800 SDValue Value = M->getOperand(3);
5801 unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5802 unsigned WaveRelease = M->getConstantOperandVal(8);
5803 unsigned WaveDone = M->getConstantOperandVal(9);
5804 unsigned ShaderType;
5805 unsigned Instruction;
5806
5807 switch (IntrID) {
5808 case Intrinsic::amdgcn_ds_ordered_add:
5809 Instruction = 0;
5810 break;
5811 case Intrinsic::amdgcn_ds_ordered_swap:
5812 Instruction = 1;
5813 break;
5814 }
5815
5816 if (WaveDone && !WaveRelease)
5817 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5818
5819 switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5820 case CallingConv::AMDGPU_CS:
5821 case CallingConv::AMDGPU_KERNEL:
5822 ShaderType = 0;
5823 break;
5824 case CallingConv::AMDGPU_PS:
5825 ShaderType = 1;
5826 break;
5827 case CallingConv::AMDGPU_VS:
5828 ShaderType = 2;
5829 break;
5830 case CallingConv::AMDGPU_GS:
5831 ShaderType = 3;
5832 break;
5833 default:
5834 report_fatal_error("ds_ordered_count unsupported for this calling conv");
5835 }
5836
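    // Pack the immediate offset field: the ordered-count index starts at
    // bit 2, wave_release is bit 8, wave_done is bit 9, the shader type
    // occupies bits [11:10], and the instruction (0 = add, 1 = swap) starts
    // at bit 12.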
5837 unsigned Offset0 = OrderedCountIndex << 2;
5838 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5839 (Instruction << 4);
5840 unsigned Offset = Offset0 | (Offset1 << 8);
5841
5842 SDValue Ops[] = {
5843 Chain,
5844 Value,
5845 DAG.getTargetConstant(Offset, DL, MVT::i16),
5846 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5847 };
5848 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5849 M->getVTList(), Ops, M->getMemoryVT(),
5850 M->getMemOperand());
5851 }
Matt Arsenaulta5840c32019-01-22 18:36:06 +00005852 case Intrinsic::amdgcn_ds_fadd: {
5853 MemSDNode *M = cast<MemSDNode>(Op);
5854 unsigned Opc;
5855 switch (IntrID) {
5856 case Intrinsic::amdgcn_ds_fadd:
5857 Opc = ISD::ATOMIC_LOAD_FADD;
5858 break;
5859 }
5860
5861 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5862 M->getOperand(0), M->getOperand(2), M->getOperand(3),
5863 M->getMemOperand());
5864 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005865 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005866 case Intrinsic::amdgcn_atomic_dec:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005867 case Intrinsic::amdgcn_ds_fmin:
5868 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005869 MemSDNode *M = cast<MemSDNode>(Op);
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005870 unsigned Opc;
5871 switch (IntrID) {
5872 case Intrinsic::amdgcn_atomic_inc:
5873 Opc = AMDGPUISD::ATOMIC_INC;
5874 break;
5875 case Intrinsic::amdgcn_atomic_dec:
5876 Opc = AMDGPUISD::ATOMIC_DEC;
5877 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005878 case Intrinsic::amdgcn_ds_fmin:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005879 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5880 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005881 case Intrinsic::amdgcn_ds_fmax:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005882 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5883 break;
5884 default:
5885 llvm_unreachable("Unknown intrinsic!");
5886 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005887 SDValue Ops[] = {
5888 M->getOperand(0), // Chain
5889 M->getOperand(2), // Ptr
5890 M->getOperand(3) // Value
5891 };
5892
5893 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5894 M->getMemoryVT(), M->getMemOperand());
5895 }
Tom Stellard6f9ef142016-12-20 17:19:44 +00005896 case Intrinsic::amdgcn_buffer_load:
5897 case Intrinsic::amdgcn_buffer_load_format: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005898 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5899 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5900 unsigned IdxEn = 1;
5901 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5902 IdxEn = Idx->getZExtValue() != 0;
Tom Stellard6f9ef142016-12-20 17:19:44 +00005903 SDValue Ops[] = {
5904 Op.getOperand(0), // Chain
5905 Op.getOperand(2), // rsrc
5906 Op.getOperand(3), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005907 SDValue(), // voffset -- will be set by setBufferOffsets
5908 SDValue(), // soffset -- will be set by setBufferOffsets
5909 SDValue(), // offset -- will be set by setBufferOffsets
5910 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5911 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Tom Stellard6f9ef142016-12-20 17:19:44 +00005912 };
Tom Stellard6f9ef142016-12-20 17:19:44 +00005913
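    // Fill in the voffset, soffset and immediate offset operands (Ops[3..5])
    // from the intrinsic's combined offset operand.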
Tim Renouf4f703f52018-08-21 11:07:10 +00005914 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005915 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5916 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
Tim Renouf4f703f52018-08-21 11:07:10 +00005917
5918 EVT VT = Op.getValueType();
5919 EVT IntVT = VT.changeTypeToInteger();
5920 auto *M = cast<MemSDNode>(Op);
5921 EVT LoadVT = Op.getValueType();
5922
5923 if (LoadVT.getScalarType() == MVT::f16)
5924 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5925 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005926
5927 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5928 if (LoadVT.getScalarType() == MVT::i8 ||
5929 LoadVT.getScalarType() == MVT::i16)
5930 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5931
Tim Renouf677387d2019-03-22 14:58:02 +00005932 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5933 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005934 }
5935 case Intrinsic::amdgcn_raw_buffer_load:
5936 case Intrinsic::amdgcn_raw_buffer_load_format: {
5937 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5938 SDValue Ops[] = {
5939 Op.getOperand(0), // Chain
5940 Op.getOperand(2), // rsrc
5941 DAG.getConstant(0, DL, MVT::i32), // vindex
5942 Offsets.first, // voffset
5943 Op.getOperand(4), // soffset
5944 Offsets.second, // offset
5945 Op.getOperand(5), // cachepolicy
5946 DAG.getConstant(0, DL, MVT::i1), // idxen
5947 };
5948
5949 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5950 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5951
5952 EVT VT = Op.getValueType();
5953 EVT IntVT = VT.changeTypeToInteger();
5954 auto *M = cast<MemSDNode>(Op);
5955 EVT LoadVT = Op.getValueType();
5956
5957 if (LoadVT.getScalarType() == MVT::f16)
5958 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5959 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005960
5961 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5962 if (LoadVT.getScalarType() == MVT::i8 ||
5963 LoadVT.getScalarType() == MVT::i16)
5964 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5965
Tim Renouf677387d2019-03-22 14:58:02 +00005966 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5967 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005968 }
5969 case Intrinsic::amdgcn_struct_buffer_load:
5970 case Intrinsic::amdgcn_struct_buffer_load_format: {
5971 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5972 SDValue Ops[] = {
5973 Op.getOperand(0), // Chain
5974 Op.getOperand(2), // rsrc
5975 Op.getOperand(3), // vindex
5976 Offsets.first, // voffset
5977 Op.getOperand(5), // soffset
5978 Offsets.second, // offset
5979 Op.getOperand(6), // cachepolicy
5980 DAG.getConstant(1, DL, MVT::i1), // idxen
5981 };
5982
5983 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5984 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5985
Tom Stellard6f9ef142016-12-20 17:19:44 +00005986 EVT VT = Op.getValueType();
5987 EVT IntVT = VT.changeTypeToInteger();
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005988 auto *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005989 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005990
Tim Renouf366a49d2018-08-02 23:33:01 +00005991 if (LoadVT.getScalarType() == MVT::f16)
5992 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5993 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005994
5995 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5996 if (LoadVT.getScalarType() == MVT::i8 ||
5997 LoadVT.getScalarType() == MVT::i16)
5998 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5999
Tim Renouf677387d2019-03-22 14:58:02 +00006000 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
6001 M->getMemOperand(), DAG);
Tom Stellard6f9ef142016-12-20 17:19:44 +00006002 }
David Stuttard70e8bc12017-06-22 16:29:22 +00006003 case Intrinsic::amdgcn_tbuffer_load: {
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006004 MemSDNode *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00006005 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00006006
Tim Renouf35484c92018-08-21 11:06:05 +00006007 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6008 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6009 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6010 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6011 unsigned IdxEn = 1;
6012 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
6013 IdxEn = Idx->getZExtValue() != 0;
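    // dfmt and nfmt are packed into a single format operand: dfmt occupies
    // the low four bits and nfmt starts at bit 4.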
David Stuttard70e8bc12017-06-22 16:29:22 +00006014 SDValue Ops[] = {
6015 Op.getOperand(0), // Chain
6016 Op.getOperand(2), // rsrc
6017 Op.getOperand(3), // vindex
6018 Op.getOperand(4), // voffset
6019 Op.getOperand(5), // soffset
6020 Op.getOperand(6), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006021 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6022 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6023 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6024 };
6025
6026 if (LoadVT.getScalarType() == MVT::f16)
6027 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6028 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00006029 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6030 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6031 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006032 }
6033 case Intrinsic::amdgcn_raw_tbuffer_load: {
6034 MemSDNode *M = cast<MemSDNode>(Op);
6035 EVT LoadVT = Op.getValueType();
6036 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
6037
6038 SDValue Ops[] = {
6039 Op.getOperand(0), // Chain
6040 Op.getOperand(2), // rsrc
6041 DAG.getConstant(0, DL, MVT::i32), // vindex
6042 Offsets.first, // voffset
6043 Op.getOperand(4), // soffset
6044 Offsets.second, // offset
6045 Op.getOperand(5), // format
6046 Op.getOperand(6), // cachepolicy
6047 DAG.getConstant(0, DL, MVT::i1), // idxen
6048 };
6049
6050 if (LoadVT.getScalarType() == MVT::f16)
6051 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6052 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00006053 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6054 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6055 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006056 }
6057 case Intrinsic::amdgcn_struct_tbuffer_load: {
6058 MemSDNode *M = cast<MemSDNode>(Op);
6059 EVT LoadVT = Op.getValueType();
6060 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6061
6062 SDValue Ops[] = {
6063 Op.getOperand(0), // Chain
6064 Op.getOperand(2), // rsrc
6065 Op.getOperand(3), // vindex
6066 Offsets.first, // voffset
6067 Op.getOperand(5), // soffset
6068 Offsets.second, // offset
6069 Op.getOperand(6), // format
6070 Op.getOperand(7), // cachepolicy
6071 DAG.getConstant(1, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006072 };
6073
Tim Renouf366a49d2018-08-02 23:33:01 +00006074 if (LoadVT.getScalarType() == MVT::f16)
6075 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6076 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00006077 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6078 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6079 DAG);
David Stuttard70e8bc12017-06-22 16:29:22 +00006080 }
Marek Olsak5cec6412017-11-09 01:52:48 +00006081 case Intrinsic::amdgcn_buffer_atomic_swap:
6082 case Intrinsic::amdgcn_buffer_atomic_add:
6083 case Intrinsic::amdgcn_buffer_atomic_sub:
6084 case Intrinsic::amdgcn_buffer_atomic_smin:
6085 case Intrinsic::amdgcn_buffer_atomic_umin:
6086 case Intrinsic::amdgcn_buffer_atomic_smax:
6087 case Intrinsic::amdgcn_buffer_atomic_umax:
6088 case Intrinsic::amdgcn_buffer_atomic_and:
6089 case Intrinsic::amdgcn_buffer_atomic_or:
6090 case Intrinsic::amdgcn_buffer_atomic_xor: {
Tim Renouf4f703f52018-08-21 11:07:10 +00006091 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6092 unsigned IdxEn = 1;
6093 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6094 IdxEn = Idx->getZExtValue() != 0;
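    // These buffer atomic intrinsics only take an slc flag; it is placed in
    // bit 1 of the cachepolicy operand.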
Marek Olsak5cec6412017-11-09 01:52:48 +00006095 SDValue Ops[] = {
6096 Op.getOperand(0), // Chain
6097 Op.getOperand(2), // vdata
6098 Op.getOperand(3), // rsrc
6099 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006100 SDValue(), // voffset -- will be set by setBufferOffsets
6101 SDValue(), // soffset -- will be set by setBufferOffsets
6102 SDValue(), // offset -- will be set by setBufferOffsets
6103 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6104 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006105 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006106 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006107 EVT VT = Op.getValueType();
6108
6109 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00006110 unsigned Opcode = 0;
6111
6112 switch (IntrID) {
6113 case Intrinsic::amdgcn_buffer_atomic_swap:
6114 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6115 break;
6116 case Intrinsic::amdgcn_buffer_atomic_add:
6117 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6118 break;
6119 case Intrinsic::amdgcn_buffer_atomic_sub:
6120 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6121 break;
6122 case Intrinsic::amdgcn_buffer_atomic_smin:
6123 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6124 break;
6125 case Intrinsic::amdgcn_buffer_atomic_umin:
6126 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6127 break;
6128 case Intrinsic::amdgcn_buffer_atomic_smax:
6129 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6130 break;
6131 case Intrinsic::amdgcn_buffer_atomic_umax:
6132 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6133 break;
6134 case Intrinsic::amdgcn_buffer_atomic_and:
6135 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6136 break;
6137 case Intrinsic::amdgcn_buffer_atomic_or:
6138 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6139 break;
6140 case Intrinsic::amdgcn_buffer_atomic_xor:
6141 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6142 break;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6145 }
6146
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006147 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6148 M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006149 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006150 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6151 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6152 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6153 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6154 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6155 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6156 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6157 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6158 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6159 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
6160 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6161 SDValue Ops[] = {
6162 Op.getOperand(0), // Chain
6163 Op.getOperand(2), // vdata
6164 Op.getOperand(3), // rsrc
6165 DAG.getConstant(0, DL, MVT::i32), // vindex
6166 Offsets.first, // voffset
6167 Op.getOperand(5), // soffset
6168 Offsets.second, // offset
6169 Op.getOperand(6), // cachepolicy
6170 DAG.getConstant(0, DL, MVT::i1), // idxen
6171 };
6172 EVT VT = Op.getValueType();
Marek Olsak5cec6412017-11-09 01:52:48 +00006173
Tim Renouf4f703f52018-08-21 11:07:10 +00006174 auto *M = cast<MemSDNode>(Op);
6175 unsigned Opcode = 0;
6176
6177 switch (IntrID) {
6178 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6179 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6180 break;
6181 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6182 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6183 break;
6184 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6185 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6186 break;
6187 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6188 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6189 break;
6190 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6191 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6192 break;
6193 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6194 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6195 break;
6196 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6197 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6198 break;
6199 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6200 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6201 break;
6202 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6203 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6204 break;
6205 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6206 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6207 break;
6208 default:
6209 llvm_unreachable("unhandled atomic opcode");
6210 }
6211
6212 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6213 M->getMemOperand());
6214 }
6215 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6216 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6217 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6218 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6219 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6220 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6221 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6222 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6223 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6224 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
6225 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6226 SDValue Ops[] = {
6227 Op.getOperand(0), // Chain
6228 Op.getOperand(2), // vdata
6229 Op.getOperand(3), // rsrc
6230 Op.getOperand(4), // vindex
6231 Offsets.first, // voffset
6232 Op.getOperand(6), // soffset
6233 Offsets.second, // offset
6234 Op.getOperand(7), // cachepolicy
6235 DAG.getConstant(1, DL, MVT::i1), // idxen
6236 };
6237 EVT VT = Op.getValueType();
6238
6239 auto *M = cast<MemSDNode>(Op);
6240 unsigned Opcode = 0;
6241
6242 switch (IntrID) {
6243 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6244 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6245 break;
6246 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6247 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6248 break;
6249 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6250 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6251 break;
6252 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6253 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6254 break;
6255 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6256 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6257 break;
6258 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6259 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6260 break;
6261 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6262 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6263 break;
6264 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6265 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6266 break;
6267 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6268 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6269 break;
6270 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6271 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6272 break;
6273 default:
6274 llvm_unreachable("unhandled atomic opcode");
6275 }
6276
6277 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6278 M->getMemOperand());
6279 }
Marek Olsak5cec6412017-11-09 01:52:48 +00006280 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
Tim Renouf4f703f52018-08-21 11:07:10 +00006281 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6282 unsigned IdxEn = 1;
6283 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
6284 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006285 SDValue Ops[] = {
6286 Op.getOperand(0), // Chain
6287 Op.getOperand(2), // src
6288 Op.getOperand(3), // cmp
6289 Op.getOperand(4), // rsrc
6290 Op.getOperand(5), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006291 SDValue(), // voffset -- will be set by setBufferOffsets
6292 SDValue(), // soffset -- will be set by setBufferOffsets
6293 SDValue(), // offset -- will be set by setBufferOffsets
6294 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6295 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6296 };
6297 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
6298 EVT VT = Op.getValueType();
6299 auto *M = cast<MemSDNode>(Op);
6300
6301 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6302 Op->getVTList(), Ops, VT, M->getMemOperand());
6303 }
6304 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
6305 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6306 SDValue Ops[] = {
6307 Op.getOperand(0), // Chain
6308 Op.getOperand(2), // src
6309 Op.getOperand(3), // cmp
6310 Op.getOperand(4), // rsrc
6311 DAG.getConstant(0, DL, MVT::i32), // vindex
6312 Offsets.first, // voffset
6313 Op.getOperand(6), // soffset
6314 Offsets.second, // offset
6315 Op.getOperand(7), // cachepolicy
6316 DAG.getConstant(0, DL, MVT::i1), // idxen
6317 };
6318 EVT VT = Op.getValueType();
6319 auto *M = cast<MemSDNode>(Op);
6320
6321 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6322 Op->getVTList(), Ops, VT, M->getMemOperand());
6323 }
6324 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
6325 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
6326 SDValue Ops[] = {
6327 Op.getOperand(0), // Chain
6328 Op.getOperand(2), // src
6329 Op.getOperand(3), // cmp
6330 Op.getOperand(4), // rsrc
6331 Op.getOperand(5), // vindex
6332 Offsets.first, // voffset
6333 Op.getOperand(7), // soffset
6334 Offsets.second, // offset
6335 Op.getOperand(8), // cachepolicy
6336 DAG.getConstant(1, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006337 };
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006338 EVT VT = Op.getValueType();
6339 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00006340
6341 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006342 Op->getVTList(), Ops, VT, M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006343 }
6344
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006345 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006346 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6347 AMDGPU::getImageDimIntrinsicInfo(IntrID))
6348 return lowerImage(Op, ImageDimIntr, DAG);
Matt Arsenault1349a042018-05-22 06:32:10 +00006349
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006350 return SDValue();
6351 }
6352}
6353
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 on subtargets (such as SI) that lack dwordx3 load/store instructions.
6356SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
6357 SDVTList VTList,
6358 ArrayRef<SDValue> Ops, EVT MemVT,
6359 MachineMemOperand *MMO,
6360 SelectionDAG &DAG) const {
6361 EVT VT = VTList.VTs[0];
6362 EVT WidenedVT = VT;
6363 EVT WidenedMemVT = MemVT;
6364 if (!Subtarget->hasDwordx3LoadStores() &&
6365 (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
6366 WidenedVT = EVT::getVectorVT(*DAG.getContext(),
6367 WidenedVT.getVectorElementType(), 4);
6368 WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
6369 WidenedMemVT.getVectorElementType(), 4);
6370 MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
6371 }
6372
6373 assert(VTList.NumVTs == 2);
6374 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
6375
6376 auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
6377 WidenedMemVT, MMO);
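  // If the type was widened, extract the original narrower vector back out
  // and re-merge it with the chain result.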
6378 if (WidenedVT != VT) {
6379 auto Extract = DAG.getNode(
6380 ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
6381 DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
6382 NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
6383 }
6384 return NewOp;
6385}
6386
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006387SDValue SITargetLowering::handleD16VData(SDValue VData,
6388 SelectionDAG &DAG) const {
6389 EVT StoreVT = VData.getValueType();
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006390
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006391 // No change for f16 and legal vector D16 types.
Matt Arsenault1349a042018-05-22 06:32:10 +00006392 if (!StoreVT.isVector())
6393 return VData;
6394
6395 SDLoc DL(VData);
6396 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
6397
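  // Targets with unpacked D16 VMEM instructions expect one 16-bit element per
  // dword, so bitcast to integer and zero-extend each element to i32.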
6398 if (Subtarget->hasUnpackedD16VMem()) {
6399 // We need to unpack the packed data to store.
6400 EVT IntStoreVT = StoreVT.changeTypeToInteger();
6401 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6402
6403 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6404 StoreVT.getVectorNumElements());
6405 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6406 return DAG.UnrollVectorOp(ZExt.getNode());
6407 }
6408
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006409 assert(isTypeLegal(StoreVT));
6410 return VData;
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006411}
6412
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006413SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6414 SelectionDAG &DAG) const {
Tom Stellardfc92e772015-05-12 14:18:14 +00006415 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006416 SDValue Chain = Op.getOperand(0);
6417 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
David Stuttard70e8bc12017-06-22 16:29:22 +00006418 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006419
6420 switch (IntrinsicID) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00006421 case Intrinsic::amdgcn_exp: {
Matt Arsenault4165efd2017-01-17 07:26:53 +00006422 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6423 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6424 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6425 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6426
6427 const SDValue Ops[] = {
6428 Chain,
6429 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6430 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6431 Op.getOperand(4), // src0
6432 Op.getOperand(5), // src1
6433 Op.getOperand(6), // src2
6434 Op.getOperand(7), // src3
6435 DAG.getTargetConstant(0, DL, MVT::i1), // compr
6436 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6437 };
6438
6439 unsigned Opc = Done->isNullValue() ?
6440 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6441 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6442 }
6443 case Intrinsic::amdgcn_exp_compr: {
6444 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6445 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6446 SDValue Src0 = Op.getOperand(4);
6447 SDValue Src1 = Op.getOperand(5);
6448 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6449 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6450
6451 SDValue Undef = DAG.getUNDEF(MVT::f32);
6452 const SDValue Ops[] = {
6453 Chain,
6454 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6455 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6456 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6457 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6458 Undef, // src2
6459 Undef, // src3
6460 DAG.getTargetConstant(1, DL, MVT::i1), // compr
6461 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6462 };
6463
6464 unsigned Opc = Done->isNullValue() ?
6465 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6466 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6467 }
6468 case Intrinsic::amdgcn_s_sendmsg:
Matt Arsenaultd3e5cb72017-02-16 02:01:17 +00006469 case Intrinsic::amdgcn_s_sendmsghalt: {
6470 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6471 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Tom Stellardfc92e772015-05-12 14:18:14 +00006472 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6473 SDValue Glue = Chain.getValue(1);
Matt Arsenaulta78ca622017-02-15 22:17:09 +00006474 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Jan Veselyd48445d2017-01-04 18:06:55 +00006475 Op.getOperand(2), Glue);
6476 }
Marek Olsak2d825902017-04-28 20:21:58 +00006477 case Intrinsic::amdgcn_init_exec: {
6478 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6479 Op.getOperand(2));
6480 }
6481 case Intrinsic::amdgcn_init_exec_from_input: {
6482 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6483 Op.getOperand(2), Op.getOperand(3));
6484 }
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006485 case Intrinsic::amdgcn_s_barrier: {
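    // If the workgroup fits in a single wave, s_barrier is unnecessary; when
    // optimizing, replace it with a WAVE_BARRIER pseudo that only constrains
    // scheduling.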
6486 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00006487 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matthias Braunf1caa282017-12-15 22:22:58 +00006488 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006489 if (WGSize <= ST.getWavefrontSize())
6490 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6491 Op.getOperand(0)), 0);
6492 }
6493 return SDValue();
  }
David Stuttard70e8bc12017-06-22 16:29:22 +00006495 case Intrinsic::amdgcn_tbuffer_store: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006496 SDValue VData = Op.getOperand(2);
6497 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6498 if (IsD16)
6499 VData = handleD16VData(VData, DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006500 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6501 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6502 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6503 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6504 unsigned IdxEn = 1;
6505 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6506 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00006507 SDValue Ops[] = {
6508 Chain,
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006509 VData, // vdata
David Stuttard70e8bc12017-06-22 16:29:22 +00006510 Op.getOperand(3), // rsrc
6511 Op.getOperand(4), // vindex
6512 Op.getOperand(5), // voffset
6513 Op.getOperand(6), // soffset
6514 Op.getOperand(7), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006515 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6516 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6518 };
6519 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6520 AMDGPUISD::TBUFFER_STORE_FORMAT;
6521 MemSDNode *M = cast<MemSDNode>(Op);
6522 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6523 M->getMemoryVT(), M->getMemOperand());
6524 }
6525
6526 case Intrinsic::amdgcn_struct_tbuffer_store: {
6527 SDValue VData = Op.getOperand(2);
6528 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6529 if (IsD16)
6530 VData = handleD16VData(VData, DAG);
6531 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6532 SDValue Ops[] = {
6533 Chain,
6534 VData, // vdata
6535 Op.getOperand(3), // rsrc
6536 Op.getOperand(4), // vindex
6537 Offsets.first, // voffset
6538 Op.getOperand(6), // soffset
6539 Offsets.second, // offset
6540 Op.getOperand(7), // format
6541 Op.getOperand(8), // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
6543 };
6544 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6545 AMDGPUISD::TBUFFER_STORE_FORMAT;
6546 MemSDNode *M = cast<MemSDNode>(Op);
6547 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6548 M->getMemoryVT(), M->getMemOperand());
6549 }
6550
6551 case Intrinsic::amdgcn_raw_tbuffer_store: {
6552 SDValue VData = Op.getOperand(2);
6553 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6554 if (IsD16)
6555 VData = handleD16VData(VData, DAG);
6556 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6557 SDValue Ops[] = {
6558 Chain,
6559 VData, // vdata
6560 Op.getOperand(3), // rsrc
6561 DAG.getConstant(0, DL, MVT::i32), // vindex
6562 Offsets.first, // voffset
6563 Op.getOperand(5), // soffset
6564 Offsets.second, // offset
6565 Op.getOperand(6), // format
6566 Op.getOperand(7), // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006568 };
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006569 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6570 AMDGPUISD::TBUFFER_STORE_FORMAT;
6571 MemSDNode *M = cast<MemSDNode>(Op);
6572 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6573 M->getMemoryVT(), M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00006574 }
6575
Marek Olsak5cec6412017-11-09 01:52:48 +00006576 case Intrinsic::amdgcn_buffer_store:
6577 case Intrinsic::amdgcn_buffer_store_format: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006578 SDValue VData = Op.getOperand(2);
6579 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6580 if (IsD16)
6581 VData = handleD16VData(VData, DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00006582 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6583 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6584 unsigned IdxEn = 1;
6585 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6586 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006587 SDValue Ops[] = {
6588 Chain,
Tim Renouf4f703f52018-08-21 11:07:10 +00006589 VData,
Marek Olsak5cec6412017-11-09 01:52:48 +00006590 Op.getOperand(3), // rsrc
6591 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006592 SDValue(), // voffset -- will be set by setBufferOffsets
6593 SDValue(), // soffset -- will be set by setBufferOffsets
6594 SDValue(), // offset -- will be set by setBufferOffsets
6595 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6596 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006597 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006598 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006599 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6600 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6601 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6602 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006603
6604 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6605 EVT VDataType = VData.getValueType().getScalarType();
6606 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6607 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6608
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006609 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6610 M->getMemoryVT(), M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006611 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006612
6613 case Intrinsic::amdgcn_raw_buffer_store:
6614 case Intrinsic::amdgcn_raw_buffer_store_format: {
6615 SDValue VData = Op.getOperand(2);
6616 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6617 if (IsD16)
6618 VData = handleD16VData(VData, DAG);
6619 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6620 SDValue Ops[] = {
6621 Chain,
6622 VData,
6623 Op.getOperand(3), // rsrc
6624 DAG.getConstant(0, DL, MVT::i32), // vindex
6625 Offsets.first, // voffset
6626 Op.getOperand(5), // soffset
6627 Offsets.second, // offset
6628 Op.getOperand(6), // cachepolicy
6629 DAG.getConstant(0, DL, MVT::i1), // idxen
6630 };
6631 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6632 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6633 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6634 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006635
6636 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6637 EVT VDataType = VData.getValueType().getScalarType();
6638 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6639 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6640
Tim Renouf4f703f52018-08-21 11:07:10 +00006641 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6642 M->getMemoryVT(), M->getMemOperand());
6643 }
6644
6645 case Intrinsic::amdgcn_struct_buffer_store:
6646 case Intrinsic::amdgcn_struct_buffer_store_format: {
6647 SDValue VData = Op.getOperand(2);
6648 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6649 if (IsD16)
6650 VData = handleD16VData(VData, DAG);
6651 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6652 SDValue Ops[] = {
6653 Chain,
6654 VData,
6655 Op.getOperand(3), // rsrc
6656 Op.getOperand(4), // vindex
6657 Offsets.first, // voffset
6658 Op.getOperand(6), // soffset
6659 Offsets.second, // offset
6660 Op.getOperand(7), // cachepolicy
6661 DAG.getConstant(1, DL, MVT::i1), // idxen
6662 };
6663 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6664 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6665 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6666 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006667
6668 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6669 EVT VDataType = VData.getValueType().getScalarType();
6670 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6671 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6672
Tim Renouf4f703f52018-08-21 11:07:10 +00006673 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6674 M->getMemoryVT(), M->getMemOperand());
6675 }
6676
Stanislav Mekhanoshin68a2fef2019-06-13 23:47:36 +00006677 case Intrinsic::amdgcn_end_cf:
6678 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
6679 Op->getOperand(2), Chain), 0);
6680
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006681 default: {
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006682 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6683 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6684 return lowerImage(Op, ImageDimIntr, DAG);
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006685
Matt Arsenault754dd3e2017-04-03 18:08:08 +00006686 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006687 }
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006688 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006689}
6690
Tim Renouf4f703f52018-08-21 11:07:10 +00006691// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6692// offset (the offset that is included in bounds checking and swizzling, to be
6693// split between the instruction's voffset and immoffset fields) and soffset
6694// (the offset that is excluded from bounds checking and swizzling, to go in
6695// the instruction's soffset field). This function takes the first kind of
6696// offset and figures out how to split it between voffset and immoffset.
Tim Renouf35484c92018-08-21 11:06:05 +00006697std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6698 SDValue Offset, SelectionDAG &DAG) const {
6699 SDLoc DL(Offset);
6700 const unsigned MaxImm = 4095;
6701 SDValue N0 = Offset;
6702 ConstantSDNode *C1 = nullptr;
Piotr Sobczak378131b2019-01-02 09:47:41 +00006703
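  // Peel off any constant part of the offset: either the whole operand is a
  // constant, or it is a base plus a constant.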
6704 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
Tim Renouf35484c92018-08-21 11:06:05 +00006705 N0 = SDValue();
Piotr Sobczak378131b2019-01-02 09:47:41 +00006706 else if (DAG.isBaseWithConstantOffset(N0)) {
6707 C1 = cast<ConstantSDNode>(N0.getOperand(1));
6708 N0 = N0.getOperand(0);
6709 }
Tim Renouf35484c92018-08-21 11:06:05 +00006710
6711 if (C1) {
6712 unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, keep only the
    // low 12 bits in the immoffset field, so that the part that is copied/added
    // into the voffset field is a multiple of 4096; that part then stands more
    // chance of being CSEd with the copy/add for another similar load/store.
Tim Renoufa37679d2018-10-03 10:29:43 +00006717 // However, do not do that rounding down to a multiple of 4096 if that is a
6718 // negative number, as it appears to be illegal to have a negative offset
6719 // in the vgpr, even if adding the immediate offset makes it positive.
Tim Renouf35484c92018-08-21 11:06:05 +00006720 unsigned Overflow = ImmOffset & ~MaxImm;
6721 ImmOffset -= Overflow;
Tim Renoufa37679d2018-10-03 10:29:43 +00006722 if ((int32_t)Overflow < 0) {
6723 Overflow += ImmOffset;
6724 ImmOffset = 0;
6725 }
Tim Renouf35484c92018-08-21 11:06:05 +00006726 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6727 if (Overflow) {
6728 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6729 if (!N0)
6730 N0 = OverflowVal;
6731 else {
6732 SDValue Ops[] = { N0, OverflowVal };
6733 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6734 }
6735 }
6736 }
6737 if (!N0)
6738 N0 = DAG.getConstant(0, DL, MVT::i32);
6739 if (!C1)
6740 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6741 return {N0, SDValue(C1, 0)};
6742}
6743
Tim Renouf4f703f52018-08-21 11:07:10 +00006744// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6745// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6746// pointed to by Offsets.
6747void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006748 SelectionDAG &DAG, SDValue *Offsets,
6749 unsigned Align) const {
Tim Renouf4f703f52018-08-21 11:07:10 +00006750 SDLoc DL(CombinedOffset);
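  // If the combined offset (or its constant part) can be split by
  // splitMUBUFOffset, place the pieces in soffset and the immediate field;
  // otherwise the whole offset goes into voffset.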
6751 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6752 uint32_t Imm = C->getZExtValue();
6753 uint32_t SOffset, ImmOffset;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006754 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006755 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6756 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6757 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6758 return;
6759 }
6760 }
6761 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6762 SDValue N0 = CombinedOffset.getOperand(0);
6763 SDValue N1 = CombinedOffset.getOperand(1);
6764 uint32_t SOffset, ImmOffset;
6765 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006766 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6767 Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006768 Offsets[0] = N0;
6769 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6770 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6771 return;
6772 }
6773 }
6774 Offsets[0] = CombinedOffset;
6775 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6776 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6777}
6778
// Handle 8-bit and 16-bit buffer loads.
6780SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
6781 EVT LoadVT, SDLoc DL,
6782 ArrayRef<SDValue> Ops,
6783 MemSDNode *M) const {
6784 EVT IntVT = LoadVT.changeTypeToInteger();
6785 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
6786 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
6787
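  // Do the load as a 32-bit ubyte/ushort buffer load and truncate the result
  // back to the requested 8-bit or 16-bit type, keeping the chain value.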
6788 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
6789 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
6790 Ops, IntVT,
6791 M->getMemOperand());
6792 SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
6793 LoadVT.getScalarType(), BufferLoad);
6794 return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
6795}
6796
// Handle 8-bit and 16-bit buffer stores.
6798SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
6799 EVT VDataType, SDLoc DL,
6800 SDValue Ops[],
6801 MemSDNode *M) const {
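  // Any-extend the 8-bit or 16-bit data (Ops[1] is vdata) to i32 and emit the
  // corresponding byte/short buffer store.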
6802 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
6803 Ops[1] = BufferStoreExt;
6804 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
6805 AMDGPUISD::BUFFER_STORE_SHORT;
6806 ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
6807 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
6808 M->getMemOperand());
6809}
6810
Matt Arsenault90083d32018-06-07 09:54:49 +00006811static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6812 ISD::LoadExtType ExtType, SDValue Op,
6813 const SDLoc &SL, EVT VT) {
6814 if (VT.bitsLT(Op.getValueType()))
6815 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6816
6817 switch (ExtType) {
6818 case ISD::SEXTLOAD:
6819 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6820 case ISD::ZEXTLOAD:
6821 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6822 case ISD::EXTLOAD:
6823 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6824 case ISD::NON_EXTLOAD:
6825 return Op;
6826 }
6827
6828 llvm_unreachable("invalid ext type");
6829}
6830
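// Widen a sub-dword load from the constant address space (or an invariant
// global load) to a full 32-bit i32 load, then truncate/extend the result
// back to the original type.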
6831SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6832 SelectionDAG &DAG = DCI.DAG;
6833 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6834 return SDValue();
6835
6836 // FIXME: Constant loads should all be marked invariant.
6837 unsigned AS = Ld->getAddressSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00006838 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6839 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Matt Arsenault90083d32018-06-07 09:54:49 +00006840 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6841 return SDValue();
6842
6843 // Don't do this early, since it may interfere with adjacent load merging for
6844 // illegal types. We can avoid losing alignment information for exotic types
6845 // pre-legalize.
6846 EVT MemVT = Ld->getMemoryVT();
6847 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6848 MemVT.getSizeInBits() >= 32)
6849 return SDValue();
6850
6851 SDLoc SL(Ld);
6852
6853 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6854 "unexpected vector extload");
6855
6856 // TODO: Drop only high part of range.
6857 SDValue Ptr = Ld->getBasePtr();
6858 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6859 MVT::i32, SL, Ld->getChain(), Ptr,
6860 Ld->getOffset(),
6861 Ld->getPointerInfo(), MVT::i32,
6862 Ld->getAlignment(),
6863 Ld->getMemOperand()->getFlags(),
6864 Ld->getAAInfo(),
6865 nullptr); // Drop ranges
6866
6867 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6868 if (MemVT.isFloatingPoint()) {
6869 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6870 "unexpected fp extload");
6871 TruncVT = MemVT.changeTypeToInteger();
6872 }
6873
6874 SDValue Cvt = NewLoad;
6875 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6876 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6877 DAG.getValueType(TruncVT));
6878 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6879 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6880 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6881 } else {
6882 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6883 }
6884
6885 EVT VT = Ld->getValueType(0);
6886 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6887
6888 DCI.AddToWorklist(Cvt.getNode());
6889
6890 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6891 // the appropriate extension from the 32-bit load.
6892 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6893 DCI.AddToWorklist(Cvt.getNode());
6894
6895 // Handle conversion back to floating point if necessary.
6896 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6897
6898 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6899}
6900
Tom Stellard81d871d2013-11-13 23:36:50 +00006901SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6902 SDLoc DL(Op);
6903 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006904 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00006905 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00006906
Matt Arsenaulta1436412016-02-10 18:21:45 +00006907 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
Matt Arsenault65ca292a2017-09-07 05:37:34 +00006908 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6909 return SDValue();
6910
Matt Arsenault6dfda962016-02-10 18:21:39 +00006911 // FIXME: Copied from PPC
6912 // First, load into 32 bits, then truncate to 1 bit.
6913
6914 SDValue Chain = Load->getChain();
6915 SDValue BasePtr = Load->getBasePtr();
6916 MachineMemOperand *MMO = Load->getMemOperand();
6917
Tom Stellard115a6152016-11-10 16:02:37 +00006918 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6919
Matt Arsenault6dfda962016-02-10 18:21:39 +00006920 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
Tom Stellard115a6152016-11-10 16:02:37 +00006921 BasePtr, RealMemVT, MMO);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006922
Tim Renouf361b5b22019-03-21 12:01:21 +00006923 if (!MemVT.isVector()) {
6924 SDValue Ops[] = {
6925 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6926 NewLD.getValue(1)
6927 };
6928
6929 return DAG.getMergeValues(Ops, DL);
6930 }
6931
6932 SmallVector<SDValue, 3> Elts;
6933 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
6934 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
6935 DAG.getConstant(I, DL, MVT::i32));
6936
6937 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
6938 }
6939
Matt Arsenault6dfda962016-02-10 18:21:39 +00006940 SDValue Ops[] = {
Tim Renouf361b5b22019-03-21 12:01:21 +00006941 DAG.getBuildVector(MemVT, DL, Elts),
Matt Arsenault6dfda962016-02-10 18:21:39 +00006942 NewLD.getValue(1)
6943 };
6944
6945 return DAG.getMergeValues(Ops, DL);
6946 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006947
Matt Arsenaulta1436412016-02-10 18:21:45 +00006948 if (!MemVT.isVector())
6949 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006950
Matt Arsenaulta1436412016-02-10 18:21:45 +00006951 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6952 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006953
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006954 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
Simon Pilgrim266f4392019-06-11 11:00:23 +00006955 *Load->getMemOperand())) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006956 SDValue Ops[2];
6957 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6958 return DAG.getMergeValues(Ops, DL);
6959 }
Simon Pilgrim266f4392019-06-11 11:00:23 +00006960
6961 unsigned Alignment = Load->getAlignment();
6962 unsigned AS = Load->getAddressSpace();
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00006963 if (Subtarget->hasLDSMisalignedBug() &&
6964 AS == AMDGPUAS::FLAT_ADDRESS &&
6965 Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
6966 return SplitVectorLoad(Op, DAG);
6967 }
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006968
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006969 MachineFunction &MF = DAG.getMachineFunction();
6970 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6971 // If there is a possibility that flat instructions access scratch memory
6972 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006973 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006974 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006975 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006976
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006977 unsigned NumElements = MemVT.getVectorNumElements();
Matt Arsenault6c041a32018-03-29 19:59:28 +00006978
Matt Arsenault0da63502018-08-31 05:49:54 +00006979 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6980 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
Tim Renouf361b5b22019-03-21 12:01:21 +00006981 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
6982 if (MemVT.isPow2VectorType())
6983 return SDValue();
6984 if (NumElements == 3)
6985 return WidenVectorLoad(Op, DAG);
6986 return SplitVectorLoad(Op, DAG);
6987 }
Matt Arsenaulta1436412016-02-10 18:21:45 +00006988 // Non-uniform loads will be selected to MUBUF instructions, so they
Alexander Timofeev18009562016-12-08 17:28:47 +00006989 // have the same legalization requirements as global and private
Matt Arsenaulta1436412016-02-10 18:21:45 +00006990 // loads.
6991 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006992 }
Matt Arsenault6c041a32018-03-29 19:59:28 +00006993
Matt Arsenault0da63502018-08-31 05:49:54 +00006994 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6995 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6996 AS == AMDGPUAS::GLOBAL_ADDRESS) {
Alexander Timofeev2e5eece2018-03-05 15:12:21 +00006997 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Farhana Aleen89196642018-03-07 17:09:18 +00006998 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
Tim Renouf361b5b22019-03-21 12:01:21 +00006999 Alignment >= 4 && NumElements < 32) {
7000 if (MemVT.isPow2VectorType())
7001 return SDValue();
7002 if (NumElements == 3)
7003 return WidenVectorLoad(Op, DAG);
7004 return SplitVectorLoad(Op, DAG);
7005 }
Alexander Timofeev18009562016-12-08 17:28:47 +00007006 // Non-uniform loads will be selected to MUBUF instructions, so they
7007 // have the same legalization requirements as global and private
7008 // loads.
7009 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007010 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007011 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
7012 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
7013 AS == AMDGPUAS::GLOBAL_ADDRESS ||
7014 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007015 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00007016 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00007017 // v3 loads not supported on SI.
7018 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7019 return WidenVectorLoad(Op, DAG);
7020 // v3 and v4 loads are supported for private and global memory.
Matt Arsenaulta1436412016-02-10 18:21:45 +00007021 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007022 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007023 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007024 // Depending on the setting of the private_element_size field in the
7025 // resource descriptor, we can only make private accesses up to a certain
7026 // size.
7027 switch (Subtarget->getMaxPrivateElementSize()) {
7028 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00007029 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007030 case 8:
7031 if (NumElements > 2)
7032 return SplitVectorLoad(Op, DAG);
7033 return SDValue();
7034 case 16:
7035 // Same as global/flat
7036 if (NumElements > 4)
7037 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00007038 // v3 loads not supported on SI.
7039 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7040 return WidenVectorLoad(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007041 return SDValue();
7042 default:
7043 llvm_unreachable("unsupported private_element_size");
7044 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007045 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleena7cb3112018-03-09 17:41:39 +00007046 // Use ds_read_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00007047 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
Farhana Aleena7cb3112018-03-09 17:41:39 +00007048 MemVT.getStoreSize() == 16)
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007049 return SDValue();
7050
Farhana Aleena7cb3112018-03-09 17:41:39 +00007051 if (NumElements > 2)
7052 return SplitVectorLoad(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00007053
7054 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7055 // address is negative, then the instruction is incorrectly treated as
7056 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7057 // loads here to avoid emitting ds_read2_b32. We may re-combine the
7058 // load later in the SILoadStoreOptimizer.
7059 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
7060 NumElements == 2 && MemVT.getStoreSize() == 8 &&
7061 Load->getAlignment() < 8) {
7062 return SplitVectorLoad(Op, DAG);
7063 }
Tom Stellarde9373602014-01-22 19:24:14 +00007064 }
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007065 return SDValue();
Tom Stellard81d871d2013-11-13 23:36:50 +00007066}
7067
Tom Stellard0ec134f2014-02-04 17:18:40 +00007068SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00007069 EVT VT = Op.getValueType();
7070 assert(VT.getSizeInBits() == 64);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007071
7072 SDLoc DL(Op);
7073 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007074
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007075 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
7076 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007077
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00007078 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
7079 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
7080
7081 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
7082 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007083
7084 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
7085
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00007086 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
7087 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007088
7089 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
7090
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007091 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Matt Arsenault02dc7e12018-06-15 15:15:46 +00007092 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00007093}
7094
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007095// Catch division cases where we can use shortcuts with rcp and rsq
7096// instructions.
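// For example, under the conditions checked below: 1.0 / x lowers to
// AMDGPUISD::RCP, 1.0 / sqrt(x) lowers to AMDGPUISD::RSQ, and -1.0 / x lowers
// to rcp of the negated operand.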
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007097SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
7098 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007099 SDLoc SL(Op);
7100 SDValue LHS = Op.getOperand(0);
7101 SDValue RHS = Op.getOperand(1);
7102 EVT VT = Op.getValueType();
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00007103 const SDNodeFlags Flags = Op->getFlags();
Michael Berg7acc81b2018-05-04 18:48:20 +00007104 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007105
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00007106 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
7107 return SDValue();
7108
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007109 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00007110 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
Matt Arsenault979902b2016-08-02 22:25:04 +00007111 if (CLHS->isExactlyValue(1.0)) {
7112 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
7113 // the CI documentation have a worst-case error of 1 ulp.
7114 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
7115 // use it as long as we aren't trying to use denormals.
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00007116 //
7117 // v_rcp_f16 and v_rsq_f16 DO support denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007118
Matt Arsenault979902b2016-08-02 22:25:04 +00007119 // 1.0 / sqrt(x) -> rsq(x)
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00007120
Matt Arsenault979902b2016-08-02 22:25:04 +00007121 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
7122 // error seems really high at 2^29 ULP.
7123 if (RHS.getOpcode() == ISD::FSQRT)
7124 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
7125
7126 // 1.0 / x -> rcp(x)
7127 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
7128 }
7129
7130 // Same as for 1.0, but expand the sign out of the constant.
7131 if (CLHS->isExactlyValue(-1.0)) {
7132 // -1.0 / x -> rcp (fneg x)
7133 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
7134 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
7135 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007136 }
7137 }
7138
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00007139 if (Unsafe) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007140 // Turn into multiply by the reciprocal.
7141 // x / y -> x * (1.0 / y)
7142 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00007143 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007144 }
7145
7146 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007147}
7148
Tom Stellard8485fa02016-12-07 02:42:15 +00007149static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
7150 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
7151 if (GlueChain->getNumValues() <= 1) {
7152 return DAG.getNode(Opcode, SL, VT, A, B);
7153 }
7154
7155 assert(GlueChain->getNumValues() == 3);
7156
7157 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
7158 switch (Opcode) {
7159 default: llvm_unreachable("no chain equivalent for opcode");
7160 case ISD::FMUL:
7161 Opcode = AMDGPUISD::FMUL_W_CHAIN;
7162 break;
7163 }
7164
7165 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
7166 GlueChain.getValue(2));
7167}
7168
7169static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
7170 EVT VT, SDValue A, SDValue B, SDValue C,
7171 SDValue GlueChain) {
7172 if (GlueChain->getNumValues() <= 1) {
7173 return DAG.getNode(Opcode, SL, VT, A, B, C);
7174 }
7175
7176 assert(GlueChain->getNumValues() == 3);
7177
7178 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
7179 switch (Opcode) {
7180 default: llvm_unreachable("no chain equivalent for opcode");
7181 case ISD::FMA:
7182 Opcode = AMDGPUISD::FMA_W_CHAIN;
7183 break;
7184 }
7185
7186 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
7187 GlueChain.getValue(2));
7188}
7189
Matt Arsenault4052a572016-12-22 03:05:41 +00007190SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00007191 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
7192 return FastLowered;
7193
Matt Arsenault4052a572016-12-22 03:05:41 +00007194 SDLoc SL(Op);
7195 SDValue Src0 = Op.getOperand(0);
7196 SDValue Src1 = Op.getOperand(1);
7197
7198 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7199 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7200
7201 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
7202 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
7203
7204 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
7205 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
7206
7207 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
7208}
7209
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007210// Faster 2.5 ULP division that does not support denormals.
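// A rough sketch of the scaling trick used below (read off the constants, not
// an authoritative description): K0 is 0x6f800000 (2^96) and K1 is 0x2f800000
// (2^-32). If |RHS| exceeds 2^96 the denominator is pre-scaled by 2^-32 so the
// rcp input stays well inside the normal range, and the final quotient is
// multiplied by the same factor to compensate; otherwise the scale is 1.0.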
7211SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
7212 SDLoc SL(Op);
7213 SDValue LHS = Op.getOperand(1);
7214 SDValue RHS = Op.getOperand(2);
7215
7216 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
7217
7218 const APFloat K0Val(BitsToFloat(0x6f800000));
7219 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
7220
7221 const APFloat K1Val(BitsToFloat(0x2f800000));
7222 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
7223
7224 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
7225
7226 EVT SetCCVT =
7227 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
7228
7229 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
7230
7231 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
7232
7233 // TODO: Should this propagate fast-math-flags?
7234 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
7235
7236 // rcp does not support denormals.
7237 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
7238
7239 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
7240
7241 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
7242}
7243
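// Roughly, the f32 path below scales both operands with div_scale, refines an
// initial v_rcp estimate and the quotient through a chain of FMAs, and lets
// div_fmas/div_fixup produce the final result. The setreg nodes only toggle
// the FP32 denormal mode bits around the FMA chain when the subtarget does
// not already enable FP32 denormals.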
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007244SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007245 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00007246 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007247
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007248 SDLoc SL(Op);
7249 SDValue LHS = Op.getOperand(0);
7250 SDValue RHS = Op.getOperand(1);
7251
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007252 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007253
Wei Dinged0f97f2016-06-09 19:17:15 +00007254 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007255
Tom Stellard8485fa02016-12-07 02:42:15 +00007256 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7257 RHS, RHS, LHS);
7258 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7259 LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007260
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00007261 // Denominator is scaled to not be denormal, so using rcp is ok.
Tom Stellard8485fa02016-12-07 02:42:15 +00007262 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
7263 DenominatorScaled);
7264 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
7265 DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007266
Tom Stellard8485fa02016-12-07 02:42:15 +00007267 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
7268 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
7269 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007270
Tom Stellard8485fa02016-12-07 02:42:15 +00007271 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007272
Tom Stellard8485fa02016-12-07 02:42:15 +00007273 if (!Subtarget->hasFP32Denormals()) {
7274 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
7275 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
7276 SL, MVT::i32);
7277 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
7278 DAG.getEntryNode(),
7279 EnableDenormValue, BitField);
7280 SDValue Ops[3] = {
7281 NegDivScale0,
7282 EnableDenorm.getValue(0),
7283 EnableDenorm.getValue(1)
7284 };
Matt Arsenault37fefd62016-06-10 02:18:02 +00007285
Tom Stellard8485fa02016-12-07 02:42:15 +00007286 NegDivScale0 = DAG.getMergeValues(Ops, SL);
7287 }
7288
7289 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
7290 ApproxRcp, One, NegDivScale0);
7291
7292 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
7293 ApproxRcp, Fma0);
7294
7295 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
7296 Fma1, Fma1);
7297
7298 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
7299 NumeratorScaled, Mul);
7300
7301 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
7302
7303 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
7304 NumeratorScaled, Fma3);
7305
7306 if (!Subtarget->hasFP32Denormals()) {
7307 const SDValue DisableDenormValue =
7308 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
7309 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
7310 Fma4.getValue(1),
7311 DisableDenormValue,
7312 BitField,
7313 Fma4.getValue(2));
7314
7315 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
7316 DisableDenorm, DAG.getRoot());
7317 DAG.setRoot(OutputChain);
7318 }
Matt Arsenault37fefd62016-06-10 02:18:02 +00007319
Wei Dinged0f97f2016-06-09 19:17:15 +00007320 SDValue Scale = NumeratorScaled.getValue(1);
Tom Stellard8485fa02016-12-07 02:42:15 +00007321 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
7322 Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007323
Wei Dinged0f97f2016-06-09 19:17:15 +00007324 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007325}
7326
7327SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007328 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007329 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007330
7331 SDLoc SL(Op);
7332 SDValue X = Op.getOperand(0);
7333 SDValue Y = Op.getOperand(1);
7334
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007335 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007336
7337 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
7338
7339 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
7340
7341 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
7342
7343 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
7344
7345 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
7346
7347 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
7348
7349 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
7350
7351 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
7352
7353 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
7354 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
7355
7356 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
7357 NegDivScale0, Mul, DivScale1);
7358
7359 SDValue Scale;
7360
Matt Arsenaulte4c2e9b2019-06-19 23:54:58 +00007361 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007362 // Workaround a hardware bug on SI where the condition output from div_scale
7363 // is not usable.
7364
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007365 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007366
7367 // Figure out if the scale to use for div_fmas.
7368 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
7369 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
7370 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
7371 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
7372
7373 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
7374 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
7375
7376 SDValue Scale0Hi
7377 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
7378 SDValue Scale1Hi
7379 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
7380
7381 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
7382 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
7383 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
7384 } else {
7385 Scale = DivScale1.getValue(1);
7386 }
7387
7388 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
7389 Fma4, Fma3, Mul, Scale);
7390
7391 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007392}
7393
7394SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
7395 EVT VT = Op.getValueType();
7396
7397 if (VT == MVT::f32)
7398 return LowerFDIV32(Op, DAG);
7399
7400 if (VT == MVT::f64)
7401 return LowerFDIV64(Op, DAG);
7402
Matt Arsenault4052a572016-12-22 03:05:41 +00007403 if (VT == MVT::f16)
7404 return LowerFDIV16(Op, DAG);
7405
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007406 llvm_unreachable("Unexpected type for fdiv");
7407}
7408
Tom Stellard81d871d2013-11-13 23:36:50 +00007409SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
7410 SDLoc DL(Op);
7411 StoreSDNode *Store = cast<StoreSDNode>(Op);
7412 EVT VT = Store->getMemoryVT();
7413
Matt Arsenault95245662016-02-11 05:32:46 +00007414 if (VT == MVT::i1) {
7415 return DAG.getTruncStore(Store->getChain(), DL,
7416 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
7417 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00007418 }
7419
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007420 assert(VT.isVector() &&
7421 Store->getValue().getValueType().getScalarType() == MVT::i32);
7422
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007423 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
Simon Pilgrim266f4392019-06-11 11:00:23 +00007424 *Store->getMemOperand())) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007425 return expandUnalignedStore(Store, DAG);
7426 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007427
Simon Pilgrim266f4392019-06-11 11:00:23 +00007428 unsigned AS = Store->getAddressSpace();
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00007429 if (Subtarget->hasLDSMisalignedBug() &&
7430 AS == AMDGPUAS::FLAT_ADDRESS &&
7431 Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
7432 return SplitVectorStore(Op, DAG);
7433 }
7434
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007435 MachineFunction &MF = DAG.getMachineFunction();
7436 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7437 // If there is a possibility that flat instructions access scratch memory
7438 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00007439 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007440 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00007441 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007442
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007443 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenault0da63502018-08-31 05:49:54 +00007444 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
7445 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007446 if (NumElements > 4)
7447 return SplitVectorStore(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00007448 // v3 stores not supported on SI.
7449 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7450 return SplitVectorStore(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007451 return SDValue();
Matt Arsenault0da63502018-08-31 05:49:54 +00007452 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007453 switch (Subtarget->getMaxPrivateElementSize()) {
7454 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00007455 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007456 case 8:
7457 if (NumElements > 2)
7458 return SplitVectorStore(Op, DAG);
7459 return SDValue();
7460 case 16:
Tim Renouf361b5b22019-03-21 12:01:21 +00007461 if (NumElements > 4 || NumElements == 3)
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007462 return SplitVectorStore(Op, DAG);
7463 return SDValue();
7464 default:
7465 llvm_unreachable("unsupported private_element_size");
7466 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007467 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007468 // Use ds_write_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00007469 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
Tim Renouf361b5b22019-03-21 12:01:21 +00007470 VT.getStoreSize() == 16 && NumElements != 3)
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007471 return SDValue();
7472
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007473 if (NumElements > 2)
7474 return SplitVectorStore(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00007475
7476 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7477 // address is negative, then the instruction is incorrectly treated as
7478 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7479 // stores here to avoid emitting ds_write2_b32. We may re-combine the
7480 // store later in the SILoadStoreOptimizer.
Matt Arsenaulte4c2e9b2019-06-19 23:54:58 +00007481 if (!Subtarget->hasUsableDSOffset() &&
Nicolai Haehnle48219372018-10-17 15:37:48 +00007482 NumElements == 2 && VT.getStoreSize() == 8 &&
7483 Store->getAlignment() < 8) {
7484 return SplitVectorStore(Op, DAG);
7485 }
7486
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007487 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007488 } else {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007489 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00007490 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007491}
7492
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007493SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007494 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007495 EVT VT = Op.getValueType();
7496 SDValue Arg = Op.getOperand(0);
David Stuttard20de3e92018-09-14 10:27:19 +00007497 SDValue TrigVal;
7498
Sanjay Patela2607012015-09-16 16:31:21 +00007499 // TODO: Should this propagate fast-math-flags?
David Stuttard20de3e92018-09-14 10:27:19 +00007500
7501 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7502
7503 if (Subtarget->hasTrigReducedRange()) {
7504 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7505 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7506 } else {
7507 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7508 }
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007509
7510 switch (Op.getOpcode()) {
7511 case ISD::FCOS:
David Stuttard20de3e92018-09-14 10:27:19 +00007512 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007513 case ISD::FSIN:
David Stuttard20de3e92018-09-14 10:27:19 +00007514 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007515 default:
7516 llvm_unreachable("Wrong trig opcode");
7517 }
7518}
7519
Tom Stellard354a43c2016-04-01 18:27:37 +00007520SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7521 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7522 assert(AtomicNode->isCompareAndSwap());
7523 unsigned AS = AtomicNode->getAddressSpace();
7524
7525 // No custom lowering required for local address space
Matt Arsenault0da63502018-08-31 05:49:54 +00007526 if (!isFlatGlobalAddrSpace(AS))
Tom Stellard354a43c2016-04-01 18:27:37 +00007527 return Op;
7528
7529 // Non-local address space requires custom lowering for atomic compare
7530 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
7531 SDLoc DL(Op);
7532 SDValue ChainIn = Op.getOperand(0);
7533 SDValue Addr = Op.getOperand(1);
7534 SDValue Old = Op.getOperand(2);
7535 SDValue New = Op.getOperand(3);
7536 EVT VT = Op.getValueType();
7537 MVT SimpleVT = VT.getSimpleVT();
7538 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7539
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007540 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00007541 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00007542
7543 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7544 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00007545}
7546
Tom Stellard75aadc22012-12-11 21:25:42 +00007547//===----------------------------------------------------------------------===//
7548// Custom DAG optimizations
7549//===----------------------------------------------------------------------===//
7550
Matt Arsenault364a6742014-06-11 17:50:44 +00007551SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00007552 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00007553 EVT VT = N->getValueType(0);
7554 EVT ScalarVT = VT.getScalarType();
7555 if (ScalarVT != MVT::f32)
7556 return SDValue();
7557
7558 SelectionDAG &DAG = DCI.DAG;
7559 SDLoc DL(N);
7560
7561 SDValue Src = N->getOperand(0);
7562 EVT SrcVT = Src.getValueType();
7563
7564 // TODO: We could try to match extracting the higher bytes, which would be
7565 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7566 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7567 // about in practice.
Craig Topper80d3bb32018-03-06 19:44:52 +00007568 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
Matt Arsenault364a6742014-06-11 17:50:44 +00007569 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7570 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7571 DCI.AddToWorklist(Cvt.getNode());
7572 return Cvt;
7573 }
7574 }
7575
Matt Arsenault364a6742014-06-11 17:50:44 +00007576 return SDValue();
7577}
7578
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007579// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7580
7581// This is a variant of
7582// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7583//
7584 // The normal DAG combiner will do this, but only if the add has one use, since
7585 // doing it otherwise would increase the number of instructions.
7586//
7587// This prevents us from seeing a constant offset that can be folded into a
7588// memory instruction's addressing mode. If we know the resulting add offset of
7589// a pointer can be folded into an addressing offset, we can replace the pointer
7590 // operand with the add of the new constant offset. This eliminates one of the uses,
7591// and may allow the remaining use to also be simplified.
7592//
7593SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7594 unsigned AddrSpace,
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007595 EVT MemVT,
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007596 DAGCombinerInfo &DCI) const {
7597 SDValue N0 = N->getOperand(0);
7598 SDValue N1 = N->getOperand(1);
7599
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007600 // We only do this when the add has multiple uses, where it is profitable;
7601 // with a single use, defer to the standard combine.
Matt Arsenaultc8903122017-11-14 23:46:42 +00007602 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7603 N0->hasOneUse())
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007604 return SDValue();
7605
7606 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7607 if (!CN1)
7608 return SDValue();
7609
7610 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7611 if (!CAdd)
7612 return SDValue();
7613
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007614 // If the resulting offset is too large, we can't fold it into the addressing
7615 // mode offset.
7616 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007617 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7618
7619 AddrMode AM;
7620 AM.HasBaseReg = true;
7621 AM.BaseOffs = Offset.getSExtValue();
7622 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007623 return SDValue();
7624
7625 SelectionDAG &DAG = DCI.DAG;
7626 SDLoc SL(N);
7627 EVT VT = N->getValueType(0);
7628
7629 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007630 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007631
Matt Arsenaulte5e0c742017-11-13 05:33:35 +00007632 SDNodeFlags Flags;
7633 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7634 (N0.getOpcode() == ISD::OR ||
7635 N0->getFlags().hasNoUnsignedWrap()));
7636
7637 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007638}
7639
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007640SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7641 DAGCombinerInfo &DCI) const {
7642 SDValue Ptr = N->getBasePtr();
7643 SelectionDAG &DAG = DCI.DAG;
7644 SDLoc SL(N);
7645
7646 // TODO: We could also do this for multiplies.
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007647 if (Ptr.getOpcode() == ISD::SHL) {
7648 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7649 N->getMemoryVT(), DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007650 if (NewPtr) {
7651 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7652
7653 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7654 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7655 }
7656 }
7657
7658 return SDValue();
7659}
7660
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007661static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7662 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7663 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7664 (Opc == ISD::XOR && Val == 0);
7665}
7666
7667// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
7668// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7669// integer combine opportunities since most 64-bit operations are decomposed
7670// this way. TODO: We won't want this for SALU especially if it is an inline
7671// immediate.
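// An illustrative case: (and i64:x, 0x00000000ffffffff) splits into an
// untouched low half and a high half that folds to zero, so no 64-bit
// immediate ever has to be materialized.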
7672SDValue SITargetLowering::splitBinaryBitConstantOp(
7673 DAGCombinerInfo &DCI,
7674 const SDLoc &SL,
7675 unsigned Opc, SDValue LHS,
7676 const ConstantSDNode *CRHS) const {
7677 uint64_t Val = CRHS->getZExtValue();
7678 uint32_t ValLo = Lo_32(Val);
7679 uint32_t ValHi = Hi_32(Val);
7680 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7681
7682 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7683 bitOpWithConstantIsReducible(Opc, ValHi)) ||
7684 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7685 // If we need to materialize a 64-bit immediate, it will be split up later
7686 // anyway. Avoid creating the harder to understand 64-bit immediate
7687 // materialization.
7688 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7689 }
7690
7691 return SDValue();
7692}
7693
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007694// Returns true if argument is a boolean value which is not serialized into
7695// memory or an argument, and does not require v_cndmask_b32 to be deserialized.
7696static bool isBoolSGPR(SDValue V) {
7697 if (V.getValueType() != MVT::i1)
7698 return false;
7699 switch (V.getOpcode()) {
7700 default: break;
7701 case ISD::SETCC:
7702 case ISD::AND:
7703 case ISD::OR:
7704 case ISD::XOR:
7705 case AMDGPUISD::FP_CLASS:
7706 return true;
7707 }
7708 return false;
7709}
7710
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007711// If a constant has all zeroes or all ones within each byte return it.
7712// Otherwise return 0.
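// For instance, 0x00ff00ff is returned unchanged, while 0x00f0ffff yields 0
// because its third byte (0xf0) selects only part of that byte.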
7713static uint32_t getConstantPermuteMask(uint32_t C) {
7714 // 0xff for any zero byte in the mask
7715 uint32_t ZeroByteMask = 0;
7716 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7717 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7718 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7719 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7720 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7721 if ((NonZeroByteMask & C) != NonZeroByteMask)
7722 return 0; // Partial bytes selected.
7723 return C;
7724}
7725
7726// Check if a node selects whole bytes from its operand 0 starting at a byte
7727// boundary while masking the rest. Returns the select mask as used by
7728// v_perm_b32, or ~0 if the node does not match.
7729// Note byte select encoding:
7730// value 0-3 selects corresponding source byte;
7731// value 0xc selects zero;
7732// value 0xff selects 0xff.
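// Two illustrative encodings: (and x, 0x0000ffff) keeps bytes 1:0 and zeroes
// bytes 3:2, giving mask 0x0c0c0100; (shl x, 16) moves bytes 1:0 into bytes
// 3:2 and zero-fills the low half, giving mask 0x01000c0c.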
7733static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7734 assert(V.getValueSizeInBits() == 32);
7735
7736 if (V.getNumOperands() != 2)
7737 return ~0;
7738
7739 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7740 if (!N1)
7741 return ~0;
7742
7743 uint32_t C = N1->getZExtValue();
7744
7745 switch (V.getOpcode()) {
7746 default:
7747 break;
7748 case ISD::AND:
7749 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7750 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7751 }
7752 break;
7753
7754 case ISD::OR:
7755 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7756 return (0x03020100 & ~ConstMask) | ConstMask;
7757 }
7758 break;
7759
7760 case ISD::SHL:
7761 if (C % 8)
7762 return ~0;
7763
7764 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7765
7766 case ISD::SRL:
7767 if (C % 8)
7768 return ~0;
7769
7770 return uint32_t(0x0c0c0c0c03020100ull >> C);
7771 }
7772
7773 return ~0;
7774}
7775
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007776SDValue SITargetLowering::performAndCombine(SDNode *N,
7777 DAGCombinerInfo &DCI) const {
7778 if (DCI.isBeforeLegalize())
7779 return SDValue();
7780
7781 SelectionDAG &DAG = DCI.DAG;
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007782 EVT VT = N->getValueType(0);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007783 SDValue LHS = N->getOperand(0);
7784 SDValue RHS = N->getOperand(1);
7785
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007786
Stanislav Mekhanoshin53a21292017-05-23 19:54:48 +00007787 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7788 if (VT == MVT::i64 && CRHS) {
7789 if (SDValue Split
7790 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7791 return Split;
7792 }
7793
7794 if (CRHS && VT == MVT::i32) {
7795 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7796 // nb = number of trailing zeroes in mask
7797 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7798 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
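    // For instance (illustrative): (and (srl x, 8), 0xff00) becomes
    // (shl (bfe x, 16, 8), 8), a byte-aligned 8-bit field extract.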
7799 uint64_t Mask = CRHS->getZExtValue();
7800 unsigned Bits = countPopulation(Mask);
7801 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7802 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7803 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7804 unsigned Shift = CShift->getZExtValue();
7805 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7806 unsigned Offset = NB + Shift;
7807 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7808 SDLoc SL(N);
7809 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7810 LHS->getOperand(0),
7811 DAG.getConstant(Offset, SL, MVT::i32),
7812 DAG.getConstant(Bits, SL, MVT::i32));
7813 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7814 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7815 DAG.getValueType(NarrowVT));
7816 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7817 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7818 return Shl;
7819 }
7820 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007821 }
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007822
7823 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7824 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7825 isa<ConstantSDNode>(LHS.getOperand(2))) {
7826 uint32_t Sel = getConstantPermuteMask(Mask);
7827 if (!Sel)
7828 return SDValue();
7829
7830 // Select 0xc for all zero bytes
7831 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7832 SDLoc DL(N);
7833 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7834 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7835 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007836 }
7837
7838 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7839 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7840 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007841 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7842 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7843
7844 SDValue X = LHS.getOperand(0);
7845 SDValue Y = RHS.getOperand(0);
7846 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7847 return SDValue();
7848
7849 if (LCC == ISD::SETO) {
7850 if (X != LHS.getOperand(1))
7851 return SDValue();
7852
7853 if (RCC == ISD::SETUNE) {
7854 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7855 if (!C1 || !C1->isInfinity() || C1->isNegative())
7856 return SDValue();
7857
7858 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7859 SIInstrFlags::N_SUBNORMAL |
7860 SIInstrFlags::N_ZERO |
7861 SIInstrFlags::P_ZERO |
7862 SIInstrFlags::P_SUBNORMAL |
7863 SIInstrFlags::P_NORMAL;
7864
7865 static_assert(((~(SIInstrFlags::S_NAN |
7866 SIInstrFlags::Q_NAN |
7867 SIInstrFlags::N_INFINITY |
7868 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7869 "mask not equal");
7870
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007871 SDLoc DL(N);
7872 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7873 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007874 }
7875 }
7876 }
7877
Matt Arsenault3dcf4ce2018-08-10 18:58:56 +00007878 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7879 std::swap(LHS, RHS);
7880
7881 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7882 RHS.hasOneUse()) {
7883 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7884 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7885 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7886 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7887 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7888 (RHS.getOperand(0) == LHS.getOperand(0) &&
7889 LHS.getOperand(0) == LHS.getOperand(1))) {
7890 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7891 unsigned NewMask = LCC == ISD::SETO ?
7892 Mask->getZExtValue() & ~OrdMask :
7893 Mask->getZExtValue() & OrdMask;
7894
7895 SDLoc DL(N);
7896 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7897 DAG.getConstant(NewMask, DL, MVT::i32));
7898 }
7899 }
7900
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007901 if (VT == MVT::i32 &&
7902 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7903 // and x, (sext cc from i1) => select cc, x, 0
7904 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7905 std::swap(LHS, RHS);
7906 if (isBoolSGPR(RHS.getOperand(0)))
7907 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7908 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7909 }
7910
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007911 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7912 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7913 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7914 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7915 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7916 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7917 if (LHSMask != ~0u && RHSMask != ~0u) {
7918 // Canonicalize the expression in an attempt to have fewer unique masks
7919 // and therefore fewer registers used to hold the masks.
7920 if (LHSMask > RHSMask) {
7921 std::swap(LHSMask, RHSMask);
7922 std::swap(LHS, RHS);
7923 }
7924
7925 // Select 0xc for each lane actually used from the source operand. In the
7926 // mask, zero bytes are 0xc, 0xff bytes are 0xff, and used lanes are 0-3.
7927 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7928 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7929
7930 // Check if we need to combine values from two sources within a byte.
7931 if (!(LHSUsedLanes & RHSUsedLanes) &&
7932 // If we select the high and low words, keep it for SDWA.
7933 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7934 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7935 // Each byte of each mask is either a selector value 0-3, 0xff, or 0x0c
7936 // (for zero). If either mask has 0x0c in a byte, the combined mask must
7937 // keep 0x0c there; otherwise the mask byte that is not 0xff wins. ANDing
7938 // the two masks gives the correct result for every byte, except that
7939 // bytes which should be 0x0c must be corrected to exactly 0x0c.
7940 uint32_t Mask = LHSMask & RHSMask;
7941 for (unsigned I = 0; I < 32; I += 8) {
7942 uint32_t ByteSel = 0xff << I;
7943 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7944 Mask &= (0x0c << I) & 0xffffffff;
7945 }
7946
7947 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7948 // or 0x0c.
7949 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7950 SDLoc DL(N);
7951
7952 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7953 LHS.getOperand(0), RHS.getOperand(0),
7954 DAG.getConstant(Sel, DL, MVT::i32));
7955 }
7956 }
7957 }
7958
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007959 return SDValue();
7960}
7961
Matt Arsenaultf2290332015-01-06 23:00:39 +00007962SDValue SITargetLowering::performOrCombine(SDNode *N,
7963 DAGCombinerInfo &DCI) const {
7964 SelectionDAG &DAG = DCI.DAG;
7965 SDValue LHS = N->getOperand(0);
7966 SDValue RHS = N->getOperand(1);
7967
Matt Arsenault3b082382016-04-12 18:24:38 +00007968 EVT VT = N->getValueType(0);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007969 if (VT == MVT::i1) {
7970 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7971 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7972 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7973 SDValue Src = LHS.getOperand(0);
7974 if (Src != RHS.getOperand(0))
7975 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007976
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007977 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7978 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7979 if (!CLHS || !CRHS)
7980 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007981
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007982 // Only 10 bits are used.
7983 static const uint32_t MaxMask = 0x3ff;
Matt Arsenault3b082382016-04-12 18:24:38 +00007984
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007985 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7986 SDLoc DL(N);
7987 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7988 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7989 }
Matt Arsenault3b082382016-04-12 18:24:38 +00007990
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007991 return SDValue();
7992 }
7993
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007994 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7995 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7996 LHS.getOpcode() == AMDGPUISD::PERM &&
7997 isa<ConstantSDNode>(LHS.getOperand(2))) {
7998 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7999 if (!Sel)
8000 return SDValue();
8001
8002 Sel |= LHS.getConstantOperandVal(2);
8003 SDLoc DL(N);
8004 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
8005 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
8006 }
8007
8008 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
8009 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8010 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
8011 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
8012 uint32_t LHSMask = getPermuteMask(DAG, LHS);
8013 uint32_t RHSMask = getPermuteMask(DAG, RHS);
8014 if (LHSMask != ~0u && RHSMask != ~0u) {
8015 // Canonicalize the expression in an attempt to have fewer unique masks
8016 // and therefore fewer registers used to hold the masks.
8017 if (LHSMask > RHSMask) {
8018 std::swap(LHSMask, RHSMask);
8019 std::swap(LHS, RHS);
8020 }
8021
8022 // Select 0xc for each lane actually used from the source operand. In the
8023 // mask, zero bytes are 0xc, 0xff bytes are 0xff, and used lanes are 0-3.
8024 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8025 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8026
8027 // Check if we need to combine values from two sources within a byte.
8028 if (!(LHSUsedLanes & RHSUsedLanes) &&
8029 // If we select the high and low words, keep it for SDWA.
8030 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
8031 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
8032 // Kill zero bytes selected by other mask. Zero value is 0xc.
8033 LHSMask &= ~RHSUsedLanes;
8034 RHSMask &= ~LHSUsedLanes;
8035 // Add 4 to each active LHS lane
8036 LHSMask |= LHSUsedLanes & 0x04040404;
8037 // Combine masks
8038 uint32_t Sel = LHSMask | RHSMask;
8039 SDLoc DL(N);
8040
8041 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
8042 LHS.getOperand(0), RHS.getOperand(0),
8043 DAG.getConstant(Sel, DL, MVT::i32));
8044 }
8045 }
8046 }
8047
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00008048 if (VT != MVT::i64)
8049 return SDValue();
8050
8051 // TODO: This could be a generic combine with a predicate for extracting the
8052 // high half of an integer being free.
8053
8054 // (or i64:x, (zero_extend i32:y)) ->
8055 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
8056 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
8057 RHS.getOpcode() != ISD::ZERO_EXTEND)
8058 std::swap(LHS, RHS);
8059
8060 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
8061 SDValue ExtSrc = RHS.getOperand(0);
8062 EVT SrcVT = ExtSrc.getValueType();
8063 if (SrcVT == MVT::i32) {
8064 SDLoc SL(N);
8065 SDValue LowLHS, HiBits;
8066 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
8067 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
8068
8069 DCI.AddToWorklist(LowOr.getNode());
8070 DCI.AddToWorklist(HiBits.getNode());
8071
8072 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
8073 LowOr, HiBits);
8074 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
Matt Arsenault3b082382016-04-12 18:24:38 +00008075 }
8076 }
8077
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00008078 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
8079 if (CRHS) {
8080 if (SDValue Split
8081 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
8082 return Split;
8083 }
Matt Arsenaultf2290332015-01-06 23:00:39 +00008084
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00008085 return SDValue();
8086}
Matt Arsenaultf2290332015-01-06 23:00:39 +00008087
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00008088SDValue SITargetLowering::performXorCombine(SDNode *N,
8089 DAGCombinerInfo &DCI) const {
8090 EVT VT = N->getValueType(0);
8091 if (VT != MVT::i64)
8092 return SDValue();
Matt Arsenaultf2290332015-01-06 23:00:39 +00008093
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00008094 SDValue LHS = N->getOperand(0);
8095 SDValue RHS = N->getOperand(1);
8096
8097 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
8098 if (CRHS) {
8099 if (SDValue Split
8100 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
8101 return Split;
Matt Arsenaultf2290332015-01-06 23:00:39 +00008102 }
8103
8104 return SDValue();
8105}
8106
Matt Arsenault5cf42712017-04-06 20:58:30 +00008107// Instructions that will be lowered with a final instruction that zeros the
8108// high result bits.
8109// XXX - probably only need to list legal operations.
Matt Arsenault8edfaee2017-03-31 19:53:03 +00008110static bool fp16SrcZerosHighBits(unsigned Opc) {
8111 switch (Opc) {
Matt Arsenault5cf42712017-04-06 20:58:30 +00008112 case ISD::FADD:
8113 case ISD::FSUB:
8114 case ISD::FMUL:
8115 case ISD::FDIV:
8116 case ISD::FREM:
8117 case ISD::FMA:
8118 case ISD::FMAD:
8119 case ISD::FCANONICALIZE:
8120 case ISD::FP_ROUND:
8121 case ISD::UINT_TO_FP:
8122 case ISD::SINT_TO_FP:
8123 case ISD::FABS:
8124 // Fabs is lowered to a bit operation, but it's an and which will clear the
8125 // high bits anyway.
8126 case ISD::FSQRT:
8127 case ISD::FSIN:
8128 case ISD::FCOS:
8129 case ISD::FPOWI:
8130 case ISD::FPOW:
8131 case ISD::FLOG:
8132 case ISD::FLOG2:
8133 case ISD::FLOG10:
8134 case ISD::FEXP:
8135 case ISD::FEXP2:
8136 case ISD::FCEIL:
8137 case ISD::FTRUNC:
8138 case ISD::FRINT:
8139 case ISD::FNEARBYINT:
8140 case ISD::FROUND:
8141 case ISD::FFLOOR:
8142 case ISD::FMINNUM:
8143 case ISD::FMAXNUM:
8144 case AMDGPUISD::FRACT:
8145 case AMDGPUISD::CLAMP:
8146 case AMDGPUISD::COS_HW:
8147 case AMDGPUISD::SIN_HW:
8148 case AMDGPUISD::FMIN3:
8149 case AMDGPUISD::FMAX3:
8150 case AMDGPUISD::FMED3:
8151 case AMDGPUISD::FMAD_FTZ:
8152 case AMDGPUISD::RCP:
8153 case AMDGPUISD::RSQ:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00008154 case AMDGPUISD::RCP_IFLAG:
Matt Arsenault5cf42712017-04-06 20:58:30 +00008155 case AMDGPUISD::LDEXP:
Matt Arsenault8edfaee2017-03-31 19:53:03 +00008156 return true;
Matt Arsenault5cf42712017-04-06 20:58:30 +00008157 default:
8158 // fcopysign, select and others may be lowered to 32-bit bit operations
8159 // which don't zero the high bits.
8160 return false;
Matt Arsenault8edfaee2017-03-31 19:53:03 +00008161 }
8162}
8163
8164SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
8165 DAGCombinerInfo &DCI) const {
8166 if (!Subtarget->has16BitInsts() ||
8167 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8168 return SDValue();
8169
8170 EVT VT = N->getValueType(0);
8171 if (VT != MVT::i32)
8172 return SDValue();
8173
8174 SDValue Src = N->getOperand(0);
8175 if (Src.getValueType() != MVT::i16)
8176 return SDValue();
8177
8178 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
8179 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
8180 if (Src.getOpcode() == ISD::BITCAST) {
8181 SDValue BCSrc = Src.getOperand(0);
8182 if (BCSrc.getValueType() == MVT::f16 &&
8183 fp16SrcZerosHighBits(BCSrc.getOpcode()))
8184 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
8185 }
8186
8187 return SDValue();
8188}
8189
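// Fold (sign_extend_inreg (buffer_load_ubyte/ushort)) into the corresponding
// signed buffer load (BUFFER_LOAD_BYTE/BUFFER_LOAD_SHORT) so the load itself
// performs the sign extension.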
Ryan Taylor00e063a2019-03-19 16:07:00 +00008190SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
8191 DAGCombinerInfo &DCI)
8192 const {
8193 SDValue Src = N->getOperand(0);
8194 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
8195
8196 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
8197 VTSign->getVT() == MVT::i8) ||
8198 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
8199 VTSign->getVT() == MVT::i16)) &&
8200 Src.hasOneUse()) {
8201 auto *M = cast<MemSDNode>(Src);
8202 SDValue Ops[] = {
8203 Src.getOperand(0), // Chain
8204 Src.getOperand(1), // rsrc
8205 Src.getOperand(2), // vindex
8206 Src.getOperand(3), // voffset
8207 Src.getOperand(4), // soffset
8208 Src.getOperand(5), // offset
8209 Src.getOperand(6),
8210 Src.getOperand(7)
8211 };
8212 // replace with BUFFER_LOAD_BYTE/SHORT
8213 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
8214 Src.getOperand(0).getValueType());
8215 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
8216 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
8217 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
8218 ResList,
8219 Ops, M->getMemoryVT(),
8220 M->getMemOperand());
8221 return DCI.DAG.getMergeValues({BufferLoadSignExt,
8222 BufferLoadSignExt.getValue(1)}, SDLoc(N));
8223 }
8224 return SDValue();
8225}
8226
Matt Arsenaultf2290332015-01-06 23:00:39 +00008227SDValue SITargetLowering::performClassCombine(SDNode *N,
8228 DAGCombinerInfo &DCI) const {
8229 SelectionDAG &DAG = DCI.DAG;
8230 SDValue Mask = N->getOperand(1);
8231
8232 // fp_class x, 0 -> false
8233 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
8234 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00008235 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00008236 }
8237
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00008238 if (N->getOperand(0).isUndef())
8239 return DAG.getUNDEF(MVT::i1);
8240
Matt Arsenaultf2290332015-01-06 23:00:39 +00008241 return SDValue();
8242}
8243
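// rcp(undef) folds to undef. For f32, an rcp whose source is an
// integer-to-float conversion is rewritten to the RCP_IFLAG form.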
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00008244SDValue SITargetLowering::performRcpCombine(SDNode *N,
8245 DAGCombinerInfo &DCI) const {
8246 EVT VT = N->getValueType(0);
8247 SDValue N0 = N->getOperand(0);
8248
8249 if (N0.isUndef())
8250 return N0;
8251
8252 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
8253 N0.getOpcode() == ISD::SINT_TO_FP)) {
8254 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
8255 N->getFlags());
8256 }
8257
8258 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
8259}
8260
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008261bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
8262 unsigned MaxDepth) const {
8263 unsigned Opcode = Op.getOpcode();
8264 if (Opcode == ISD::FCANONICALIZE)
8265 return true;
8266
8267 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8268 auto F = CFP->getValueAPF();
8269 if (F.isNaN() && F.isSignaling())
8270 return false;
8271 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
8272 }
8273
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008274 // If source is a result of another standard FP operation it is already in
8275 // canonical form.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008276 if (MaxDepth == 0)
8277 return false;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008278
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008279 switch (Opcode) {
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008280 // These will flush denorms if required.
8281 case ISD::FADD:
8282 case ISD::FSUB:
8283 case ISD::FMUL:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008284 case ISD::FCEIL:
8285 case ISD::FFLOOR:
8286 case ISD::FMA:
8287 case ISD::FMAD:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008288 case ISD::FSQRT:
8289 case ISD::FDIV:
8290 case ISD::FREM:
Matt Arsenaultce6d61f2018-08-06 21:51:52 +00008291 case ISD::FP_ROUND:
8292 case ISD::FP_EXTEND:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008293 case AMDGPUISD::FMUL_LEGACY:
8294 case AMDGPUISD::FMAD_FTZ:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008295 case AMDGPUISD::RCP:
8296 case AMDGPUISD::RSQ:
8297 case AMDGPUISD::RSQ_CLAMP:
8298 case AMDGPUISD::RCP_LEGACY:
8299 case AMDGPUISD::RSQ_LEGACY:
8300 case AMDGPUISD::RCP_IFLAG:
8301 case AMDGPUISD::TRIG_PREOP:
8302 case AMDGPUISD::DIV_SCALE:
8303 case AMDGPUISD::DIV_FMAS:
8304 case AMDGPUISD::DIV_FIXUP:
8305 case AMDGPUISD::FRACT:
8306 case AMDGPUISD::LDEXP:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008307 case AMDGPUISD::CVT_PKRTZ_F16_F32:
Matt Arsenault940e6072018-08-10 19:20:17 +00008308 case AMDGPUISD::CVT_F32_UBYTE0:
8309 case AMDGPUISD::CVT_F32_UBYTE1:
8310 case AMDGPUISD::CVT_F32_UBYTE2:
8311 case AMDGPUISD::CVT_F32_UBYTE3:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008312 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008313
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008314 // These can/will be lowered or combined as bit operations, so their inputs
8315 // need to be checked recursively.
8316 case ISD::FNEG:
8317 case ISD::FABS:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008318 case ISD::FCOPYSIGN:
8319 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008320
8321 case ISD::FSIN:
8322 case ISD::FCOS:
8323 case ISD::FSINCOS:
8324 return Op.getValueType().getScalarType() != MVT::f16;
8325
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008326 case ISD::FMINNUM:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008327 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008328 case ISD::FMINNUM_IEEE:
8329 case ISD::FMAXNUM_IEEE:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008330 case AMDGPUISD::CLAMP:
8331 case AMDGPUISD::FMED3:
8332 case AMDGPUISD::FMAX3:
8333 case AMDGPUISD::FMIN3: {
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008334 // FIXME: Shouldn't treat the generic operations differently based on these.
Matt Arsenault687ec752018-10-22 16:27:27 +00008335 // However, we aren't really required to flush the result from
8336 // minnum/maxnum.
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008337
Matt Arsenault687ec752018-10-22 16:27:27 +00008338 // snans will be quieted, so we only need to worry about denormals.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008339 if (Subtarget->supportsMinMaxDenormModes() ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008340 denormalsEnabledForType(Op.getValueType()))
8341 return true;
8342
8343 // Flushing may be required.
8344 // On pre-GFX9 targets, V_MIN_F32 and others do not flush denorms. For such
8345 // targets we need to check their inputs recursively.
8346
8347 // FIXME: Does this apply with clamp? It's implemented with max.
8348 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
8349 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
8350 return false;
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008351 }
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008352
Matt Arsenault687ec752018-10-22 16:27:27 +00008353 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008354 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008355 case ISD::SELECT: {
8356 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
8357 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008358 }
Matt Arsenaulte94ee832018-08-06 22:45:51 +00008359 case ISD::BUILD_VECTOR: {
8360 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
8361 SDValue SrcOp = Op.getOperand(i);
8362 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
8363 return false;
8364 }
8365
8366 return true;
8367 }
8368 case ISD::EXTRACT_VECTOR_ELT:
8369 case ISD::EXTRACT_SUBVECTOR: {
8370 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
8371 }
8372 case ISD::INSERT_VECTOR_ELT: {
8373 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
8374 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
8375 }
8376 case ISD::UNDEF:
8377 // Could be anything.
8378 return false;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008379
Matt Arsenault687ec752018-10-22 16:27:27 +00008380 case ISD::BITCAST: {
8381 // Hack around the mess we make when legalizing extract_vector_elt.
8382 SDValue Src = Op.getOperand(0);
8383 if (Src.getValueType() == MVT::i16 &&
8384 Src.getOpcode() == ISD::TRUNCATE) {
8385 SDValue TruncSrc = Src.getOperand(0);
8386 if (TruncSrc.getValueType() == MVT::i32 &&
8387 TruncSrc.getOpcode() == ISD::BITCAST &&
8388 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
8389 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
8390 }
8391 }
8392
8393 return false;
8394 }
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008395 case ISD::INTRINSIC_WO_CHAIN: {
8396 unsigned IntrinsicID
8397 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8398 // TODO: Handle more intrinsics
8399 switch (IntrinsicID) {
8400 case Intrinsic::amdgcn_cvt_pkrtz:
Matt Arsenault940e6072018-08-10 19:20:17 +00008401 case Intrinsic::amdgcn_cubeid:
8402 case Intrinsic::amdgcn_frexp_mant:
8403 case Intrinsic::amdgcn_fdot2:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008404 return true;
8405 default:
8406 break;
8407 }
Matt Arsenault5bb9d792018-08-10 17:57:12 +00008408
8409 LLVM_FALLTHROUGH;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008410 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008411 default:
8412 return denormalsEnabledForType(Op.getValueType()) &&
8413 DAG.isKnownNeverSNaN(Op);
8414 }
8415
8416 llvm_unreachable("invalid operation");
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008417}
8418
Matt Arsenault9cd90712016-04-14 01:42:16 +00008419// Constant fold canonicalize.
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008420SDValue SITargetLowering::getCanonicalConstantFP(
8421 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
8422 // Flush denormals to 0 if not enabled.
8423 if (C.isDenormal() && !denormalsEnabledForType(VT))
8424 return DAG.getConstantFP(0.0, SL, VT);
8425
8426 if (C.isNaN()) {
8427 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
8428 if (C.isSignaling()) {
8429 // Quiet a signaling NaN.
8430 // FIXME: Is this supposed to preserve payload bits?
8431 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8432 }
8433
8434 // Make sure it is the canonical NaN bitpattern.
8435 //
8436 // TODO: Can we use -1 as the canonical NaN value since it's an inline
8437 // immediate?
8438 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
8439 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8440 }
8441
8442 // Already canonical.
8443 return DAG.getConstantFP(C, SL, VT);
8444}
8445
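// True if fcanonicalize of this build_vector element folds away: undef and
// constant FP elements are handled directly by the combine below.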
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008446static bool vectorEltWillFoldAway(SDValue Op) {
8447 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
8448}
8449
Matt Arsenault9cd90712016-04-14 01:42:16 +00008450SDValue SITargetLowering::performFCanonicalizeCombine(
8451 SDNode *N,
8452 DAGCombinerInfo &DCI) const {
Matt Arsenault9cd90712016-04-14 01:42:16 +00008453 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008454 SDValue N0 = N->getOperand(0);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008455 EVT VT = N->getValueType(0);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008456
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008457 // fcanonicalize undef -> qnan
8458 if (N0.isUndef()) {
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008459 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
8460 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
8461 }
8462
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008463 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
Matt Arsenault9cd90712016-04-14 01:42:16 +00008464 EVT VT = N->getValueType(0);
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008465 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
Matt Arsenault9cd90712016-04-14 01:42:16 +00008466 }
8467
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008468 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
8469 // (fcanonicalize k)
8470 //
8471 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
8472
8473 // TODO: This could be better with wider vectors that will be split to v2f16,
8474 // and to consider uses since there aren't that many packed operations.
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008475 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
8476 isTypeLegal(MVT::v2f16)) {
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008477 SDLoc SL(N);
8478 SDValue NewElts[2];
8479 SDValue Lo = N0.getOperand(0);
8480 SDValue Hi = N0.getOperand(1);
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008481 EVT EltVT = Lo.getValueType();
8482
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008483 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
8484 for (unsigned I = 0; I != 2; ++I) {
8485 SDValue Op = N0.getOperand(I);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008486 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8487 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
8488 CFP->getValueAPF());
8489 } else if (Op.isUndef()) {
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008490 // Handled below based on what the other operand is.
8491 NewElts[I] = Op;
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008492 } else {
8493 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
8494 }
8495 }
8496
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008497 // If one half is undef and the other is a constant, prefer a splat
8498 // vector rather than the normal qNaN. If the other half is a register,
8499 // prefer 0.0 since that's cheaper to use and may be free with a packed
8500 // operation.
8501 if (NewElts[0].isUndef()) {
8502 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
8503 NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
8504 }
8505
8506 if (NewElts[1].isUndef()) {
8507 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
8508 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
8509 }
8510
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008511 return DAG.getBuildVector(VT, SL, NewElts);
8512 }
8513 }
8514
Matt Arsenault687ec752018-10-22 16:27:27 +00008515 unsigned SrcOpc = N0.getOpcode();
8516
8517 // If it's free to do so, push canonicalizes further up the source, which may
8518 // find a canonical source.
8519 //
8520 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
8521 // sNaNs.
8522 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
8523 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
8524 if (CRHS && N0.hasOneUse()) {
8525 SDLoc SL(N);
8526 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
8527 N0.getOperand(0));
8528 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
8529 DCI.AddToWorklist(Canon0.getNode());
8530
8531 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
8532 }
8533 }
8534
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008535 return isCanonicalized(DAG, N0) ? N0 : SDValue();
Matt Arsenault9cd90712016-04-14 01:42:16 +00008536}
8537
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008538static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8539 switch (Opc) {
8540 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008541 case ISD::FMAXNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008542 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008543 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008544 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008545 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008546 return AMDGPUISD::UMAX3;
8547 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008548 case ISD::FMINNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008549 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008550 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008551 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008552 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008553 return AMDGPUISD::UMIN3;
8554 default:
8555 llvm_unreachable("Not a min/max opcode");
8556 }
8557}
8558
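// Try to match min(max(x, K0), K1) with constants K0 < K1 (signed or
// unsigned) and form med3(x, K0, K1). Op0 is the inner max node and Op1 the
// outer constant.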
Matt Arsenault10268f92017-02-27 22:40:39 +00008559SDValue SITargetLowering::performIntMed3ImmCombine(
8560 SelectionDAG &DAG, const SDLoc &SL,
8561 SDValue Op0, SDValue Op1, bool Signed) const {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008562 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8563 if (!K1)
8564 return SDValue();
8565
8566 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8567 if (!K0)
8568 return SDValue();
8569
Matt Arsenaultf639c322016-01-28 20:53:42 +00008570 if (Signed) {
8571 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8572 return SDValue();
8573 } else {
8574 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8575 return SDValue();
8576 }
8577
8578 EVT VT = K0->getValueType(0);
Matt Arsenault10268f92017-02-27 22:40:39 +00008579 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8580 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8581 return DAG.getNode(Med3Opc, SL, VT,
8582 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8583 }
Tom Stellard115a6152016-11-10 16:02:37 +00008584
Matt Arsenault10268f92017-02-27 22:40:39 +00008585 // If there isn't a 16-bit med3 operation, convert to 32-bit.
Tom Stellard115a6152016-11-10 16:02:37 +00008586 MVT NVT = MVT::i32;
8587 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8588
Matt Arsenault10268f92017-02-27 22:40:39 +00008589 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8590 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8591 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
Tom Stellard115a6152016-11-10 16:02:37 +00008592
Matt Arsenault10268f92017-02-27 22:40:39 +00008593 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8594 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008595}
8596
Matt Arsenault6b114d22017-08-30 01:20:17 +00008597static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8598 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8599 return C;
8600
8601 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8602 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8603 return C;
8604 }
8605
8606 return nullptr;
8607}
8608
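// FP counterpart of the med3 matching: Op0 is the inner max(x, K0) and Op1 is
// the outer constant K1, requiring K0 <= K1. A (0.0, 1.0) pair becomes CLAMP
// when DX10 clamp is enabled.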
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008609SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8610 const SDLoc &SL,
8611 SDValue Op0,
8612 SDValue Op1) const {
Matt Arsenault6b114d22017-08-30 01:20:17 +00008613 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008614 if (!K1)
8615 return SDValue();
8616
Matt Arsenault6b114d22017-08-30 01:20:17 +00008617 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
Matt Arsenaultf639c322016-01-28 20:53:42 +00008618 if (!K0)
8619 return SDValue();
8620
8621 // Ordered >= (although NaN inputs should have folded away by now).
8622 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8623 if (Cmp == APFloat::cmpGreaterThan)
8624 return SDValue();
8625
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008626 const MachineFunction &MF = DAG.getMachineFunction();
8627 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8628
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008629 // TODO: Check IEEE bit enabled?
Matt Arsenault6b114d22017-08-30 01:20:17 +00008630 EVT VT = Op0.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008631 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008632 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8633 // hardware fmed3 behavior converting to a min.
8634 // FIXME: Should this be allowing -0.0?
8635 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8636 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8637 }
8638
Matt Arsenault6b114d22017-08-30 01:20:17 +00008639 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8640 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8641 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8642 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8643 // then give the other result, which is different from med3 with a NaN
8644 // input.
8645 SDValue Var = Op0.getOperand(0);
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +00008646 if (!DAG.isKnownNeverSNaN(Var))
Matt Arsenault6b114d22017-08-30 01:20:17 +00008647 return SDValue();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008648
Matt Arsenaultebf46142018-09-18 02:34:54 +00008649 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8650
8651 if ((!K0->hasOneUse() ||
8652 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8653 (!K1->hasOneUse() ||
8654 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8655 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8656 Var, SDValue(K0, 0), SDValue(K1, 0));
8657 }
Matt Arsenault6b114d22017-08-30 01:20:17 +00008658 }
Matt Arsenaultf639c322016-01-28 20:53:42 +00008659
Matt Arsenault6b114d22017-08-30 01:20:17 +00008660 return SDValue();
Matt Arsenaultf639c322016-01-28 20:53:42 +00008661}
8662
8663SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8664 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008665 SelectionDAG &DAG = DCI.DAG;
8666
Matt Arsenault79a45db2017-02-22 23:53:37 +00008667 EVT VT = N->getValueType(0);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008668 unsigned Opc = N->getOpcode();
8669 SDValue Op0 = N->getOperand(0);
8670 SDValue Op1 = N->getOperand(1);
8671
8672 // Only do this if the inner op has one use since this will just increase
8673 // register pressure for no benefit.
8674
Matt Arsenault79a45db2017-02-22 23:53:37 +00008675 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
Neil Henninge85f6bd2019-03-19 15:50:24 +00008676 !VT.isVector() &&
8677 (VT == MVT::i32 || VT == MVT::f32 ||
8678 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
Matt Arsenault5b39b342016-01-28 20:53:48 +00008679 // max(max(a, b), c) -> max3(a, b, c)
8680 // min(min(a, b), c) -> min3(a, b, c)
8681 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8682 SDLoc DL(N);
8683 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8684 DL,
8685 N->getValueType(0),
8686 Op0.getOperand(0),
8687 Op0.getOperand(1),
8688 Op1);
8689 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008690
Matt Arsenault5b39b342016-01-28 20:53:48 +00008691 // Try commuted.
8692 // max(a, max(b, c)) -> max3(a, b, c)
8693 // min(a, min(b, c)) -> min3(a, b, c)
8694 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8695 SDLoc DL(N);
8696 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8697 DL,
8698 N->getValueType(0),
8699 Op0,
8700 Op1.getOperand(0),
8701 Op1.getOperand(1));
8702 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008703 }
8704
Matt Arsenaultf639c322016-01-28 20:53:42 +00008705 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
8706 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8707 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8708 return Med3;
8709 }
8710
8711 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8712 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8713 return Med3;
8714 }
8715
8716 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
Matt Arsenault5b39b342016-01-28 20:53:48 +00008717 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008718 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
Matt Arsenault5b39b342016-01-28 20:53:48 +00008719 (Opc == AMDGPUISD::FMIN_LEGACY &&
8720 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenault79a45db2017-02-22 23:53:37 +00008721 (VT == MVT::f32 || VT == MVT::f64 ||
Matt Arsenault6b114d22017-08-30 01:20:17 +00008722 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8723 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008724 Op0.hasOneUse()) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008725 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8726 return Res;
8727 }
8728
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008729 return SDValue();
8730}
8731
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008732static bool isClampZeroToOne(SDValue A, SDValue B) {
8733 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8734 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8735 // FIXME: Should this be allowing -0.0?
8736 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8737 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8738 }
8739 }
8740
8741 return false;
8742}
8743
8744// FIXME: Should only worry about snans for version with chain.
8745SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8746 DAGCombinerInfo &DCI) const {
8747 EVT VT = N->getValueType(0);
8748 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8749 // NaNs. With a NaN input, the order of the operands may change the result.
8750
8751 SelectionDAG &DAG = DCI.DAG;
8752 SDLoc SL(N);
8753
8754 SDValue Src0 = N->getOperand(0);
8755 SDValue Src1 = N->getOperand(1);
8756 SDValue Src2 = N->getOperand(2);
8757
8758 if (isClampZeroToOne(Src0, Src1)) {
8759 // const_a, const_b, x -> clamp is safe in all cases including signaling
8760 // nans.
8761 // FIXME: Should this be allowing -0.0?
8762 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8763 }
8764
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008765 const MachineFunction &MF = DAG.getMachineFunction();
8766 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8767
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008768 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8769 // handling no dx10-clamp?
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008770 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008771 // If NaNs are clamped to 0, we are free to reorder the inputs.
8772
8773 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8774 std::swap(Src0, Src1);
8775
8776 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8777 std::swap(Src1, Src2);
8778
8779 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8780 std::swap(Src0, Src1);
8781
8782 if (isClampZeroToOne(Src1, Src2))
8783 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8784 }
8785
8786 return SDValue();
8787}
8788
Matt Arsenault1f17c662017-02-22 00:27:34 +00008789SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8790 DAGCombinerInfo &DCI) const {
8791 SDValue Src0 = N->getOperand(0);
8792 SDValue Src1 = N->getOperand(1);
8793 if (Src0.isUndef() && Src1.isUndef())
8794 return DCI.DAG.getUNDEF(N->getValueType(0));
8795 return SDValue();
8796}
8797
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008798SDValue SITargetLowering::performExtractVectorEltCombine(
8799 SDNode *N, DAGCombinerInfo &DCI) const {
8800 SDValue Vec = N->getOperand(0);
Matt Arsenault8cbb4882017-09-20 21:01:24 +00008801 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008802
8803 EVT VecVT = Vec.getValueType();
8804 EVT EltVT = VecVT.getVectorElementType();
8805
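// (extract_vector_elt (fneg/fabs v), i) -> (fneg/fabs (extract_vector_elt v, i))
// when every use of the extract can absorb the source modifier.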
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008806 if ((Vec.getOpcode() == ISD::FNEG ||
8807 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008808 SDLoc SL(N);
8809 EVT EltVT = N->getValueType(0);
8810 SDValue Idx = N->getOperand(1);
8811 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8812 Vec.getOperand(0), Idx);
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008813 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008814 }
8815
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008816 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8817 // =>
8818 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8819 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8820 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008821 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008822 SDLoc SL(N);
8823 EVT EltVT = N->getValueType(0);
8824 SDValue Idx = N->getOperand(1);
8825 unsigned Opc = Vec.getOpcode();
8826
8827 switch(Opc) {
8828 default:
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008829 break;
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008830 // TODO: Support other binary operations.
8831 case ISD::FADD:
Matt Arsenaulta8160732018-08-15 21:34:06 +00008832 case ISD::FSUB:
8833 case ISD::FMUL:
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008834 case ISD::ADD:
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008835 case ISD::UMIN:
8836 case ISD::UMAX:
8837 case ISD::SMIN:
8838 case ISD::SMAX:
8839 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008840 case ISD::FMINNUM:
8841 case ISD::FMAXNUM_IEEE:
8842 case ISD::FMINNUM_IEEE: {
Matt Arsenaulta8160732018-08-15 21:34:06 +00008843 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8844 Vec.getOperand(0), Idx);
8845 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8846 Vec.getOperand(1), Idx);
8847
8848 DCI.AddToWorklist(Elt0.getNode());
8849 DCI.AddToWorklist(Elt1.getNode());
8850 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8851 }
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008852 }
8853 }
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008854
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008855 unsigned VecSize = VecVT.getSizeInBits();
8856 unsigned EltSize = EltVT.getSizeInBits();
8857
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008858 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8859 // This eliminates the non-constant index and the subsequent movrel or scratch access.
8860 // Sub-dword vectors of 2 dwords or fewer have a better implementation.
8861 // Vectors of more than 8 dwords would yield too many v_cndmask_b32
8862 // instructions.
8863 if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8864 !isa<ConstantSDNode>(N->getOperand(1))) {
8865 SDLoc SL(N);
8866 SDValue Idx = N->getOperand(1);
8867 EVT IdxVT = Idx.getValueType();
8868 SDValue V;
8869 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8870 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8871 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8872 if (I == 0)
8873 V = Elt;
8874 else
8875 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8876 }
8877 return V;
8878 }
8879
8880 if (!DCI.isBeforeLegalize())
8881 return SDValue();
8882
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008883 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8884 // elements. This exposes more load reduction opportunities by replacing
8885 // multiple small extract_vector_elements with a single 32-bit extract.
8886 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
Matt Arsenaultbf07a502018-08-31 15:39:52 +00008887 if (isa<MemSDNode>(Vec) &&
8888 EltSize <= 16 &&
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008889 EltVT.isByteSized() &&
8890 VecSize > 32 &&
8891 VecSize % 32 == 0 &&
8892 Idx) {
8893 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8894
8895 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8896 unsigned EltIdx = BitIndex / 32;
8897 unsigned LeftoverBitIdx = BitIndex % 32;
8898 SDLoc SL(N);
8899
8900 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8901 DCI.AddToWorklist(Cast.getNode());
8902
8903 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8904 DAG.getConstant(EltIdx, SL, MVT::i32));
8905 DCI.AddToWorklist(Elt.getNode());
8906 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8907 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8908 DCI.AddToWorklist(Srl.getNode());
8909
8910 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8911 DCI.AddToWorklist(Trunc.getNode());
8912 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8913 }
8914
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008915 return SDValue();
8916}
8917
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008918SDValue
8919SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8920 DAGCombinerInfo &DCI) const {
8921 SDValue Vec = N->getOperand(0);
8922 SDValue Idx = N->getOperand(2);
8923 EVT VecVT = Vec.getValueType();
8924 EVT EltVT = VecVT.getVectorElementType();
8925 unsigned VecSize = VecVT.getSizeInBits();
8926 unsigned EltSize = EltVT.getSizeInBits();
8927
8928 // INSERT_VECTOR_ELT (<n x e>, var-idx)
8929 // => BUILD_VECTOR n x select (e, const-idx)
8930 // This eliminates the non-constant index and the subsequent movrel or scratch access.
8931 // Sub-dword vectors of 2 dwords or fewer have a better implementation.
8932 // Vectors of more than 8 dwords would yield too many v_cndmask_b32
8933 // instructions.
8934 if (isa<ConstantSDNode>(Idx) ||
8935 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8936 return SDValue();
8937
8938 SelectionDAG &DAG = DCI.DAG;
8939 SDLoc SL(N);
8940 SDValue Ins = N->getOperand(1);
8941 EVT IdxVT = Idx.getValueType();
8942
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008943 SmallVector<SDValue, 16> Ops;
8944 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8945 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8946 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8947 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8948 Ops.push_back(V);
8949 }
8950
8951 return DAG.getBuildVector(VecVT, SL, Ops);
8952}
8953
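// Choose the fused multiply-add opcode for the fadd/fsub combines below:
// FMAD when denormals do not need to be honored and FMAD is legal, FMA when
// contraction is allowed and FMA is fast, or 0 to form nothing.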
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008954unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8955 const SDNode *N0,
8956 const SDNode *N1) const {
8957 EVT VT = N0->getValueType(0);
8958
Matt Arsenault770ec862016-12-22 03:55:35 +00008959 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8960 // support denormals ever.
Stanislav Mekhanoshin28a19362019-05-04 04:20:37 +00008961 if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8962 (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
8963 getSubtarget()->hasMadF16())) &&
8964 isOperationLegal(ISD::FMAD, VT))
Matt Arsenault770ec862016-12-22 03:55:35 +00008965 return ISD::FMAD;
8966
8967 const TargetOptions &Options = DAG.getTarget().Options;
Amara Emersond28f0cd42017-05-01 15:17:51 +00008968 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
Michael Berg7acc81b2018-05-04 18:48:20 +00008969 (N0->getFlags().hasAllowContract() &&
8970 N1->getFlags().hasAllowContract())) &&
Matt Arsenault770ec862016-12-22 03:55:35 +00008971 isFMAFasterThanFMulAndFAdd(VT)) {
8972 return ISD::FMA;
8973 }
8974
8975 return 0;
8976}
8977
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008978// For a reassociatable opcode perform:
8979// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
8980SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8981 SelectionDAG &DAG) const {
8982 EVT VT = N->getValueType(0);
8983 if (VT != MVT::i32 && VT != MVT::i64)
8984 return SDValue();
8985
8986 unsigned Opc = N->getOpcode();
8987 SDValue Op0 = N->getOperand(0);
8988 SDValue Op1 = N->getOperand(1);
8989
8990 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8991 return SDValue();
8992
8993 if (Op0->isDivergent())
8994 std::swap(Op0, Op1);
8995
8996 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8997 return SDValue();
8998
8999 SDValue Op2 = Op1.getOperand(1);
9000 Op1 = Op1.getOperand(0);
9001 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
9002 return SDValue();
9003
9004 if (Op1->isDivergent())
9005 std::swap(Op1, Op2);
9006
9007 // If either operand is constant this will conflict with
9008 // DAGCombiner::ReassociateOps().
Stanislav Mekhanoshinda1628e2019-02-26 20:56:25 +00009009 if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
9010 DAG.isConstantIntBuildVectorOrConstantInt(Op1))
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00009011 return SDValue();
9012
9013 SDLoc SL(N);
9014 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
9015 return DAG.getNode(Opc, SL, VT, Add1, Op2);
9016}
9017
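// Build a 32-bit x 32-bit -> 64-bit multiply-add using MAD_I64_I32 or
// MAD_U64_U32 and truncate the i64 result back to VT.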
Matt Arsenault4f6318f2017-11-06 17:04:37 +00009018static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
9019 EVT VT,
9020 SDValue N0, SDValue N1, SDValue N2,
9021 bool Signed) {
9022 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
9023 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
9024 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
9025 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
9026}
9027
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009028SDValue SITargetLowering::performAddCombine(SDNode *N,
9029 DAGCombinerInfo &DCI) const {
9030 SelectionDAG &DAG = DCI.DAG;
9031 EVT VT = N->getValueType(0);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009032 SDLoc SL(N);
9033 SDValue LHS = N->getOperand(0);
9034 SDValue RHS = N->getOperand(1);
9035
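// add (mul x, y), z -> mad_i64_i32 / mad_u64_u32 when the target supports
// mad64_32 and both multiply operands are known to fit in 32 bits.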
Matt Arsenault4f6318f2017-11-06 17:04:37 +00009036 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
9037 && Subtarget->hasMad64_32() &&
9038 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
9039 VT.getScalarSizeInBits() <= 64) {
9040 if (LHS.getOpcode() != ISD::MUL)
9041 std::swap(LHS, RHS);
9042
9043 SDValue MulLHS = LHS.getOperand(0);
9044 SDValue MulRHS = LHS.getOperand(1);
9045 SDValue AddRHS = RHS;
9046
9047 // TODO: Maybe restrict if SGPR inputs.
9048 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
9049 numBitsUnsigned(MulRHS, DAG) <= 32) {
9050 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
9051 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
9052 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
9053 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
9054 }
9055
9056 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
9057 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
9058 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
9059 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
9060 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
9061 }
9062
9063 return SDValue();
9064 }
9065
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00009066 if (SDValue V = reassociateScalarOps(N, DAG)) {
9067 return V;
9068 }
9069
Farhana Aleen07e61232018-05-02 18:16:39 +00009070 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
Matt Arsenault4f6318f2017-11-06 17:04:37 +00009071 return SDValue();
9072
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009073 // add x, zext (setcc) => addcarry x, 0, setcc
9074 // add x, sext (setcc) => subcarry x, 0, setcc
9075 unsigned Opc = LHS.getOpcode();
9076 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009077 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009078 std::swap(RHS, LHS);
9079
9080 Opc = RHS.getOpcode();
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009081 switch (Opc) {
9082 default: break;
9083 case ISD::ZERO_EXTEND:
9084 case ISD::SIGN_EXTEND:
9085 case ISD::ANY_EXTEND: {
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009086 auto Cond = RHS.getOperand(0);
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00009087 if (!isBoolSGPR(Cond))
Stanislav Mekhanoshin3ed38c62017-06-21 23:46:22 +00009088 break;
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009089 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
9090 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
9091 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
9092 return DAG.getNode(Opc, SL, VTList, Args);
9093 }
9094 case ISD::ADDCARRY: {
9095 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
9096 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
9097 if (!C || C->getZExtValue() != 0) break;
9098 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
9099 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
9100 }
9101 }
9102 return SDValue();
9103}
9104
9105SDValue SITargetLowering::performSubCombine(SDNode *N,
9106 DAGCombinerInfo &DCI) const {
9107 SelectionDAG &DAG = DCI.DAG;
9108 EVT VT = N->getValueType(0);
9109
9110 if (VT != MVT::i32)
9111 return SDValue();
9112
9113 SDLoc SL(N);
9114 SDValue LHS = N->getOperand(0);
9115 SDValue RHS = N->getOperand(1);
9116
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009117 if (LHS.getOpcode() == ISD::SUBCARRY) {
9118 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
9119 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
Stanislav Mekhanoshin42e229e2019-02-21 02:58:00 +00009120 if (!C || !C->isNullValue())
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009121 return SDValue();
9122 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
9123 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
9124 }
9125 return SDValue();
9126}
9127
9128SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
9129 DAGCombinerInfo &DCI) const {
9130
9131 if (N->getValueType(0) != MVT::i32)
9132 return SDValue();
9133
9134 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9135 if (!C || C->getZExtValue() != 0)
9136 return SDValue();
9137
9138 SelectionDAG &DAG = DCI.DAG;
9139 SDValue LHS = N->getOperand(0);
9140
9141 // addcarry (add x, y), 0, cc => addcarry x, y, cc
9142 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
9143 unsigned LHSOpc = LHS.getOpcode();
9144 unsigned Opc = N->getOpcode();
9145 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
9146 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
9147 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
9148 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009149 }
9150 return SDValue();
9151}
9152
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009153SDValue SITargetLowering::performFAddCombine(SDNode *N,
9154 DAGCombinerInfo &DCI) const {
9155 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9156 return SDValue();
9157
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009158 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault770ec862016-12-22 03:55:35 +00009159 EVT VT = N->getValueType(0);
Matt Arsenault770ec862016-12-22 03:55:35 +00009160
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009161 SDLoc SL(N);
9162 SDValue LHS = N->getOperand(0);
9163 SDValue RHS = N->getOperand(1);
9164
9165 // These should really be instruction patterns, but writing patterns with
9166 // source modifiers is a pain.
9167
9168 // fadd (fadd (a, a), b) -> mad 2.0, a, b
9169 if (LHS.getOpcode() == ISD::FADD) {
9170 SDValue A = LHS.getOperand(0);
9171 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009172 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009173 if (FusedOp != 0) {
9174 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009175 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00009176 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009177 }
9178 }
9179
9180 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
9181 if (RHS.getOpcode() == ISD::FADD) {
9182 SDValue A = RHS.getOperand(0);
9183 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009184 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009185 if (FusedOp != 0) {
9186 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009187 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00009188 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009189 }
9190 }
9191
9192 return SDValue();
9193}
9194
9195SDValue SITargetLowering::performFSubCombine(SDNode *N,
9196 DAGCombinerInfo &DCI) const {
9197 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9198 return SDValue();
9199
9200 SelectionDAG &DAG = DCI.DAG;
9201 SDLoc SL(N);
9202 EVT VT = N->getValueType(0);
9203 assert(!VT.isVector());
9204
9205 // Try to get the fneg to fold into the source modifier. This undoes generic
9206 // DAG combines and folds them into the mad.
9207 //
9208 // Only do this if we are not trying to support denormals. v_mad_f32 does
9209 // not support denormals ever.
Matt Arsenault770ec862016-12-22 03:55:35 +00009210 SDValue LHS = N->getOperand(0);
9211 SDValue RHS = N->getOperand(1);
9212 if (LHS.getOpcode() == ISD::FADD) {
9213 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
9214 SDValue A = LHS.getOperand(0);
9215 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009216 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009217 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009218 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
9219 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
9220
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009221 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009222 }
9223 }
Matt Arsenault770ec862016-12-22 03:55:35 +00009224 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009225
Matt Arsenault770ec862016-12-22 03:55:35 +00009226 if (RHS.getOpcode() == ISD::FADD) {
9227 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009228
Matt Arsenault770ec862016-12-22 03:55:35 +00009229 SDValue A = RHS.getOperand(0);
9230 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009231 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009232 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009233 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009234 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009235 }
9236 }
9237 }
9238
9239 return SDValue();
9240}
9241
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009242SDValue SITargetLowering::performFMACombine(SDNode *N,
9243 DAGCombinerInfo &DCI) const {
9244 SelectionDAG &DAG = DCI.DAG;
9245 EVT VT = N->getValueType(0);
9246 SDLoc SL(N);
9247
Stanislav Mekhanoshin0e858b02019-02-09 00:34:21 +00009248 if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009249 return SDValue();
9250
9251 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
9252 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
9253 SDValue Op1 = N->getOperand(0);
9254 SDValue Op2 = N->getOperand(1);
9255 SDValue FMA = N->getOperand(2);
9256
9257 if (FMA.getOpcode() != ISD::FMA ||
9258 Op1.getOpcode() != ISD::FP_EXTEND ||
9259 Op2.getOpcode() != ISD::FP_EXTEND)
9260 return SDValue();
9261
9262 // fdot2_f32_f16 always flushes fp32 denormal operands and outputs to zero,
9263 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
9264 // is sufficient to allow generating fdot2.
9265 const TargetOptions &Options = DAG.getTarget().Options;
9266 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
9267 (N->getFlags().hasAllowContract() &&
9268 FMA->getFlags().hasAllowContract())) {
9269 Op1 = Op1.getOperand(0);
9270 Op2 = Op2.getOperand(0);
9271 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9272 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9273 return SDValue();
9274
9275 SDValue Vec1 = Op1.getOperand(0);
9276 SDValue Idx1 = Op1.getOperand(1);
9277 SDValue Vec2 = Op2.getOperand(0);
9278
9279 SDValue FMAOp1 = FMA.getOperand(0);
9280 SDValue FMAOp2 = FMA.getOperand(1);
9281 SDValue FMAAcc = FMA.getOperand(2);
9282
9283 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
9284 FMAOp2.getOpcode() != ISD::FP_EXTEND)
9285 return SDValue();
9286
9287 FMAOp1 = FMAOp1.getOperand(0);
9288 FMAOp2 = FMAOp2.getOperand(0);
9289 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9290 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9291 return SDValue();
9292
9293 SDValue Vec3 = FMAOp1.getOperand(0);
9294 SDValue Vec4 = FMAOp2.getOperand(0);
9295 SDValue Idx2 = FMAOp1.getOperand(1);
9296
9297 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
9298 // Idx1 and Idx2 cannot be the same.
9299 Idx1 == Idx2)
9300 return SDValue();
9301
9302 if (Vec1 == Vec2 || Vec3 == Vec4)
9303 return SDValue();
9304
9305 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
9306 return SDValue();
9307
9308 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00009309 (Vec1 == Vec4 && Vec2 == Vec3)) {
9310 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
9311 DAG.getTargetConstant(0, SL, MVT::i1));
9312 }
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009313 }
9314 return SDValue();
9315}
9316
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009317SDValue SITargetLowering::performSetCCCombine(SDNode *N,
9318 DAGCombinerInfo &DCI) const {
9319 SelectionDAG &DAG = DCI.DAG;
9320 SDLoc SL(N);
9321
9322 SDValue LHS = N->getOperand(0);
9323 SDValue RHS = N->getOperand(1);
9324 EVT VT = LHS.getValueType();
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009325 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
9326
9327 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
9328 if (!CRHS) {
9329 CRHS = dyn_cast<ConstantSDNode>(LHS);
9330 if (CRHS) {
9331 std::swap(LHS, RHS);
9332 CC = getSetCCSwappedOperands(CC);
9333 }
9334 }
9335
Stanislav Mekhanoshin3b117942018-06-16 03:46:59 +00009336 if (CRHS) {
9337 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
9338 isBoolSGPR(LHS.getOperand(0))) {
9339 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
9340 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
9341 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
9342 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
9343 if ((CRHS->isAllOnesValue() &&
9344 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
9345 (CRHS->isNullValue() &&
9346 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
9347 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9348 DAG.getConstant(-1, SL, MVT::i1));
9349 if ((CRHS->isAllOnesValue() &&
9350 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
9351 (CRHS->isNullValue() &&
9352 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
9353 return LHS.getOperand(0);
9354 }
9355
9356 uint64_t CRHSVal = CRHS->getZExtValue();
9357 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
9358 LHS.getOpcode() == ISD::SELECT &&
9359 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9360 isa<ConstantSDNode>(LHS.getOperand(2)) &&
9361 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
9362 isBoolSGPR(LHS.getOperand(0))) {
9363 // Given CT != FT:
9364 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
9365 // setcc (select cc, CT, CF), CF, ne => cc
9366 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
9367 // setcc (select cc, CT, CF), CT, eq => cc
9368 uint64_t CT = LHS.getConstantOperandVal(1);
9369 uint64_t CF = LHS.getConstantOperandVal(2);
9370
9371 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
9372 (CT == CRHSVal && CC == ISD::SETNE))
9373 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9374 DAG.getConstant(-1, SL, MVT::i1));
9375 if ((CF == CRHSVal && CC == ISD::SETNE) ||
9376 (CT == CRHSVal && CC == ISD::SETEQ))
9377 return LHS.getOperand(0);
9378 }
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009379 }
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009380
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00009381 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
9382 VT != MVT::f16))
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009383 return SDValue();
9384
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009385 // Match isinf/isfinite pattern
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009386 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009387 // (fcmp one (fabs x), inf) -> (fp_class x,
9388 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
9389 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009390 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
9391 if (!CRHS)
9392 return SDValue();
9393
9394 const APFloat &APF = CRHS->getValueAPF();
9395 if (APF.isInfinity() && !APF.isNegative()) {
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009396 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
9397 SIInstrFlags::N_INFINITY;
9398 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
9399 SIInstrFlags::P_ZERO |
9400 SIInstrFlags::N_NORMAL |
9401 SIInstrFlags::P_NORMAL |
9402 SIInstrFlags::N_SUBNORMAL |
9403 SIInstrFlags::P_SUBNORMAL;
9404 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009405 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
9406 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009407 }
9408 }
9409
9410 return SDValue();
9411}
9412
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009413SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
9414 DAGCombinerInfo &DCI) const {
9415 SelectionDAG &DAG = DCI.DAG;
9416 SDLoc SL(N);
9417 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
9418
9419 SDValue Src = N->getOperand(0);
9420 SDValue Srl = N->getOperand(0);
9421 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
9422 Srl = Srl.getOperand(0);
9423
9424 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
9425 if (Srl.getOpcode() == ISD::SRL) {
9426 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
9427 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
9428 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
9429
9430 if (const ConstantSDNode *C =
9431 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
9432 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
9433 EVT(MVT::i32));
9434
9435 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
9436 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
9437 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
9438 MVT::f32, Srl);
9439 }
9440 }
9441 }
9442
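// Only one byte of the source is demanded by cvt_f32_ubyteN; try to simplify
// the source using just those demanded bits.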
9443 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
9444
Craig Topperd0af7e82017-04-28 05:31:46 +00009445 KnownBits Known;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009446 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9447 !DCI.isBeforeLegalizeOps());
9448 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Stanislav Mekhanoshined0d6c62019-01-09 02:24:22 +00009449 if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009450 DCI.CommitTargetLoweringOpt(TLO);
9451 }
9452
9453 return SDValue();
9454}
9455
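// Constant fold clamp: a source below 0.0 (or NaN when DX10 clamp is enabled)
// folds to 0.0, a source above 1.0 folds to 1.0, and anything else is
// returned unchanged.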
Tom Stellard1b95fed2018-05-24 05:28:34 +00009456SDValue SITargetLowering::performClampCombine(SDNode *N,
9457 DAGCombinerInfo &DCI) const {
9458 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
9459 if (!CSrc)
9460 return SDValue();
9461
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009462 const MachineFunction &MF = DCI.DAG.getMachineFunction();
Tom Stellard1b95fed2018-05-24 05:28:34 +00009463 const APFloat &F = CSrc->getValueAPF();
9464 APFloat Zero = APFloat::getZero(F.getSemantics());
9465 APFloat::cmpResult Cmp0 = F.compare(Zero);
9466 if (Cmp0 == APFloat::cmpLessThan ||
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009467 (Cmp0 == APFloat::cmpUnordered &&
9468 MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
Tom Stellard1b95fed2018-05-24 05:28:34 +00009469 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
9470 }
9471
9472 APFloat One(F.getSemantics(), "1.0");
9473 APFloat::cmpResult Cmp1 = F.compare(One);
9474 if (Cmp1 == APFloat::cmpGreaterThan)
9475 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
9476
9477 return SDValue(CSrc, 0);
9478}
9479
9480
Tom Stellard75aadc22012-12-11 21:25:42 +00009481SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
9482 DAGCombinerInfo &DCI) const {
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009483 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
9484 return SDValue();
Tom Stellard75aadc22012-12-11 21:25:42 +00009485 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00009486 default:
9487 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009488 case ISD::ADD:
9489 return performAddCombine(N, DCI);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009490 case ISD::SUB:
9491 return performSubCombine(N, DCI);
9492 case ISD::ADDCARRY:
9493 case ISD::SUBCARRY:
9494 return performAddCarrySubCarryCombine(N, DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009495 case ISD::FADD:
9496 return performFAddCombine(N, DCI);
9497 case ISD::FSUB:
9498 return performFSubCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009499 case ISD::SETCC:
9500 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00009501 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00009502 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00009503 case ISD::FMAXNUM_IEEE:
9504 case ISD::FMINNUM_IEEE:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00009505 case ISD::SMAX:
9506 case ISD::SMIN:
9507 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00009508 case ISD::UMIN:
9509 case AMDGPUISD::FMIN_LEGACY:
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009510 case AMDGPUISD::FMAX_LEGACY:
9511 return performMinMaxCombine(N, DCI);
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009512 case ISD::FMA:
9513 return performFMACombine(N, DCI);
Matt Arsenault90083d32018-06-07 09:54:49 +00009514 case ISD::LOAD: {
9515    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
9516      return Widened;

9517 LLVM_FALLTHROUGH;
9518 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009519 case ISD::STORE:
9520 case ISD::ATOMIC_LOAD:
9521 case ISD::ATOMIC_STORE:
9522 case ISD::ATOMIC_CMP_SWAP:
9523 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
9524 case ISD::ATOMIC_SWAP:
9525 case ISD::ATOMIC_LOAD_ADD:
9526 case ISD::ATOMIC_LOAD_SUB:
9527 case ISD::ATOMIC_LOAD_AND:
9528 case ISD::ATOMIC_LOAD_OR:
9529 case ISD::ATOMIC_LOAD_XOR:
9530 case ISD::ATOMIC_LOAD_NAND:
9531 case ISD::ATOMIC_LOAD_MIN:
9532 case ISD::ATOMIC_LOAD_MAX:
9533 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009534 case ISD::ATOMIC_LOAD_UMAX:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009535 case ISD::ATOMIC_LOAD_FADD:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009536 case AMDGPUISD::ATOMIC_INC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009537 case AMDGPUISD::ATOMIC_DEC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009538 case AMDGPUISD::ATOMIC_LOAD_FMIN:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009539 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009540 if (DCI.isBeforeLegalize())
9541 break;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009542 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00009543 case ISD::AND:
9544 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009545 case ISD::OR:
9546 return performOrCombine(N, DCI);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00009547 case ISD::XOR:
9548 return performXorCombine(N, DCI);
Matt Arsenault8edfaee2017-03-31 19:53:03 +00009549 case ISD::ZERO_EXTEND:
9550 return performZeroExtendCombine(N, DCI);
Ryan Taylor00e063a2019-03-19 16:07:00 +00009551 case ISD::SIGN_EXTEND_INREG:
9552    return performSignExtendInRegCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009553 case AMDGPUISD::FP_CLASS:
9554 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00009555 case ISD::FCANONICALIZE:
9556 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009557 case AMDGPUISD::RCP:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009558 return performRcpCombine(N, DCI);
9559 case AMDGPUISD::FRACT:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009560 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00009561 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009562 case AMDGPUISD::RSQ_LEGACY:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009563 case AMDGPUISD::RCP_IFLAG:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009564 case AMDGPUISD::RSQ_CLAMP:
9565 case AMDGPUISD::LDEXP: {
9566 SDValue Src = N->getOperand(0);
9567 if (Src.isUndef())
9568 return Src;
9569 break;
9570 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009571 case ISD::SINT_TO_FP:
9572 case ISD::UINT_TO_FP:
9573 return performUCharToFloatCombine(N, DCI);
9574 case AMDGPUISD::CVT_F32_UBYTE0:
9575 case AMDGPUISD::CVT_F32_UBYTE1:
9576 case AMDGPUISD::CVT_F32_UBYTE2:
9577 case AMDGPUISD::CVT_F32_UBYTE3:
9578 return performCvtF32UByteNCombine(N, DCI);
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00009579 case AMDGPUISD::FMED3:
9580 return performFMed3Combine(N, DCI);
Matt Arsenault1f17c662017-02-22 00:27:34 +00009581 case AMDGPUISD::CVT_PKRTZ_F16_F32:
9582 return performCvtPkRTZCombine(N, DCI);
Tom Stellard1b95fed2018-05-24 05:28:34 +00009583 case AMDGPUISD::CLAMP:
9584 return performClampCombine(N, DCI);
Matt Arsenaulteb522e62017-02-27 22:15:25 +00009585 case ISD::SCALAR_TO_VECTOR: {
9586 SelectionDAG &DAG = DCI.DAG;
9587 EVT VT = N->getValueType(0);
9588
9589 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9590 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9591 SDLoc SL(N);
9592 SDValue Src = N->getOperand(0);
9593 EVT EltVT = Src.getValueType();
9594 if (EltVT == MVT::f16)
9595 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9596
9597 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9598 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9599 }
9600
9601 break;
9602 }
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00009603 case ISD::EXTRACT_VECTOR_ELT:
9604 return performExtractVectorEltCombine(N, DCI);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00009605 case ISD::INSERT_VECTOR_ELT:
9606 return performInsertVectorEltCombine(N, DCI);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009607 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00009608 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00009609}
Christian Konigd910b7d2013-02-26 17:52:16 +00009610
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009611/// Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00009612static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009613 switch (Idx) {
9614 default: return 0;
9615 case AMDGPU::sub0: return 0;
9616 case AMDGPU::sub1: return 1;
9617 case AMDGPU::sub2: return 2;
9618 case AMDGPU::sub3: return 3;
David Stuttardf77079f2019-01-14 11:55:24 +00009619 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
Christian Konig8e06e2a2013-04-10 08:39:08 +00009620 }
9621}
9622
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009623/// Adjust the writemask of MIMG instructions
Matt Arsenault68f05052017-12-04 22:18:27 +00009624SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9625 SelectionDAG &DAG) const {
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009626 unsigned Opcode = Node->getMachineOpcode();
9627
9628 // Subtract 1 because the vdata output is not a MachineSDNode operand.
9629 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9630 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9631 return Node; // not implemented for D16
9632
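  // Example of the rewrite performed below: an image sample with dmask = 0xf
  // whose users only extract sub0 and sub2 is rewritten to dmask = 0x5 with a
  // two-channel result type.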
David Stuttardf77079f2019-01-14 11:55:24 +00009633 SDNode *Users[5] = { nullptr };
Tom Stellard54774e52013-10-23 02:53:47 +00009634 unsigned Lane = 0;
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009635 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009636 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00009637 unsigned NewDmask = 0;
David Stuttardf77079f2019-01-14 11:55:24 +00009638 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9639 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9640  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9641                  Node->getConstantOperandVal(LWEIdx));
9642 unsigned TFCLane = 0;
Matt Arsenault856777d2017-12-08 20:00:57 +00009643 bool HasChain = Node->getNumValues() > 1;
9644
9645 if (OldDmask == 0) {
9646 // These are folded out, but on the chance it happens don't assert.
9647 return Node;
9648 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009649
David Stuttardf77079f2019-01-14 11:55:24 +00009650 unsigned OldBitsSet = countPopulation(OldDmask);
9651 // Work out which is the TFE/LWE lane if that is enabled.
9652 if (UsesTFC) {
9653 TFCLane = OldBitsSet;
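    // The TFE/LWE status result occupies the lane immediately after the last
    // enabled dmask channel, e.g. lane 4 (sub4) when the dmask is 0xf.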
9654 }
9655
Christian Konig8e06e2a2013-04-10 08:39:08 +00009656 // Try to figure out the used register components
9657 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9658 I != E; ++I) {
9659
Matt Arsenault93e65ea2017-02-22 21:16:41 +00009660 // Don't look at users of the chain.
9661 if (I.getUse().getResNo() != 0)
9662 continue;
9663
Christian Konig8e06e2a2013-04-10 08:39:08 +00009664 // Abort if we can't understand the usage
9665 if (!I->isMachineOpcode() ||
9666 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
Matt Arsenault68f05052017-12-04 22:18:27 +00009667 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009668
Francis Visoiu Mistrih9d7bb0c2017-11-28 17:15:09 +00009669 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
Tom Stellard54774e52013-10-23 02:53:47 +00009670 // Note that subregs are packed, i.e. Lane==0 is the first bit set
9671 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9672 // set, etc.
Christian Konig8b1ed282013-04-10 08:39:16 +00009673 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00009674
David Stuttardf77079f2019-01-14 11:55:24 +00009675 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9676 if (UsesTFC && Lane == TFCLane) {
9677 Users[Lane] = *I;
9678 } else {
9679 // Set which texture component corresponds to the lane.
9680 unsigned Comp;
9681 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9682 Comp = countTrailingZeros(Dmask);
9683 Dmask &= ~(1 << Comp);
9684 }
9685
9686 // Abort if we have more than one user per component.
9687 if (Users[Lane])
9688 return Node;
9689
9690 Users[Lane] = *I;
9691 NewDmask |= 1 << Comp;
Tom Stellard54774e52013-10-23 02:53:47 +00009692 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009693 }
9694
David Stuttardf77079f2019-01-14 11:55:24 +00009695 // Don't allow 0 dmask, as hardware assumes one channel enabled.
9696 bool NoChannels = !NewDmask;
9697 if (NoChannels) {
David Stuttardfc2a7472019-03-20 09:29:55 +00009698 if (!UsesTFC) {
9699      // No uses of the result and not using TFC, so there is nothing to do.
9700 return Node;
9701 }
David Stuttardf77079f2019-01-14 11:55:24 +00009702    // If the original dmask has only one channel, there is nothing to do.
9703 if (OldBitsSet == 1)
9704 return Node;
9705 // Use an arbitrary dmask - required for the instruction to work
9706 NewDmask = 1;
9707 }
Tom Stellard54774e52013-10-23 02:53:47 +00009708 // Abort if there's no change
9709 if (NewDmask == OldDmask)
Matt Arsenault68f05052017-12-04 22:18:27 +00009710 return Node;
9711
9712 unsigned BitsSet = countPopulation(NewDmask);
9713
David Stuttardf77079f2019-01-14 11:55:24 +00009714 // Check for TFE or LWE - increase the number of channels by one to account
9715 // for the extra return value
9716  // This will need adjustment for D16 if it is also handled in
9717  // adjustWritemask (this function), but at present D16 is excluded.
9718 unsigned NewChannels = BitsSet + UsesTFC;
9719
9720 int NewOpcode =
9721 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
Matt Arsenault68f05052017-12-04 22:18:27 +00009722 assert(NewOpcode != -1 &&
9723 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9724 "failed to find equivalent MIMG op");
Christian Konig8e06e2a2013-04-10 08:39:08 +00009725
9726 // Adjust the writemask in the node
Matt Arsenault68f05052017-12-04 22:18:27 +00009727 SmallVector<SDValue, 12> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009728 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009729 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009730 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Christian Konig8e06e2a2013-04-10 08:39:08 +00009731
Matt Arsenault68f05052017-12-04 22:18:27 +00009732 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9733
David Stuttardf77079f2019-01-14 11:55:24 +00009734 MVT ResultVT = NewChannels == 1 ?
9735 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9736 NewChannels == 5 ? 8 : NewChannels);
Matt Arsenault856777d2017-12-08 20:00:57 +00009737 SDVTList NewVTList = HasChain ?
9738 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9739
Matt Arsenault68f05052017-12-04 22:18:27 +00009740
9741 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9742 NewVTList, Ops);
Matt Arsenaultecad0d532017-12-08 20:00:45 +00009743
Matt Arsenault856777d2017-12-08 20:00:57 +00009744 if (HasChain) {
9745 // Update chain.
Chandler Carruth66654b72018-08-14 23:30:32 +00009746 DAG.setNodeMemRefs(NewNode, Node->memoperands());
Matt Arsenault856777d2017-12-08 20:00:57 +00009747 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9748 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009749
David Stuttardf77079f2019-01-14 11:55:24 +00009750 if (NewChannels == 1) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009751 assert(Node->hasNUsesOfValue(1, 0));
9752 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9753 SDLoc(Node), Users[Lane]->getValueType(0),
9754 SDValue(NewNode, 0));
Christian Konig8b1ed282013-04-10 08:39:16 +00009755 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
Matt Arsenault68f05052017-12-04 22:18:27 +00009756 return nullptr;
Christian Konig8b1ed282013-04-10 08:39:16 +00009757 }
9758
Christian Konig8e06e2a2013-04-10 08:39:08 +00009759 // Update the users of the node with the new indices
David Stuttardf77079f2019-01-14 11:55:24 +00009760 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009761 SDNode *User = Users[i];
David Stuttardf77079f2019-01-14 11:55:24 +00009762 if (!User) {
9763 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9764 // Users[0] is still nullptr because channel 0 doesn't really have a use.
9765 if (i || !NoChannels)
9766 continue;
9767 } else {
9768 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9769 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9770 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009771
9772 switch (Idx) {
9773 default: break;
9774 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9775 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9776 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
David Stuttardf77079f2019-01-14 11:55:24 +00009777 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009778 }
9779 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009780
9781 DAG.RemoveDeadNode(Node);
9782 return nullptr;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009783}
9784
Tom Stellardc98ee202015-07-16 19:40:07 +00009785static bool isFrameIndexOp(SDValue Op) {
9786 if (Op.getOpcode() == ISD::AssertZext)
9787 Op = Op.getOperand(0);
9788
9789 return isa<FrameIndexSDNode>(Op);
9790}
9791
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009792/// Legalize target independent instructions (e.g. INSERT_SUBREG)
Tom Stellard3457a842014-10-09 19:06:00 +00009793/// with frame index operands.
9794/// LLVM assumes that the inputs to these instructions are registers.
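/// For example, a frame index operand of a REG_SEQUENCE is first materialized
/// with an S_MOV_B32 so that the REG_SEQUENCE only sees register operands.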
Matt Arsenault0d0d6c22017-04-12 21:58:23 +00009795SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9796 SelectionDAG &DAG) const {
9797 if (Node->getOpcode() == ISD::CopyToReg) {
9798 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9799 SDValue SrcVal = Node->getOperand(2);
9800
9801 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9802 // to try understanding copies to physical registers.
9803 if (SrcVal.getValueType() == MVT::i1 &&
9804 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9805 SDLoc SL(Node);
9806 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9807 SDValue VReg = DAG.getRegister(
9808 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9809
9810 SDNode *Glued = Node->getGluedNode();
9811 SDValue ToVReg
9812 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9813 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9814 SDValue ToResultReg
9815 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9816 VReg, ToVReg.getValue(1));
9817 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9818 DAG.RemoveDeadNode(Node);
9819 return ToResultReg.getNode();
9820 }
9821 }
Tom Stellard8dd392e2014-10-09 18:09:15 +00009822
9823 SmallVector<SDValue, 8> Ops;
Tom Stellard3457a842014-10-09 19:06:00 +00009824 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00009825 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00009826 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009827 continue;
9828 }
9829
Tom Stellard3457a842014-10-09 19:06:00 +00009830 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009831 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00009832 Node->getOperand(i).getValueType(),
9833 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009834 }
9835
Mark Searles4e3d6162017-10-16 23:38:53 +00009836 return DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009837}
9838
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009839/// Fold the instructions after selecting them.
Matt Arsenault68f05052017-12-04 22:18:27 +00009840/// Returns null if users were already updated.
Christian Konig8e06e2a2013-04-10 08:39:08 +00009841SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9842 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009843 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009844 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00009845
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00009846 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009847 !TII->isGather4(Opcode)) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009848 return adjustWritemask(Node, DAG);
9849 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009850
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009851 if (Opcode == AMDGPU::INSERT_SUBREG ||
9852 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00009853 legalizeTargetIndependentNode(Node, DAG);
9854 return Node;
9855 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009856
9857 switch (Opcode) {
9858 case AMDGPU::V_DIV_SCALE_F32:
9859 case AMDGPU::V_DIV_SCALE_F64: {
9860 // Satisfy the operand register constraint when one of the inputs is
9861 // undefined. Ordinarily each undef value will have its own implicit_def of
9862 // a vreg, so force these to use a single register.
9863 SDValue Src0 = Node->getOperand(0);
9864 SDValue Src1 = Node->getOperand(1);
9865 SDValue Src2 = Node->getOperand(2);
9866
9867 if ((Src0.isMachineOpcode() &&
9868 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9869 (Src0 == Src1 || Src0 == Src2))
9870 break;
9871
9872 MVT VT = Src0.getValueType().getSimpleVT();
Alexander Timofeevba447ba2019-05-26 20:33:26 +00009873 const TargetRegisterClass *RC =
9874 getRegClassFor(VT, Src0.getNode()->isDivergent());
Matt Arsenault206f8262017-08-01 20:49:41 +00009875
9876 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9877 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9878
9879 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9880 UndefReg, Src0, SDValue());
9881
9882 // src0 must be the same register as src1 or src2, even if the value is
9883 // undefined, so make sure we don't violate this constraint.
9884 if (Src0.isMachineOpcode() &&
9885 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9886 if (Src1.isMachineOpcode() &&
9887 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9888 Src0 = Src1;
9889 else if (Src2.isMachineOpcode() &&
9890 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9891 Src0 = Src2;
9892 else {
9893 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9894 Src0 = UndefReg;
9895 Src1 = UndefReg;
9896 }
9897 } else
9898 break;
9899
9900 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9901 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9902 Ops.push_back(Node->getOperand(I));
9903
9904 Ops.push_back(ImpDef.getValue(1));
9905 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9906 }
Stanislav Mekhanoshin5f581c92019-06-12 17:52:51 +00009907 case AMDGPU::V_PERMLANE16_B32:
9908 case AMDGPU::V_PERMLANEX16_B32: {
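    // When either the fi or bound_ctrl immediate is nonzero, the tied vdst_in
    // value is not needed; the code below replaces it with a fresh IMPLICIT_DEF
    // so no live register stays tied to an unread input.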
9909 ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
9910 ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
9911 if (!FI->getZExtValue() && !BC->getZExtValue())
9912 break;
9913 SDValue VDstIn = Node->getOperand(6);
9914 if (VDstIn.isMachineOpcode()
9915 && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
9916 break;
9917 MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
9918 SDLoc(Node), MVT::i32);
9919 SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
9920 SDValue(BC, 0), Node->getOperand(3),
9921 Node->getOperand(4), Node->getOperand(5),
9922 SDValue(ImpDef, 0), Node->getOperand(7) };
9923 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9924 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009925 default:
9926 break;
9927 }
9928
Tom Stellard654d6692015-01-08 15:08:17 +00009929 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009930}
Christian Konig8b1ed282013-04-10 08:39:16 +00009931
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009932/// Assign the register class depending on the number of
Christian Konig8b1ed282013-04-10 08:39:16 +00009933/// bits set in the writemask
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009934void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00009935 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009936 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009937
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009938 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009939
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009940 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009941 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009942 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009943 return;
9944 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00009945
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009946 // Replace unused atomics with the no return version.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009947 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009948 if (NoRetAtomicOp != -1) {
9949 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009950 MI.setDesc(TII->get(NoRetAtomicOp));
9951 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009952 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009953 }
9954
Tom Stellard354a43c2016-04-01 18:27:37 +00009955 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9956 // instruction, because the return type of these instructions is a vec2 of
9957 // the memory type, so it can be tied to the input operand.
9958 // This means these instructions always have a use, so we need to add a
9959 // special case to check if the atomic has only one extract_subreg use,
9960 // which itself has no uses.
9961 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00009962 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00009963 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9964 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009965 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00009966
9967 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009968 MI.setDesc(TII->get(NoRetAtomicOp));
9969 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009970
9971 // If we only remove the def operand from the atomic instruction, the
9972 // extract_subreg will be left with a use of a vreg without a def.
9973 // So we need to insert an implicit_def to avoid machine verifier
9974 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009975 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00009976 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9977 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009978 return;
9979 }
Christian Konig8b1ed282013-04-10 08:39:16 +00009980}
Tom Stellard0518ff82013-06-03 17:39:58 +00009981
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009982static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9983 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009984 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00009985 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9986}
9987
9988MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009989 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00009990 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009991 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00009992
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009993 // Build the half of the subregister with the constants before building the
9994 // full 128-bit register. If we are building multiple resource descriptors,
9995 // this will allow CSEing of the 2-component register.
9996 const SDValue Ops0[] = {
9997 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9998 buildSMovImm32(DAG, DL, 0),
9999 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
10000 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
10001 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
10002 };
Matt Arsenault485defe2014-11-05 19:01:17 +000010003
Matt Arsenault2d6fdb82015-09-25 17:08:42 +000010004 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
10005 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +000010006
Matt Arsenault2d6fdb82015-09-25 17:08:42 +000010007 // Combine the constants and the pointer.
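  // The resulting v4i32 descriptor is { Ptr, 0, high half of
  // TII->getDefaultRsrcDataFormat() }: the 64-bit pointer in dwords 0-1, zero
  // in dword 2, and the default resource format bits in dword 3.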
10008 const SDValue Ops1[] = {
10009 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
10010 Ptr,
10011 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
10012 SubRegHi,
10013 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
10014 };
Matt Arsenault485defe2014-11-05 19:01:17 +000010015
Matt Arsenault2d6fdb82015-09-25 17:08:42 +000010016 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +000010017}
10018
Adrian Prantl5f8f34e42018-05-01 15:54:18 +000010019/// Return a resource descriptor with the 'Add TID' bit enabled
Benjamin Kramerdf005cb2015-08-08 18:27:36 +000010020/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
10021/// of the resource descriptor) to create an offset, which is added to
10022/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +000010023MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
10024 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010025 uint64_t RsrcDword2And3) const {
10026 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
10027 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
10028 if (RsrcDword1) {
10029 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010030 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
10031 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010032 }
10033
10034 SDValue DataLo = buildSMovImm32(DAG, DL,
10035 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
10036 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
10037
10038 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010039 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010040 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010041 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010042 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010043 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010044 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010045 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010046 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +000010047 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +000010048 };
10049
10050 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
10051}
10052
Tom Stellardd7e6f132015-04-08 01:09:26 +000010053//===----------------------------------------------------------------------===//
10054// SI Inline Assembly Support
10055//===----------------------------------------------------------------------===//
10056
10057std::pair<unsigned, const TargetRegisterClass *>
10058SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Benjamin Kramer9bfb6272015-07-05 19:29:18 +000010059 StringRef Constraint,
Tom Stellardd7e6f132015-04-08 01:09:26 +000010060 MVT VT) const {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010061 const TargetRegisterClass *RC = nullptr;
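  // For example, a single-character "s" constraint on a 64-bit value selects
  // SGPR_64, while "v" on a 128-bit value selects VReg_128.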
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010062 if (Constraint.size() == 1) {
10063 switch (Constraint[0]) {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010064 default:
10065 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010066 case 's':
10067 case 'r':
10068 switch (VT.getSizeInBits()) {
10069 default:
10070 return std::make_pair(0U, nullptr);
10071 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +000010072 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010073 RC = &AMDGPU::SReg_32_XM0RegClass;
10074 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010075 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010076 RC = &AMDGPU::SGPR_64RegClass;
10077 break;
Tim Renouf361b5b22019-03-21 12:01:21 +000010078 case 96:
10079 RC = &AMDGPU::SReg_96RegClass;
10080 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010081 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010082 RC = &AMDGPU::SReg_128RegClass;
10083 break;
Tim Renouf033f99a2019-03-22 10:11:21 +000010084 case 160:
10085 RC = &AMDGPU::SReg_160RegClass;
10086 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010087 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010088 RC = &AMDGPU::SReg_256RegClass;
10089 break;
Matt Arsenaulte0bf7d02017-02-21 19:12:08 +000010090 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010091 RC = &AMDGPU::SReg_512RegClass;
10092 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010093 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010094 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010095 case 'v':
10096 switch (VT.getSizeInBits()) {
10097 default:
10098 return std::make_pair(0U, nullptr);
10099 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +000010100 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010101 RC = &AMDGPU::VGPR_32RegClass;
10102 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010103 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010104 RC = &AMDGPU::VReg_64RegClass;
10105 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010106 case 96:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010107 RC = &AMDGPU::VReg_96RegClass;
10108 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010109 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010110 RC = &AMDGPU::VReg_128RegClass;
10111 break;
Tim Renouf033f99a2019-03-22 10:11:21 +000010112 case 160:
10113 RC = &AMDGPU::VReg_160RegClass;
10114 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010115 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010116 RC = &AMDGPU::VReg_256RegClass;
10117 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010118 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010119 RC = &AMDGPU::VReg_512RegClass;
10120 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010121 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010122 break;
Tom Stellardd7e6f132015-04-08 01:09:26 +000010123 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +000010124 // We actually support i128, i16 and f16 as inline parameters
10125 // even if they are not reported as legal
10126 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
10127 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
10128 return std::make_pair(0U, RC);
Tom Stellardd7e6f132015-04-08 01:09:26 +000010129 }
10130
10131 if (Constraint.size() > 1) {
Tom Stellardd7e6f132015-04-08 01:09:26 +000010132 if (Constraint[1] == 'v') {
10133 RC = &AMDGPU::VGPR_32RegClass;
10134 } else if (Constraint[1] == 's') {
10135 RC = &AMDGPU::SGPR_32RegClass;
10136 }
10137
10138 if (RC) {
Matt Arsenault0b554ed2015-06-23 02:05:55 +000010139 uint32_t Idx;
10140 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
10141 if (!Failed && Idx < RC->getNumRegs())
Tom Stellardd7e6f132015-04-08 01:09:26 +000010142 return std::make_pair(RC->getRegister(Idx), RC);
10143 }
10144 }
10145 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
10146}
Tom Stellardb3c3bda2015-12-10 02:12:53 +000010147
10148SITargetLowering::ConstraintType
10149SITargetLowering::getConstraintType(StringRef Constraint) const {
10150 if (Constraint.size() == 1) {
10151 switch (Constraint[0]) {
10152 default: break;
10153 case 's':
10154 case 'v':
10155 return C_RegisterClass;
10156 }
10157 }
10158 return TargetLowering::getConstraintType(Constraint);
10159}
Matt Arsenault1cc47f82017-07-18 16:44:56 +000010160
10161// Figure out which registers should be reserved for stack access. Only after
10162// the function is legalized do we know all of the non-spill stack objects or if
10163// calls are present.
10164void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
10165 MachineRegisterInfo &MRI = MF.getRegInfo();
10166 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +000010167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Tom Stellardc5a154d2018-06-28 23:47:12 +000010168 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault1cc47f82017-07-18 16:44:56 +000010169
10170 if (Info->isEntryFunction()) {
10171 // Callable functions have fixed registers used for stack access.
10172 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
10173 }
10174
Matt Arsenaultb812b7a2019-06-05 22:20:47 +000010175 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
10176 Info->getStackPtrOffsetReg()));
10177 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
10178 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
Matt Arsenault1cc47f82017-07-18 16:44:56 +000010179
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +000010180 // We need to worry about replacing the default register with itself in case
10181 // of MIR testcases missing the MFI.
10182 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
10183 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
10184
10185 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
10186 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
10187
10188 if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
10189 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
10190 Info->getScratchWaveOffsetReg());
10191 }
Matt Arsenault1cc47f82017-07-18 16:44:56 +000010192
Stanislav Mekhanoshind4b500c2018-05-31 05:36:04 +000010193 Info->limitOccupancy(MF);
10194
Stanislav Mekhanoshin52500212019-06-16 17:13:09 +000010195 if (ST.isWave32() && !MF.empty()) {
10196    // Add a VCC_HI def because many instructions are marked as implicitly
10197    // using VCC while we may only define VCC_LO. If nothing defines VCC_HI we
10198    // may end up with a use of undef.
10199
10200 const SIInstrInfo *TII = ST.getInstrInfo();
10201 DebugLoc DL;
10202
10203 MachineBasicBlock &MBB = MF.front();
10204 MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
10205 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
10206
10207 for (auto &MBB : MF) {
10208 for (auto &MI : MBB) {
10209 TII->fixImplicitOperands(MI);
10210 }
10211 }
10212 }
10213
Matt Arsenault1cc47f82017-07-18 16:44:56 +000010214 TargetLoweringBase::finalizeLowering(MF);
10215}
Matt Arsenault45b98182017-11-15 00:45:43 +000010216
10217void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
10218 KnownBits &Known,
10219 const APInt &DemandedElts,
10220 const SelectionDAG &DAG,
10221 unsigned Depth) const {
10222 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
10223 DAG, Depth);
10224
Matt Arsenault5c714cb2019-05-23 19:38:14 +000010225 // Set the high bits to zero based on the maximum allowed scratch size per
10226 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
Matt Arsenault45b98182017-11-15 00:45:43 +000010227 // calculation won't overflow, so assume the sign bit is never set.
Matt Arsenault5c714cb2019-05-23 19:38:14 +000010228 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
Matt Arsenault45b98182017-11-15 00:45:43 +000010229}
Tom Stellard264c1712018-06-13 15:06:37 +000010230
Stanislav Mekhanoshin93f15c92019-05-03 21:17:29 +000010231unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
10232 const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
10233 const unsigned CacheLineAlign = 6; // log2(64)
10234
10235  // Pre-GFX10 targets did not benefit from loop alignment.
10236 if (!ML || DisableLoopAlignment ||
10237 (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
10238 getSubtarget()->hasInstFwdPrefetchBug())
10239 return PrefAlign;
10240
10241  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
10242  // By default the prefetcher keeps one cache line behind and reads two ahead.
10243  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
10244  // behind and one ahead.
10245  // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
10246  // If the loop fits in 64 bytes it always spans no more than two cache lines
10247  // and does not need any alignment.
10248  // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch,
10249  // and if it is at most 192 bytes we need two lines behind.
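  // For example, a 100-byte loop only gets its header aligned to a cache line,
  // while a 160-byte loop additionally gets S_INST_PREFETCH instructions in its
  // preheader and exit block.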
10250
10251 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10252 const MachineBasicBlock *Header = ML->getHeader();
10253 if (Header->getAlignment() != PrefAlign)
10254 return Header->getAlignment(); // Already processed.
10255
10256 unsigned LoopSize = 0;
10257 for (const MachineBasicBlock *MBB : ML->blocks()) {
10258    // If an inner loop block is aligned, assume that on average half of the
10259    // alignment size is added as nops.
10260 if (MBB != Header)
10261 LoopSize += (1 << MBB->getAlignment()) / 2;
10262
10263 for (const MachineInstr &MI : *MBB) {
10264 LoopSize += TII->getInstSizeInBytes(MI);
10265 if (LoopSize > 192)
10266 return PrefAlign;
10267 }
10268 }
10269
10270 if (LoopSize <= 64)
10271 return PrefAlign;
10272
10273 if (LoopSize <= 128)
10274 return CacheLineAlign;
10275
10276  // If any of the parent loops is already surrounded by prefetch instructions,
10277  // do not insert new ones for the inner loop; that would reset the parent's settings.
10278 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
10279 if (MachineBasicBlock *Exit = P->getExitBlock()) {
10280 auto I = Exit->getFirstNonDebugInstr();
10281 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
10282 return CacheLineAlign;
10283 }
10284 }
10285
10286 MachineBasicBlock *Pre = ML->getLoopPreheader();
10287 MachineBasicBlock *Exit = ML->getExitBlock();
10288
10289 if (Pre && Exit) {
10290 BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
10291 TII->get(AMDGPU::S_INST_PREFETCH))
10292 .addImm(1); // prefetch 2 lines behind PC
10293
10294 BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
10295 TII->get(AMDGPU::S_INST_PREFETCH))
10296 .addImm(2); // prefetch 1 line behind PC
10297 }
10298
10299 return CacheLineAlign;
10300}
10301
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010302LLVM_ATTRIBUTE_UNUSED
10303static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
10304 assert(N->getOpcode() == ISD::CopyFromReg);
10305 do {
10306 // Follow the chain until we find an INLINEASM node.
10307 N = N->getOperand(0).getNode();
Craig Topper784929d2019-02-08 20:48:56 +000010308 if (N->getOpcode() == ISD::INLINEASM ||
10309 N->getOpcode() == ISD::INLINEASM_BR)
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010310 return true;
10311 } while (N->getOpcode() == ISD::CopyFromReg);
10312 return false;
10313}
10314
Tom Stellard264c1712018-06-13 15:06:37 +000010315bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
Nicolai Haehnle35617ed2018-08-30 14:21:36 +000010316 FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
Tom Stellard264c1712018-06-13 15:06:37 +000010317{
10318 switch (N->getOpcode()) {
Tom Stellard264c1712018-06-13 15:06:37 +000010319 case ISD::CopyFromReg:
10320 {
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010321 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
10322 const MachineFunction * MF = FLI->MF;
10323 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
10324 const MachineRegisterInfo &MRI = MF->getRegInfo();
10325 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
10326 unsigned Reg = R->getReg();
10327 if (TRI.isPhysicalRegister(Reg))
10328 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010329
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010330 if (MRI.isLiveIn(Reg)) {
10331 // workitem.id.x workitem.id.y workitem.id.z
10332 // Any VGPR formal argument is also considered divergent
10333 if (!TRI.isSGPRReg(MRI, Reg))
10334 return true;
10335 // Formal arguments of non-entry functions
10336 // are conservatively considered divergent
10337 else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
10338 return true;
10339 return false;
Tom Stellard264c1712018-06-13 15:06:37 +000010340 }
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010341 const Value *V = FLI->getValueFromVirtualReg(Reg);
10342 if (V)
10343 return KDA->isDivergent(V);
10344 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
10345 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010346 }
10347 break;
10348 case ISD::LOAD: {
Matt Arsenault813613c2018-09-04 18:58:19 +000010349 const LoadSDNode *L = cast<LoadSDNode>(N);
10350 unsigned AS = L->getAddressSpace();
10351 // A flat load may access private memory.
10352 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
Tom Stellard264c1712018-06-13 15:06:37 +000010353 } break;
10354 case ISD::CALLSEQ_END:
10355 return true;
10356 break;
10357 case ISD::INTRINSIC_WO_CHAIN:
10361 return AMDGPU::isIntrinsicSourceOfDivergence(
10362 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
10363 case ISD::INTRINSIC_W_CHAIN:
10364 return AMDGPU::isIntrinsicSourceOfDivergence(
10365 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
10366 // In some cases intrinsics that are a source of divergence have been
10367 // lowered to AMDGPUISD so we also need to check those too.
10368 case AMDGPUISD::INTERP_MOV:
10369 case AMDGPUISD::INTERP_P1:
10370 case AMDGPUISD::INTERP_P2:
10371 return true;
10372 }
10373 return false;
10374}
Matt Arsenaultf8768bf2018-08-06 21:38:27 +000010375
10376bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
10377 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
10378 case MVT::f32:
10379 return Subtarget->hasFP32Denormals();
10380 case MVT::f64:
10381 return Subtarget->hasFP64Denormals();
10382 case MVT::f16:
10383 return Subtarget->hasFP16Denormals();
10384 default:
10385 return false;
10386 }
10387}
Matt Arsenault687ec752018-10-22 16:27:27 +000010388
10389bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
10390 const SelectionDAG &DAG,
10391 bool SNaN,
10392 unsigned Depth) const {
10393 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
Matt Arsenault055e4dc2019-03-29 19:14:54 +000010394 const MachineFunction &MF = DAG.getMachineFunction();
10395 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10396
10397 if (Info->getMode().DX10Clamp)
Matt Arsenault687ec752018-10-22 16:27:27 +000010398 return true; // Clamped to 0.
10399 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
10400 }
10401
10402 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
10403 SNaN, Depth);
10404}
Matt Arsenaulta5840c32019-01-22 18:36:06 +000010405
10406TargetLowering::AtomicExpansionKind
10407SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
10408 switch (RMW->getOperation()) {
10409 case AtomicRMWInst::FAdd: {
10410 Type *Ty = RMW->getType();
10411
10412 // We don't have a way to support 16-bit atomics now, so just leave them
10413 // as-is.
10414 if (Ty->isHalfTy())
10415 return AtomicExpansionKind::None;
10416
10417 if (!Ty->isFloatTy())
10418 return AtomicExpansionKind::CmpXChg;
10419
10420 // TODO: Do have these for flat. Older targets also had them for buffers.
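    // For example, an f32 atomicrmw fadd on an LDS pointer stays a native
    // atomic when the subtarget has LDS FP atomics; on other address spaces it
    // is expanded to a compare-and-swap loop.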
10421 unsigned AS = RMW->getPointerAddressSpace();
10422 return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
10423 AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
10424 }
10425 default:
10426 break;
10427 }
10428
10429 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
10430}