//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

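// Returns the lowest-numbered SGPR that has not yet been allocated by the
// calling convention state, and asserts if every SGPR is already taken.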
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);

    // Unless there are also VOP3P operations, no operations on these types are
    // really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v32i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);

#if 0
  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);

  // Deal with vec5 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Expand);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value; let LLVM add the
  // comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::FLOG, MVT::f16, Custom);
    setOperationAction(ISD::FEXP, MVT::f16, Custom);
    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
  }

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);


  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);


  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTPOP, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);

    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
    }

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);

    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);

    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);

    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);

    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
  }

  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
  setOperationAction(ISD::FABS, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::FMINNUM_IEEE);
  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case, when denormals are enabled, where this would
// still be OK to use, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
         SrcVT.getScalarType() == MVT::f16;
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

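// For non-kernel calling conventions, vector arguments are passed in 32-bit
// pieces: 64-bit elements are split into i32 halves, and 16-bit elements are
// packed two-per-register when the subtarget has 16-bit instructions.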
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  // TODO: Consider splitting all arguments into 32-bit pieces.
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32)
      return ScalarVT.getSimpleVT();

    if (Size == 64)
      return MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    if (Size == 32)
      return NumElts;

    if (Size == 64)
      return 2 * NumElts;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (VT.getVectorNumElements() + 1) / 2;
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size == 64) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = 2 * NumElts;
      return NumIntermediates;
    }

    // FIXME: We should fix the ABI to be the same on targets without 16-bit
823 // support, but unless we can properly handle 3-vectors, it will be still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

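// Work out the in-memory vector type for intrinsics that return an aggregate
// of the form { vector-or-scalar data, i32 } (see the asserts below). The
// element count is padded to make room for the trailing i32 and rounded up to
// a power of two so the result maps onto a legal MVT.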
static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");


  Type *ElementType = nullptr;
  unsigned NumElts;
  if (Ty->getContainedType(0)->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  } else {
    ElementType = Ty->getContainedType(0);
    NumElts = 1;
  }

  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Calculate the size of the memVT type from the aggregate
  unsigned Pow2Elts = 0;
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  default:
    llvm_unreachable("Unknown type!");
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  }
  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false),
                          Pow2Elts);
}

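// Describe the memory behavior of target intrinsics to SelectionDAG so the
// right MachineMemOperands (value type, pointer value, load/store/volatile
// flags) can be attached to the resulting nodes.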
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    if (Attr.hasFnAttribute(Attribute::ReadNone))
      return false;

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (RsrcIntr->IsImage) {
      Info.ptrVal = MFI->getImagePSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
      Info.align = 0;
    } else {
      Info.ptrVal = MFI->getBufferPSV(
        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
        CI.getArgOperand(RsrcIntr->RsrcArg));
    }

    Info.flags = MachineMemOperand::MODereferenceable;
    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType(), true);
      if (Info.memVT == MVT::Other) {
        // Some intrinsics return an aggregate type - special case to work out
        // the correct memVT
        Info.memVT = memVTFromAggregate(CI.getType());
      }
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
      Info.opc = ISD::INTRINSIC_VOID;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getType());
      Info.flags = MachineMemOperand::MOLoad |
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;

      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  default:
    return false;
  }
}

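// Report the pointer operand of memory intrinsics so passes such as
// CodeGenPrepare can sink address computations next to their uses.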
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.

  // GFX10 shrank the signed offset to 12 bits. When using regular flat
  // instructions, the sign bit is also ignored and the offset is treated as an
  // 11-bit unsigned offset.

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;

  // Just r + i
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

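// Global memory addressing: flat-global instructions accept a 13-bit signed
// immediate offset; otherwise fall back to the FLAT or MUBUF rules.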
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
1018 // FIXME: This assumption is currently wrong. On VI we still use
1019 // MUBUF instructions for the r + i addressing mode. As currently
1020 // implemented, the MUBUF instructions only work on buffer < 4GB.
1021 // It may be possible to support > 4GB buffers with MUBUF instructions,
1022 // by setting the stride value in the resource descriptor which would
1023 // increase the size limit to (stride * 4GB). However, this is risky,
1024 // because it has never been validated.
1025 return isLegalFlatAddressingMode(AM);
1026 }
1027
1028 return isLegalMUBUFAddressingMode(AM);
1029}
1030
Matt Arsenault711b3902015-08-07 20:18:34 +00001031bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1032 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1033 // additionally can do r + r + i with addr64. 32-bit has more addressing
1034 // mode options. Depending on the resource constant, it can also do
1035 // (i64 r0) + (i32 r1) * (i14 i).
1036 //
1037 // Private arrays end up using a scratch buffer most of the time, so also
1038 // assume those use MUBUF instructions. Scratch loads / stores are currently
1039 // implemented as mubuf instructions with the offen bit set, so they are
1040 // slightly different from the normal addr64 form.
1041 if (!isUInt<12>(AM.BaseOffs))
1042 return false;
1043
1044 // FIXME: Since we can split immediate into soffset and immediate offset,
1045 // would it make sense to allow any immediate?
1046
1047 switch (AM.Scale) {
1048 case 0: // r + i or just i, depending on HasBaseReg.
1049 return true;
1050 case 1:
1051 return true; // We have r + r or r + i.
1052 case 2:
1053 if (AM.HasBaseReg) {
1054 // Reject 2 * r + r.
1055 return false;
1056 }
1057
1058 // Allow 2 * r as r + r,
1059 // and 2 * r + i as r + r + i.
1060 return true;
1061 default: // Don't allow n * r
1062 return false;
1063 }
1064}
1065
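// Main addressing-mode legality hook, dispatched on address space: global
// accesses use the flat-global/MUBUF rules above, constant accesses use the
// SMRD/SMEM offset limits (falling back to MUBUF for unaligned or sub-dword
// accesses), private uses MUBUF, LDS/GDS allows a 16-bit unsigned offset, and
// flat or unknown address spaces fall back to isLegalFlatAddressingMode.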
Mehdi Amini0cdec1e2015-07-09 02:09:40 +00001066bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1067 const AddrMode &AM, Type *Ty,
Jonas Paulsson024e3192017-07-21 11:59:37 +00001068 unsigned AS, Instruction *I) const {
Matt Arsenault5015a892014-08-15 17:17:07 +00001069 // No global is ever allowed as a base.
1070 if (AM.BaseGV)
1071 return false;
1072
Matt Arsenault0da63502018-08-31 05:49:54 +00001073 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001074 return isLegalGlobalAddressingMode(AM);
Matt Arsenault5015a892014-08-15 17:17:07 +00001075
Matt Arsenault0da63502018-08-31 05:49:54 +00001076 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
Neil Henning523dab02019-03-18 14:44:28 +00001077 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1078 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001079 // If the offset isn't a multiple of 4, it probably isn't going to be
1080 // correctly aligned.
Matt Arsenault3cc1e002016-08-13 01:43:51 +00001081 // FIXME: Can we get the real alignment here?
Matt Arsenault711b3902015-08-07 20:18:34 +00001082 if (AM.BaseOffs % 4 != 0)
1083 return isLegalMUBUFAddressingMode(AM);
1084
1085 // There are no SMRD extloads, so if we have to do a small type access we
1086 // will use a MUBUF load.
1087 // FIXME?: We also need to do this if unaligned, but we don't know the
1088 // alignment here.
Stanislav Mekhanoshin57d341c2018-05-15 22:07:51 +00001089 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
Matt Arsenaultdc8f5cc2017-07-29 01:12:31 +00001090 return isLegalGlobalAddressingMode(AM);
Matt Arsenault711b3902015-08-07 20:18:34 +00001091
Tom Stellard5bfbae52018-07-11 20:59:01 +00001092 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001093 // SMRD instructions have an 8-bit, dword offset on SI.
1094 if (!isUInt<8>(AM.BaseOffs / 4))
1095 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001096 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001097 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1098 // in 8 bits, it can use a smaller encoding.
1099 if (!isUInt<32>(AM.BaseOffs / 4))
1100 return false;
Tom Stellard5bfbae52018-07-11 20:59:01 +00001101 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001102 // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
1103 if (!isUInt<20>(AM.BaseOffs))
1104 return false;
1105 } else
1106 llvm_unreachable("unhandled generation");
1107
1108 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1109 return true;
1110
1111 if (AM.Scale == 1 && AM.HasBaseReg)
1112 return true;
1113
1114 return false;
Matt Arsenault711b3902015-08-07 20:18:34 +00001115
Matt Arsenault0da63502018-08-31 05:49:54 +00001116 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault711b3902015-08-07 20:18:34 +00001117 return isLegalMUBUFAddressingMode(AM);
Matt Arsenault0da63502018-08-31 05:49:54 +00001118 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1119 AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001120 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1121 // field.
1122 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1123 // an 8-bit dword offset but we don't know the alignment here.
1124 if (!isUInt<16>(AM.BaseOffs))
Matt Arsenault5015a892014-08-15 17:17:07 +00001125 return false;
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001126
1127 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1128 return true;
1129
1130 if (AM.Scale == 1 && AM.HasBaseReg)
1131 return true;
1132
Matt Arsenault5015a892014-08-15 17:17:07 +00001133 return false;
Matt Arsenault0da63502018-08-31 05:49:54 +00001134 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1135 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
Matt Arsenault7d1b6c82016-04-29 06:25:10 +00001136 // For an unknown address space, this usually means that this is for some
1137 // reason being used for pure arithmetic, and not based on some addressing
1138 // computation. We don't have instructions that compute pointers with any
1139 // addressing modes, so treat them as having no offset like flat
1140 // instructions.
Tom Stellard70580f82015-07-20 14:28:41 +00001141 return isLegalFlatAddressingMode(AM);
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00001142 } else {
Matt Arsenault73e06fa2015-06-04 16:17:42 +00001143 llvm_unreachable("unhandled address space");
1144 }
Matt Arsenault5015a892014-08-15 17:17:07 +00001145}
1146
Nirav Dave4dcad5d2017-07-10 20:25:54 +00001147bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1148 const SelectionDAG &DAG) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001149 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001150 return (MemVT.getSizeInBits() <= 4 * 32);
Matt Arsenault0da63502018-08-31 05:49:54 +00001151 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001152 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1153 return (MemVT.getSizeInBits() <= MaxPrivateBits);
Matt Arsenault0da63502018-08-31 05:49:54 +00001154 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Nirav Daved20066c2017-05-24 15:59:09 +00001155 return (MemVT.getSizeInBits() <= 2 * 32);
1156 }
1157 return true;
1158}
1159
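// Report whether an unaligned access of the given type and address space is
// allowed, and whether it is expected to be fast. Roughly: LDS/GDS and
// (without unaligned scratch access) private/flat accesses require 4-byte
// alignment, unaligned buffer accesses are allowed when the subtarget
// supports them (though unaligned uniform constant loads are reported as
// slow), and anything smaller than a dword must be aligned.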
Matt Arsenaulte6986632015-01-14 01:35:22 +00001160bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001161 unsigned AddrSpace,
1162 unsigned Align,
1163 bool *IsFast) const {
Matt Arsenault1018c892014-04-24 17:08:26 +00001164 if (IsFast)
1165 *IsFast = false;
1166
Matt Arsenault1018c892014-04-24 17:08:26 +00001167 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1168 // which isn't a simple VT.
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001169 // Until MVT is extended to handle this, simply check for the size and
1170 // rely on the condition below: allow accesses if the size is a multiple of 4.
1171 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1172 VT.getStoreSize() > 16)) {
Tom Stellard81d871d2013-11-13 23:36:50 +00001173 return false;
Alina Sbirlea6f937b12016-08-04 16:38:44 +00001174 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001175
Matt Arsenault0da63502018-08-31 05:49:54 +00001176 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1177 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001178 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1179 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1180 // with adjacent offsets.
Sanjay Patelce74db92015-09-03 15:03:19 +00001181 bool AlignedBy4 = (Align % 4 == 0);
1182 if (IsFast)
1183 *IsFast = AlignedBy4;
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001184
Sanjay Patelce74db92015-09-03 15:03:19 +00001185 return AlignedBy4;
Matt Arsenault6f2a5262014-07-27 17:46:40 +00001186 }
Matt Arsenault1018c892014-04-24 17:08:26 +00001187
Tom Stellard64a9d082016-10-14 18:10:39 +00001188 // FIXME: We have to be conservative here and assume that flat operations
1189 // will access scratch. If we had access to the IR function, then we
1190 // could determine if any private memory was used in the function.
1191 if (!Subtarget->hasUnalignedScratchAccess() &&
Matt Arsenault0da63502018-08-31 05:49:54 +00001192 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1193 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
Matt Arsenaultf4320112018-09-24 13:18:15 +00001194 bool AlignedBy4 = Align >= 4;
1195 if (IsFast)
1196 *IsFast = AlignedBy4;
1197
1198 return AlignedBy4;
Tom Stellard64a9d082016-10-14 18:10:39 +00001199 }
1200
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001201 if (Subtarget->hasUnalignedBufferAccess()) {
1202 // If we have a uniform constant load, it still requires using a slow
1203 // buffer instruction if unaligned.
1204 if (IsFast) {
Matt Arsenault0da63502018-08-31 05:49:54 +00001205 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1206 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Matt Arsenault7f681ac2016-07-01 23:03:44 +00001207 (Align % 4 == 0) : true;
1208 }
1209
1210 return true;
1211 }
1212
Tom Stellard33e64c62015-02-04 20:49:52 +00001213 // Values smaller than a dword must be aligned.
Tom Stellard33e64c62015-02-04 20:49:52 +00001214 if (VT.bitsLT(MVT::i32))
1215 return false;
1216
Matt Arsenault1018c892014-04-24 17:08:26 +00001217 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1218 // byte-address are ignored, thus forcing Dword alignment.
Tom Stellarde812f2f2014-07-21 15:45:06 +00001219 // This applies to private, global, and constant memory.
Matt Arsenault1018c892014-04-24 17:08:26 +00001220 if (IsFast)
1221 *IsFast = true;
Tom Stellardc6b299c2015-02-02 18:02:28 +00001222
1223 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
Tom Stellard0125f2a2013-06-25 02:39:35 +00001224}
1225
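// Pick a wide type for memset/memcpy expansion when the destination is
// sufficiently aligned; e.g. a 16-byte copy with 4-byte destination alignment
// is expanded with v4i32 accesses, an 8-byte one with v2i32.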
Sjoerd Meijer180f1ae2019-04-30 08:38:12 +00001226EVT SITargetLowering::getOptimalMemOpType(
1227 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
1228 bool ZeroMemset, bool MemcpyStrSrc,
1229 const AttributeList &FuncAttributes) const {
Matt Arsenault46645fa2014-07-28 17:49:26 +00001230 // FIXME: Should account for address space here.
1231
1232 // The default fallback uses the private pointer size as a guess for a type to
1233 // use. Make sure we switch these to 64-bit accesses.
1234
1235 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1236 return MVT::v4i32;
1237
1238 if (Size >= 8 && DstAlign >= 4)
1239 return MVT::v2i32;
1240
1241 // Use the default.
1242 return MVT::Other;
1243}
1244
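// Address spaces that share the flat 64-bit addressing; casts between any of
// these are no-ops. Target-specific address spaces above MAX_AMDGPU_ADDRESS
// are conservatively included.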
Matt Arsenault0da63502018-08-31 05:49:54 +00001245static bool isFlatGlobalAddrSpace(unsigned AS) {
1246 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1247 AS == AMDGPUAS::FLAT_ADDRESS ||
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001248 AS == AMDGPUAS::CONSTANT_ADDRESS ||
1249 AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001250}
1251
1252bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1253 unsigned DestAS) const {
Matt Arsenault0da63502018-08-31 05:49:54 +00001254 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
Matt Arsenaultf9bfeaf2015-12-01 23:04:00 +00001255}
1256
Alexander Timofeev18009562016-12-08 17:28:47 +00001257bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1258 const MemSDNode *MemNode = cast<MemSDNode>(N);
1259 const Value *Ptr = MemNode->getMemOperand()->getValue();
Matt Arsenault0a0c8712018-03-27 18:39:45 +00001260 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
Alexander Timofeev18009562016-12-08 17:28:47 +00001261 return I && I->getMetadata("amdgpu.noclobber");
1262}
1263
Matt Arsenault8dbeb922019-06-03 18:41:34 +00001264bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1265 unsigned DestAS) const {
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001266 // Flat -> private/local is a simple truncate.
1267 // Flat -> global is a no-op.
Matt Arsenault0da63502018-08-31 05:49:54 +00001268 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenaultd4da0ed2016-12-02 18:12:53 +00001269 return true;
1270
1271 return isNoopAddrSpaceCast(SrcAS, DestAS);
1272}
1273
Tom Stellarda6f24c62015-12-15 20:55:55 +00001274bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1275 const MemSDNode *MemNode = cast<MemSDNode>(N);
Tom Stellarda6f24c62015-12-15 20:55:55 +00001276
Matt Arsenaultbcf7bec2018-02-09 16:57:48 +00001277 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
Tom Stellarda6f24c62015-12-15 20:55:55 +00001278}
1279
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001280TargetLoweringBase::LegalizeTypeAction
Craig Topper0b5f8162018-11-05 23:26:13 +00001281SITargetLowering::getPreferredVectorAction(MVT VT) const {
Chandler Carruth9d010ff2014-07-03 00:23:43 +00001282 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1283 return TypeSplitVector;
1284
1285 return TargetLoweringBase::getPreferredVectorAction(VT);
Tom Stellardd86003e2013-08-14 23:25:00 +00001286}
Tom Stellard0125f2a2013-06-25 02:39:35 +00001287
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001288bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1289 Type *Ty) const {
Matt Arsenault749035b2016-07-30 01:40:36 +00001290 // FIXME: Could be smarter if called for vector constants.
1291 return true;
Matt Arsenaultd7bdcc42014-03-31 19:54:27 +00001292}
1293
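// With 16-bit instructions available, only the i16 operations listed below
// are considered desirable; other i16 ops are better kept at 32 bits. i1
// setcc is never desirable since there is no i1 setcc instruction.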
Tom Stellard2e045bb2016-01-20 00:13:22 +00001294bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001295 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1296 switch (Op) {
1297 case ISD::LOAD:
1298 case ISD::STORE:
Tom Stellard2e045bb2016-01-20 00:13:22 +00001299
Matt Arsenault7b00cf42016-12-09 17:57:43 +00001300 // These operations are done with 32-bit instructions anyway.
1301 case ISD::AND:
1302 case ISD::OR:
1303 case ISD::XOR:
1304 case ISD::SELECT:
1305 // TODO: Extensions?
1306 return true;
1307 default:
1308 return false;
1309 }
1310 }
Konstantin Zhuravlyove14df4b2016-09-28 20:05:39 +00001311
Tom Stellard2e045bb2016-01-20 00:13:22 +00001312 // SimplifySetCC uses this function to determine whether or not it should
1313 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1314 if (VT == MVT::i1 && Op == ISD::SETCC)
1315 return false;
1316
1317 return TargetLowering::isTypeDesirableForOp(Op, VT);
1318}
1319
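// Build a pointer to a kernel argument: the preloaded KERNARG_SEGMENT_PTR
// register pair plus a byte offset into the kernarg segment.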
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001320SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1321 const SDLoc &SL,
1322 SDValue Chain,
1323 uint64_t Offset) const {
Mehdi Aminia749f2a2015-07-09 02:09:52 +00001324 const DataLayout &DL = DAG.getDataLayout();
Tom Stellardec2e43c2014-09-22 15:35:29 +00001325 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001326 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1327
1328 const ArgDescriptor *InputPtrReg;
1329 const TargetRegisterClass *RC;
1330
1331 std::tie(InputPtrReg, RC)
1332 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Tom Stellard94593ee2013-06-03 17:40:18 +00001333
Matt Arsenault86033ca2014-07-28 17:31:39 +00001334 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
Matt Arsenault0da63502018-08-31 05:49:54 +00001335 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulta0269b62015-06-01 21:58:24 +00001336 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001337 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1338
Matt Arsenault2fb9ccf2018-05-29 17:42:38 +00001339 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
Jan Veselyfea814d2016-06-21 20:46:20 +00001340}
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00001341
Matt Arsenault9166ce82017-07-28 15:52:08 +00001342SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1343 const SDLoc &SL) const {
Matt Arsenault75e71922018-06-28 10:18:55 +00001344 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1345 FIRST_IMPLICIT);
Matt Arsenault9166ce82017-07-28 15:52:08 +00001346 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1347}
1348
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001349SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1350 const SDLoc &SL, SDValue Val,
1351 bool Signed,
Matt Arsenault6dca5422017-01-09 18:52:39 +00001352 const ISD::InputArg *Arg) const {
Tim Renouf361b5b22019-03-21 12:01:21 +00001353 // First, if it is a widened vector, narrow it.
1354 if (VT.isVector() &&
1355 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1356 EVT NarrowedVT =
1357 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1358 VT.getVectorNumElements());
1359 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1360 DAG.getConstant(0, SL, MVT::i32));
1361 }
1362
1363 // Then convert the vector elements or scalar value.
Matt Arsenault6dca5422017-01-09 18:52:39 +00001364 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1365 VT.bitsLT(MemVT)) {
1366 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1367 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1368 }
1369
Tom Stellardbc6c5232016-10-17 16:21:45 +00001370 if (MemVT.isFloatingPoint())
Matt Arsenault6dca5422017-01-09 18:52:39 +00001371 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001372 else if (Signed)
Matt Arsenault6dca5422017-01-09 18:52:39 +00001373 Val = DAG.getSExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001374 else
Matt Arsenault6dca5422017-01-09 18:52:39 +00001375 Val = DAG.getZExtOrTrunc(Val, SL, VT);
Tom Stellardbc6c5232016-10-17 16:21:45 +00001376
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001377 return Val;
1378}
1379
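// Load a kernel argument that lives in the kernarg segment. Sub-dword
// arguments with small alignment are loaded with an aligned 32-bit load and
// the relevant bits are shifted out, so the load can be merged with a
// neighbouring argument's load.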
1380SDValue SITargetLowering::lowerKernargMemParameter(
1381 SelectionDAG &DAG, EVT VT, EVT MemVT,
1382 const SDLoc &SL, SDValue Chain,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00001383 uint64_t Offset, unsigned Align, bool Signed,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001384 const ISD::InputArg *Arg) const {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001385 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00001386 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001387 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1388
Matt Arsenault90083d32018-06-07 09:54:49 +00001389 // Try to avoid using an extload by loading earlier than the argument address,
1390 // and extracting the relevant bits. The load should hopefully be merged with
1391 // the previous argument.
Matt Arsenault4bec7d42018-07-20 09:05:08 +00001392 if (MemVT.getStoreSize() < 4 && Align < 4) {
1393 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
Matt Arsenault90083d32018-06-07 09:54:49 +00001394 int64_t AlignDownOffset = alignDown(Offset, 4);
1395 int64_t OffsetDiff = Offset - AlignDownOffset;
1396
1397 EVT IntVT = MemVT.changeTypeToInteger();
1398
1399 // TODO: If we passed in the base kernel offset we could have a better
1400 // alignment than 4, but we don't really need it.
1401 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1402 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1403 MachineMemOperand::MODereferenceable |
1404 MachineMemOperand::MOInvariant);
1405
1406 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1407 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1408
1409 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1410 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1411 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1412
1413
1414 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1415 }
1416
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001417 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1418 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001419 MachineMemOperand::MODereferenceable |
1420 MachineMemOperand::MOInvariant);
1421
1422 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
Matt Arsenault6dca5422017-01-09 18:52:39 +00001423 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
Tom Stellard94593ee2013-06-03 17:40:18 +00001424}
1425
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001426SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1427 const SDLoc &SL, SDValue Chain,
1428 const ISD::InputArg &Arg) const {
1429 MachineFunction &MF = DAG.getMachineFunction();
1430 MachineFrameInfo &MFI = MF.getFrameInfo();
1431
1432 if (Arg.Flags.isByVal()) {
1433 unsigned Size = Arg.Flags.getByValSize();
1434 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1435 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1436 }
1437
1438 unsigned ArgOffset = VA.getLocMemOffset();
1439 unsigned ArgSize = VA.getValVT().getStoreSize();
1440
1441 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1442
1443 // Create load nodes to retrieve arguments from the stack.
1444 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1445 SDValue ArgValue;
1446
1447 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1448 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1449 MVT MemVT = VA.getValVT();
1450
1451 switch (VA.getLocInfo()) {
1452 default:
1453 break;
1454 case CCValAssign::BCvt:
1455 MemVT = VA.getLocVT();
1456 break;
1457 case CCValAssign::SExt:
1458 ExtType = ISD::SEXTLOAD;
1459 break;
1460 case CCValAssign::ZExt:
1461 ExtType = ISD::ZEXTLOAD;
1462 break;
1463 case CCValAssign::AExt:
1464 ExtType = ISD::EXTLOAD;
1465 break;
1466 }
1467
1468 ArgValue = DAG.getExtLoad(
1469 ExtType, SL, VA.getLocVT(), Chain, FIN,
1470 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1471 MemVT);
1472 return ArgValue;
1473}
1474
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001475SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1476 const SIMachineFunctionInfo &MFI,
1477 EVT VT,
1478 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1479 const ArgDescriptor *Reg;
1480 const TargetRegisterClass *RC;
1481
1482 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1483 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1484}
1485
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001486static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1487 CallingConv::ID CallConv,
1488 ArrayRef<ISD::InputArg> Ins,
1489 BitVector &Skipped,
1490 FunctionType *FType,
1491 SIMachineFunctionInfo *Info) {
1492 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001493 const ISD::InputArg *Arg = &Ins[I];
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001494
Matt Arsenault55ab9212018-08-01 19:57:34 +00001495 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1496 "vector type argument should have been split");
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001497
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001498 // First check if it's a PS input addr.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001499 if (CallConv == CallingConv::AMDGPU_PS &&
1500 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001501
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001502 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1503
1504 // Inconveniently only the first part of the split is marked as isSplit,
1505 // so skip to the end. We only want to increment PSInputNum once for the
1506 // entire split argument.
1507 if (Arg->Flags.isSplit()) {
1508 while (!Arg->Flags.isSplitEnd()) {
1509 assert(!Arg->VT.isVector() &&
1510 "unexpected vector split in ps argument type");
1511 if (!SkipArg)
1512 Splits.push_back(*Arg);
1513 Arg = &Ins[++I];
1514 }
1515 }
1516
1517 if (SkipArg) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001518 // We can safely skip PS inputs.
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001519 Skipped.set(Arg->getOrigArgIndex());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001520 ++PSInputNum;
1521 continue;
1522 }
1523
1524 Info->markPSInputAllocated(PSInputNum);
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00001525 if (Arg->Used)
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001526 Info->markPSInputEnabled(PSInputNum);
1527
1528 ++PSInputNum;
1529 }
1530
Matt Arsenault9ced1e02018-07-31 19:05:14 +00001531 Splits.push_back(*Arg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001532 }
1533}
1534
1535// Allocate special inputs passed in VGPRs.
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001536static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1537 MachineFunction &MF,
1538 const SIRegisterInfo &TRI,
1539 SIMachineFunctionInfo &Info) {
1540 if (Info.hasWorkItemIDX()) {
1541 unsigned Reg = AMDGPU::VGPR0;
1542 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001543
1544 CCInfo.AllocateReg(Reg);
1545 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1546 }
1547
1548 if (Info.hasWorkItemIDY()) {
1549 unsigned Reg = AMDGPU::VGPR1;
1550 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1551
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001552 CCInfo.AllocateReg(Reg);
1553 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1554 }
1555
1556 if (Info.hasWorkItemIDZ()) {
1557 unsigned Reg = AMDGPU::VGPR2;
1558 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1559
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001560 CCInfo.AllocateReg(Reg);
1561 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1562 }
1563}
1564
1565// Try to allocate a VGPR at the end of the argument list, or if no argument
1566// VGPRs are left, allocate a stack slot.
1567static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1568 ArrayRef<MCPhysReg> ArgVGPRs
1569 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1570 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1571 if (RegIdx == ArgVGPRs.size()) {
1572 // Spill to stack required.
1573 int64_t Offset = CCInfo.AllocateStack(4, 4);
1574
1575 return ArgDescriptor::createStack(Offset);
1576 }
1577
1578 unsigned Reg = ArgVGPRs[RegIdx];
1579 Reg = CCInfo.AllocateReg(Reg);
1580 assert(Reg != AMDGPU::NoRegister);
1581
1582 MachineFunction &MF = CCInfo.getMachineFunction();
1583 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1584 return ArgDescriptor::createRegister(Reg);
1585}
1586
1587static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1588 const TargetRegisterClass *RC,
1589 unsigned NumArgRegs) {
1590 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1591 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1592 if (RegIdx == ArgSGPRs.size())
1593 report_fatal_error("ran out of SGPRs for arguments");
1594
1595 unsigned Reg = ArgSGPRs[RegIdx];
1596 Reg = CCInfo.AllocateReg(Reg);
1597 assert(Reg != AMDGPU::NoRegister);
1598
1599 MachineFunction &MF = CCInfo.getMachineFunction();
1600 MF.addLiveIn(Reg, RC);
1601 return ArgDescriptor::createRegister(Reg);
1602}
1603
1604static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1605 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1606}
1607
1608static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1609 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1610}
1611
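// For callable (non-entry) functions the workitem IDs are not preloaded into
// fixed VGPRs; each is passed in the next free argument VGPR, or on the stack
// once all 32 argument VGPRs are taken (see allocateVGPR32Input above).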
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001612static void allocateSpecialInputVGPRs(CCState &CCInfo,
1613 MachineFunction &MF,
1614 const SIRegisterInfo &TRI,
1615 SIMachineFunctionInfo &Info) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001616 if (Info.hasWorkItemIDX())
1617 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001618
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001619 if (Info.hasWorkItemIDY())
1620 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001621
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001622 if (Info.hasWorkItemIDZ())
1623 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1624}
1625
1626static void allocateSpecialInputSGPRs(CCState &CCInfo,
1627 MachineFunction &MF,
1628 const SIRegisterInfo &TRI,
1629 SIMachineFunctionInfo &Info) {
1630 auto &ArgInfo = Info.getArgInfo();
1631
1632 // TODO: Unify handling with private memory pointers.
1633
1634 if (Info.hasDispatchPtr())
1635 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1636
1637 if (Info.hasQueuePtr())
1638 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1639
1640 if (Info.hasKernargSegmentPtr())
1641 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1642
1643 if (Info.hasDispatchID())
1644 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1645
1646 // flat_scratch_init is not applicable for non-kernel functions.
1647
1648 if (Info.hasWorkGroupIDX())
1649 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1650
1651 if (Info.hasWorkGroupIDY())
1652 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1653
1654 if (Info.hasWorkGroupIDZ())
1655 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
Matt Arsenault817c2532017-08-03 23:12:44 +00001656
1657 if (Info.hasImplicitArgPtr())
1658 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001659}
1660
1661// Allocate special inputs passed in user SGPRs.
1662static void allocateHSAUserSGPRs(CCState &CCInfo,
1663 MachineFunction &MF,
1664 const SIRegisterInfo &TRI,
1665 SIMachineFunctionInfo &Info) {
Matt Arsenault10fc0622017-06-26 03:01:31 +00001666 if (Info.hasImplicitBufferPtr()) {
1667 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1668 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1669 CCInfo.AllocateReg(ImplicitBufferPtrReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001670 }
1671
1672 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1673 if (Info.hasPrivateSegmentBuffer()) {
1674 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1675 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1676 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1677 }
1678
1679 if (Info.hasDispatchPtr()) {
1680 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1681 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1682 CCInfo.AllocateReg(DispatchPtrReg);
1683 }
1684
1685 if (Info.hasQueuePtr()) {
1686 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1687 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1688 CCInfo.AllocateReg(QueuePtrReg);
1689 }
1690
1691 if (Info.hasKernargSegmentPtr()) {
1692 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1693 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1694 CCInfo.AllocateReg(InputPtrReg);
1695 }
1696
1697 if (Info.hasDispatchID()) {
1698 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1699 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1700 CCInfo.AllocateReg(DispatchIDReg);
1701 }
1702
1703 if (Info.hasFlatScratchInit()) {
1704 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1705 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1706 CCInfo.AllocateReg(FlatScratchInitReg);
1707 }
1708
1709 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1710 // these from the dispatch pointer.
1711}
1712
1713// Allocate special input registers that are initialized per-wave.
1714static void allocateSystemSGPRs(CCState &CCInfo,
1715 MachineFunction &MF,
1716 SIMachineFunctionInfo &Info,
Marek Olsak584d2c02017-05-04 22:25:20 +00001717 CallingConv::ID CallConv,
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001718 bool IsShader) {
1719 if (Info.hasWorkGroupIDX()) {
1720 unsigned Reg = Info.addWorkGroupIDX();
1721 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1722 CCInfo.AllocateReg(Reg);
1723 }
1724
1725 if (Info.hasWorkGroupIDY()) {
1726 unsigned Reg = Info.addWorkGroupIDY();
1727 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1728 CCInfo.AllocateReg(Reg);
1729 }
1730
1731 if (Info.hasWorkGroupIDZ()) {
1732 unsigned Reg = Info.addWorkGroupIDZ();
1733 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1734 CCInfo.AllocateReg(Reg);
1735 }
1736
1737 if (Info.hasWorkGroupInfo()) {
1738 unsigned Reg = Info.addWorkGroupInfo();
1739 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1740 CCInfo.AllocateReg(Reg);
1741 }
1742
1743 if (Info.hasPrivateSegmentWaveByteOffset()) {
1744 // Scratch wave offset passed in system SGPR.
1745 unsigned PrivateSegmentWaveByteOffsetReg;
1746
1747 if (IsShader) {
Marek Olsak584d2c02017-05-04 22:25:20 +00001748 PrivateSegmentWaveByteOffsetReg =
1749 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1750
1751 // This is true if the scratch wave byte offset doesn't have a fixed
1752 // location.
1753 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1754 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1755 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1756 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001757 } else
1758 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1759
1760 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1761 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1762 }
1763}
1764
1765static void reservePrivateMemoryRegs(const TargetMachine &TM,
1766 MachineFunction &MF,
1767 const SIRegisterInfo &TRI,
Matt Arsenault1cc47f82017-07-18 16:44:56 +00001768 SIMachineFunctionInfo &Info) {
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001769 // Now that we've figured out where the scratch register inputs are, see if
1770 // we should reserve the arguments and use them directly.
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001771 MachineFrameInfo &MFI = MF.getFrameInfo();
1772 bool HasStackObjects = MFI.hasStackObjects();
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001773 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001774
1775 // Record that we know we have non-spill stack objects so we don't need to
1776 // check all stack objects later.
1777 if (HasStackObjects)
1778 Info.setHasNonSpillStackObjects(true);
1779
1780 // Everything live out of a block is spilled with fast regalloc, so it's
1781 // almost certain that spilling will be required.
1782 if (TM.getOptLevel() == CodeGenOpt::None)
1783 HasStackObjects = true;
1784
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001785 // For now assume stack access is needed in any callee functions, so we need
1786 // to pass in the scratch registers.
1787 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1788
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001789 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
1790 // If we have stack objects, we unquestionably need the private buffer
1791 // resource. For the Code Object V2 ABI, this will be the first 4 user
1792 // SGPR inputs. We can reserve those and use them directly.
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001793
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001794 unsigned PrivateSegmentBufferReg =
1795 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1796 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001797 } else {
1798 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001799 // We tentatively reserve the last available registers (skipping those
1800 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
1801 // we'll replace these with the ones immediately after those which were
1802 // really allocated. In the prologue copies will be inserted from the
1803 // argument to these reserved registers.
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001804
1805 // Without HSA, relocations are used for the scratch pointer and the
1806 // buffer resource setup is always inserted in the prologue. Scratch wave
1807 // offset is still in an input SGPR.
1808 Info.setScratchRSrcReg(ReservedBufferReg);
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001809 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001810
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001811 // This should be accurate for kernels even before the frame is finalized.
1812 const bool HasFP = ST.getFrameLowering()->hasFP(MF);
1813 if (HasFP) {
1814 unsigned ReservedOffsetReg =
1815 TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1816 MachineRegisterInfo &MRI = MF.getRegInfo();
1817
1818 // Try to use s32 as the SP, but move it if it would interfere with input
1819 // arguments. This won't work with calls though.
1820 //
1821 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
1822 // registers.
1823 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
1824 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001825 } else {
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001826 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
1827
1828 if (MFI.hasCalls())
1829 report_fatal_error("call in graphics shader with too many input SGPRs");
1830
1831 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
1832 if (!MRI.isLiveIn(Reg)) {
1833 Info.setStackPtrOffsetReg(Reg);
1834 break;
1835 }
1836 }
1837
1838 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
1839 report_fatal_error("failed to find register for SP");
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001840 }
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00001841
1842 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1843 Info.setFrameOffsetReg(ReservedOffsetReg);
1844 } else if (RequiresStackAccess) {
1845 assert(!MFI.hasCalls());
1846 // We know there are accesses and they will be done relative to SP, so just
1847 // pin it to the input.
1848 //
1849 // FIXME: Should not do this if inline asm is reading/writing these
1850 // registers.
1851 unsigned PreloadedSP = Info.getPreloadedReg(
1852 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1853
1854 Info.setStackPtrOffsetReg(PreloadedSP);
1855 Info.setScratchWaveOffsetReg(PreloadedSP);
1856 Info.setFrameOffsetReg(PreloadedSP);
1857 } else {
1858 assert(!MFI.hasCalls());
1859
1860 // There may not be stack access at all. There may still be spills, or
1861 // access of a constant pointer (in which case an extra copy will be
1862 // emitted in the prolog).
1863 unsigned ReservedOffsetReg
1864 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1865 Info.setStackPtrOffsetReg(ReservedOffsetReg);
1866 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1867 Info.setFrameOffsetReg(ReservedOffsetReg);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001868 }
1869}
1870
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00001871bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1872 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1873 return !Info->isEntryFunction();
1874}
1875
1876void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1877
1878}
1879
1880void SITargetLowering::insertCopiesSplitCSR(
1881 MachineBasicBlock *Entry,
1882 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1883 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1884
1885 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1886 if (!IStart)
1887 return;
1888
1889 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1890 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1891 MachineBasicBlock::iterator MBBI = Entry->begin();
1892 for (const MCPhysReg *I = IStart; *I; ++I) {
1893 const TargetRegisterClass *RC = nullptr;
1894 if (AMDGPU::SReg_64RegClass.contains(*I))
1895 RC = &AMDGPU::SGPR_64RegClass;
1896 else if (AMDGPU::SReg_32RegClass.contains(*I))
1897 RC = &AMDGPU::SGPR_32RegClass;
1898 else
1899 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1900
1901 unsigned NewVR = MRI->createVirtualRegister(RC);
1902 // Create copy from CSR to a virtual register.
1903 Entry->addLiveIn(*I);
1904 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1905 .addReg(*I);
1906
1907 // Insert the copy-back instructions right before the terminator.
1908 for (auto *Exit : Exits)
1909 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1910 TII->get(TargetOpcode::COPY), *I)
1911 .addReg(NewVR);
1912 }
1913}
1914
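// Lower incoming formal arguments. Kernels load their arguments from the
// kernarg segment, shaders receive them in registers (with unused PS inputs
// skipped), and callable functions take a mix of registers and stack slots in
// addition to the special input SGPRs/VGPRs allocated above.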
Christian Konig2c8f6d52013-03-07 09:03:52 +00001915SDValue SITargetLowering::LowerFormalArguments(
Eric Christopher7792e322015-01-30 23:24:40 +00001916 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00001917 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1918 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00001919 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001920
1921 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaultceafc552018-05-29 17:42:50 +00001922 const Function &Fn = MF.getFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00001923 FunctionType *FType = MF.getFunction().getFunctionType();
Christian Konig99ee0f42013-03-07 09:04:14 +00001924 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Christian Konig2c8f6d52013-03-07 09:03:52 +00001925
Nicolai Haehnledf3a20c2016-04-06 19:40:20 +00001926 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00001927 DiagnosticInfoUnsupported NoGraphicsHSA(
Matthias Braunf1caa282017-12-15 22:22:58 +00001928 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
Matt Arsenaultd48da142015-11-02 23:23:02 +00001929 DAG.getContext()->diagnose(NoGraphicsHSA);
Diana Picus81bc3172016-05-26 15:24:55 +00001930 return DAG.getEntryNode();
Matt Arsenaultd48da142015-11-02 23:23:02 +00001931 }
1932
Christian Konig2c8f6d52013-03-07 09:03:52 +00001933 SmallVector<ISD::InputArg, 16> Splits;
Christian Konig2c8f6d52013-03-07 09:03:52 +00001934 SmallVector<CCValAssign, 16> ArgLocs;
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001935 BitVector Skipped(Ins.size());
Eric Christopherb5217502014-08-06 18:45:26 +00001936 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1937 *DAG.getContext());
Christian Konig2c8f6d52013-03-07 09:03:52 +00001938
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001939 bool IsShader = AMDGPU::isShader(CallConv);
Matt Arsenaultefa9f4b2017-04-11 22:29:28 +00001940 bool IsKernel = AMDGPU::isKernel(CallConv);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001941 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
Christian Konig99ee0f42013-03-07 09:04:14 +00001942
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001943 if (IsShader) {
1944 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1945
1946 // At least one interpolation mode must be enabled or else the GPU will
1947 // hang.
1948 //
1949 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1950 // set PSInputAddr, the user wants to enable some bits after the compilation
1951 // based on run-time states. Since we can't know what the final PSInputEna
1952 // will look like, we shouldn't do anything here and the user should take
1953 // responsibility for the correct programming.
1954 //
1955 // Otherwise, the following restrictions apply:
1956 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1957 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1958 // enabled too.
Tim Renoufc8ffffe2017-10-12 16:16:41 +00001959 if (CallConv == CallingConv::AMDGPU_PS) {
1960 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1961 ((Info->getPSInputAddr() & 0xF) == 0 &&
1962 Info->isPSInputAllocated(11))) {
1963 CCInfo.AllocateReg(AMDGPU::VGPR0);
1964 CCInfo.AllocateReg(AMDGPU::VGPR1);
1965 Info->markPSInputAllocated(0);
1966 Info->markPSInputEnabled(0);
1967 }
1968 if (Subtarget->isAmdPalOS()) {
1969 // For isAmdPalOS, the user does not enable some bits after compilation
1970 // based on run-time states; the register values being generated here are
1971 // the final ones set in hardware. Therefore we need to apply the
1972 // workaround to PSInputAddr and PSInputEnable together. (The case where
1973 // a bit is set in PSInputAddr but not PSInputEnable is where the
1974 // frontend set up an input arg for a particular interpolation mode, but
1975 // nothing uses that input arg. Really we should have an earlier pass
1976 // that removes such an arg.)
1977 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1978 if ((PsInputBits & 0x7F) == 0 ||
1979 ((PsInputBits & 0xF) == 0 &&
1980 (PsInputBits >> 11 & 1)))
1981 Info->markPSInputEnabled(
1982 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1983 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001984 }
1985
Tom Stellard2f3f9852017-01-25 01:25:13 +00001986 assert(!Info->hasDispatchPtr() &&
Tom Stellardf110f8f2016-04-14 16:27:03 +00001987 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1988 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1989 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1990 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1991 !Info->hasWorkItemIDZ());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001992 } else if (IsKernel) {
1993 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001994 } else {
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00001995 Splits.append(Ins.begin(), Ins.end());
Tom Stellardaf775432013-10-23 00:44:32 +00001996 }
1997
Matt Arsenaulte622dc32017-04-11 22:29:24 +00001998 if (IsEntryFunc) {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00001999 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002000 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
Tom Stellard2f3f9852017-01-25 01:25:13 +00002001 }
2002
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002003 if (IsKernel) {
Tom Stellardbbeb45a2016-09-16 21:53:00 +00002004 analyzeFormalArgumentsCompute(CCInfo, Ins);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002005 } else {
2006 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2007 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2008 }
Christian Konig2c8f6d52013-03-07 09:03:52 +00002009
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002010 SmallVector<SDValue, 16> Chains;
2011
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002012 // FIXME: This is the minimum kernel argument alignment. We should improve
2013 // this to the maximum alignment of the arguments.
2014 //
2015 // FIXME: Alignment of explicit arguments is totally broken with a non-0 explicit
2016 // kern arg offset.
2017 const unsigned KernelArgBaseAlign = 16;
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002018
2019 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002020 const ISD::InputArg &Arg = Ins[i];
Matt Arsenaultd362b6a2018-07-13 16:40:37 +00002021 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
Christian Konigb7be72d2013-05-17 09:46:48 +00002022 InVals.push_back(DAG.getUNDEF(Arg.VT));
Christian Konig99ee0f42013-03-07 09:04:14 +00002023 continue;
2024 }
2025
Christian Konig2c8f6d52013-03-07 09:03:52 +00002026 CCValAssign &VA = ArgLocs[ArgIdx++];
Craig Topper7f416c82014-11-16 21:17:18 +00002027 MVT VT = VA.getLocVT();
Tom Stellarded882c22013-06-03 17:40:11 +00002028
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002029 if (IsEntryFunc && VA.isMemLoc()) {
Tom Stellardaf775432013-10-23 00:44:32 +00002030 VT = Ins[i].VT;
Tom Stellardbbeb45a2016-09-16 21:53:00 +00002031 EVT MemVT = VA.getLocVT();
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002032
Matt Arsenault4bec7d42018-07-20 09:05:08 +00002033 const uint64_t Offset = VA.getLocMemOffset();
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002034 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002035
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002036 SDValue Arg = lowerKernargMemParameter(
Matt Arsenault7b4826e2018-05-30 16:17:51 +00002037 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002038 Chains.push_back(Arg.getValue(1));
Tom Stellardca7ecf32014-08-22 18:49:31 +00002039
Craig Toppere3dcce92015-08-01 22:20:21 +00002040 auto *ParamTy =
Andrew Trick05938a52015-02-16 18:10:47 +00002041 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
Tom Stellard5bfbae52018-07-11 20:59:01 +00002042 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Matt Arsenaultcdd191d2019-01-28 20:14:49 +00002043 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2044 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
Tom Stellardca7ecf32014-08-22 18:49:31 +00002045 // On SI local pointers are just offsets into LDS, so they are always
2046 // less than 16 bits. On CI and newer they could potentially be
2047 // real pointers, so we can't guarantee their size.
2048 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2049 DAG.getValueType(MVT::i16));
2050 }
2051
Tom Stellarded882c22013-06-03 17:40:11 +00002052 InVals.push_back(Arg);
2053 continue;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002054 } else if (!IsEntryFunc && VA.isMemLoc()) {
2055 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2056 InVals.push_back(Val);
2057 if (!Arg.Flags.isByVal())
2058 Chains.push_back(Val.getValue(1));
2059 continue;
Tom Stellarded882c22013-06-03 17:40:11 +00002060 }
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002061
Christian Konig2c8f6d52013-03-07 09:03:52 +00002062 assert(VA.isRegLoc() && "Parameter must be in a register!");
2063
2064 unsigned Reg = VA.getLocReg();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002065 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
Matt Arsenaultb3463552017-07-15 05:52:59 +00002066 EVT ValVT = VA.getValVT();
Christian Konig2c8f6d52013-03-07 09:03:52 +00002067
2068 Reg = MF.addLiveIn(Reg, RC);
2069 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2070
Matt Arsenault5c714cb2019-05-23 19:38:14 +00002071 if (Arg.Flags.isSRet()) {
Matt Arsenault45b98182017-11-15 00:45:43 +00002072 // The return object should be reasonably addressable.
2073
2074 // FIXME: This helps when the return is a real sret. If it is an
2075 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2076 // extra copy is inserted in SelectionDAGBuilder which obscures this.
Matt Arsenault5c714cb2019-05-23 19:38:14 +00002077 unsigned NumBits
2078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Matt Arsenault45b98182017-11-15 00:45:43 +00002079 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2080 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2081 }
2082
Matt Arsenaultb3463552017-07-15 05:52:59 +00002083 // If this is an 8 or 16-bit value, it is really passed promoted
2084 // to 32 bits. Insert an assert[sz]ext to capture this, then
2085 // truncate to the right size.
2086 switch (VA.getLocInfo()) {
2087 case CCValAssign::Full:
2088 break;
2089 case CCValAssign::BCvt:
2090 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2091 break;
2092 case CCValAssign::SExt:
2093 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2094 DAG.getValueType(ValVT));
2095 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2096 break;
2097 case CCValAssign::ZExt:
2098 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2099 DAG.getValueType(ValVT));
2100 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2101 break;
2102 case CCValAssign::AExt:
2103 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2104 break;
2105 default:
2106 llvm_unreachable("Unknown loc info!");
2107 }
2108
Christian Konig2c8f6d52013-03-07 09:03:52 +00002109 InVals.push_back(Val);
2110 }
Tom Stellarde99fb652015-01-20 19:33:04 +00002111
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002112 if (!IsEntryFunc) {
2113 // Special inputs come after user arguments.
2114 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2115 }
2116
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002117 // Start adding system SGPRs.
2118 if (IsEntryFunc) {
2119 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002120 } else {
2121 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2122 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2123 CCInfo.AllocateReg(Info->getFrameOffsetReg());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002124 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002125 }
Matt Arsenaultcf13d182015-07-10 22:51:36 +00002126
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002127 auto &ArgUsageInfo =
2128 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
Matt Arsenaultceafc552018-05-29 17:42:50 +00002129 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002130
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002131 unsigned StackArgSize = CCInfo.getNextStackOffset();
2132 Info->setBytesInStackArgArea(StackArgSize);
2133
Matt Arsenaulte622dc32017-04-11 22:29:24 +00002134 return Chains.empty() ? Chain :
2135 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Christian Konig2c8f6d52013-03-07 09:03:52 +00002136}
2137
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002138// TODO: If return values can't fit in registers, we should return as many as
2139// possible in registers before passing on stack.
2140bool SITargetLowering::CanLowerReturn(
2141 CallingConv::ID CallConv,
2142 MachineFunction &MF, bool IsVarArg,
2143 const SmallVectorImpl<ISD::OutputArg> &Outs,
2144 LLVMContext &Context) const {
2145 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2146 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2147 // for shaders. Vector types should be explicitly handled by CC.
2148 if (AMDGPU::isEntryFunctionCC(CallConv))
2149 return true;
2150
2151 SmallVector<CCValAssign, 16> RVLocs;
2152 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2153 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2154}
2155
Benjamin Kramerbdc49562016-06-12 15:39:02 +00002156SDValue
2157SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2158 bool isVarArg,
2159 const SmallVectorImpl<ISD::OutputArg> &Outs,
2160 const SmallVectorImpl<SDValue> &OutVals,
2161 const SDLoc &DL, SelectionDAG &DAG) const {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002162 MachineFunction &MF = DAG.getMachineFunction();
2163 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2164
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002165 if (AMDGPU::isKernel(CallConv)) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002166 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2167 OutVals, DL, DAG);
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002168 }
2169
2170 bool IsShader = AMDGPU::isShader(CallConv);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002171
Matt Arsenault55ab9212018-08-01 19:57:34 +00002172 Info->setIfReturnsVoid(Outs.empty());
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002173 bool IsWaveEnd = Info->returnsVoid() && IsShader;
Marek Olsak8e9cc632016-01-13 17:23:09 +00002174
Marek Olsak8a0f3352016-01-13 17:23:04 +00002175 // CCValAssign - represent the assignment of the return value to a location.
2176 SmallVector<CCValAssign, 48> RVLocs;
Matt Arsenault55ab9212018-08-01 19:57:34 +00002177 SmallVector<ISD::OutputArg, 48> Splits;
Marek Olsak8a0f3352016-01-13 17:23:04 +00002178
2179 // CCState - Info about the registers and stack slots.
2180 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2181 *DAG.getContext());
2182
2183 // Analyze outgoing return values.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002184 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
Marek Olsak8a0f3352016-01-13 17:23:04 +00002185
2186 SDValue Flag;
2187 SmallVector<SDValue, 48> RetOps;
2188 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2189
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002190 // Add return address for callable functions.
2191 if (!Info->isEntryFunction()) {
2192 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2193 SDValue ReturnAddrReg = CreateLiveInRegister(
2194 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2195
2196 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2197 // from being allocated to a CSR.
2198
2199 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2200 MVT::i64);
2201
2202 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2203 Flag = Chain.getValue(1);
2204
2205 RetOps.push_back(PhysReturnAddrReg);
2206 }
2207
Marek Olsak8a0f3352016-01-13 17:23:04 +00002208 // Copy the result values into the output registers.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002209 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2210 ++I, ++RealRVLocIdx) {
2211 CCValAssign &VA = RVLocs[I];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002212 assert(VA.isRegLoc() && "Can only return in registers!");
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002213 // TODO: Partially return in registers if return values don't fit.
Matt Arsenault55ab9212018-08-01 19:57:34 +00002214 SDValue Arg = OutVals[RealRVLocIdx];
Marek Olsak8a0f3352016-01-13 17:23:04 +00002215
2216 // Copied from other backends.
2217 switch (VA.getLocInfo()) {
Marek Olsak8a0f3352016-01-13 17:23:04 +00002218 case CCValAssign::Full:
2219 break;
2220 case CCValAssign::BCvt:
2221 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2222 break;
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002223 case CCValAssign::SExt:
2224 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2225 break;
2226 case CCValAssign::ZExt:
2227 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::AExt:
2230 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2231 break;
2232 default:
2233 llvm_unreachable("Unknown loc info!");
Marek Olsak8a0f3352016-01-13 17:23:04 +00002234 }
2235
2236 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2237 Flag = Chain.getValue(1);
2238 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2239 }
2240
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002241 // FIXME: Does sret work properly?
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002242 if (!Info->isEntryFunction()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00002243 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002244 const MCPhysReg *I =
2245 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2246 if (I) {
2247 for (; *I; ++I) {
2248 if (AMDGPU::SReg_64RegClass.contains(*I))
2249 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2250 else if (AMDGPU::SReg_32RegClass.contains(*I))
2251 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2252 else
2253 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2254 }
2255 }
2256 }
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002257
Marek Olsak8a0f3352016-01-13 17:23:04 +00002258 // Update chain and glue.
2259 RetOps[0] = Chain;
2260 if (Flag.getNode())
2261 RetOps.push_back(Flag);
2262
Matt Arsenault2b1f9aa2017-05-17 21:56:25 +00002263 unsigned Opc = AMDGPUISD::ENDPGM;
2264 if (!IsWaveEnd)
2265 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
Matt Arsenault9babdf42016-06-22 20:15:28 +00002266 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
Marek Olsak8a0f3352016-01-13 17:23:04 +00002267}
2268
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002269SDValue SITargetLowering::LowerCallResult(
2270 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2271 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2272 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2273 SDValue ThisVal) const {
2274 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2275
2276 // Assign locations to each value returned by this call.
2277 SmallVector<CCValAssign, 16> RVLocs;
2278 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2279 *DAG.getContext());
2280 CCInfo.AnalyzeCallResult(Ins, RetCC);
2281
2282 // Copy all of the result registers out of their specified physreg.
2283 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2284 CCValAssign VA = RVLocs[i];
2285 SDValue Val;
2286
2287 if (VA.isRegLoc()) {
2288 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2289 Chain = Val.getValue(1);
2290 InFlag = Val.getValue(2);
2291 } else if (VA.isMemLoc()) {
2292 report_fatal_error("TODO: return values in memory");
2293 } else
2294 llvm_unreachable("unknown argument location type");
2295
2296 switch (VA.getLocInfo()) {
2297 case CCValAssign::Full:
2298 break;
2299 case CCValAssign::BCvt:
2300 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2301 break;
2302 case CCValAssign::ZExt:
2303 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2304 DAG.getValueType(VA.getValVT()));
2305 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2306 break;
2307 case CCValAssign::SExt:
2308 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2309 DAG.getValueType(VA.getValVT()));
2310 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2311 break;
2312 case CCValAssign::AExt:
2313 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2314 break;
2315 default:
2316 llvm_unreachable("Unknown loc info!");
2317 }
2318
2319 InVals.push_back(Val);
2320 }
2321
2322 return Chain;
2323}
2324
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002325// Add code to pass the special inputs required by the features in use, separate
2326// from the explicit user arguments present in the IR.
2327void SITargetLowering::passSpecialInputs(
2328 CallLoweringInfo &CLI,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002329 CCState &CCInfo,
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002330 const SIMachineFunctionInfo &Info,
2331 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2332 SmallVectorImpl<SDValue> &MemOpChains,
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002333 SDValue Chain) const {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002334 // If we don't have a call site, this was a call inserted by
2335 // legalization. These can never use special inputs.
2336 if (!CLI.CS)
2337 return;
2338
2339 const Function *CalleeFunc = CLI.CS.getCalledFunction();
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002340 assert(CalleeFunc);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002341
2342 SelectionDAG &DAG = CLI.DAG;
2343 const SDLoc &DL = CLI.DL;
2344
Tom Stellardc5a154d2018-06-28 23:47:12 +00002345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002346
2347 auto &ArgUsageInfo =
2348 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2349 const AMDGPUFunctionArgInfo &CalleeArgInfo
2350 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2351
2352 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2353
2354 // TODO: Unify with private memory register handling. This is complicated by
2355 // the fact that at least in kernels, the input argument is not necessarily
2356 // in the same location as the input.
2357 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2358 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2359 AMDGPUFunctionArgInfo::QUEUE_PTR,
2360 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2361 AMDGPUFunctionArgInfo::DISPATCH_ID,
2362 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2363 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2364 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2365 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2366 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
Matt Arsenault817c2532017-08-03 23:12:44 +00002367 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2368 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002369 };
2370
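  // For each special input the callee uses, forward the caller's incoming
  // value (or, for the implicit argument pointer, recompute it) either in the
  // callee's designated register or in a stack slot.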
2371 for (auto InputID : InputRegs) {
2372 const ArgDescriptor *OutgoingArg;
2373 const TargetRegisterClass *ArgRC;
2374
2375 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2376 if (!OutgoingArg)
2377 continue;
2378
2379 const ArgDescriptor *IncomingArg;
2380 const TargetRegisterClass *IncomingArgRC;
2381 std::tie(IncomingArg, IncomingArgRC)
2382 = CallerArgInfo.getPreloadedValue(InputID);
2383 assert(IncomingArgRC == ArgRC);
2384
2385 // All special arguments are ints for now.
2386 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
Matt Arsenault817c2532017-08-03 23:12:44 +00002387 SDValue InputReg;
2388
2389 if (IncomingArg) {
2390 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2391 } else {
2392 // The implicit arg ptr is special because it doesn't have a corresponding
2393 // input for kernels, and is computed from the kernarg segment pointer.
2394 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2395 InputReg = getImplicitArgPtr(DAG, DL);
2396 }
2397
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002398 if (OutgoingArg->isRegister()) {
2399 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2400 } else {
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002401 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2402 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2403 SpecialArgOffset);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002404 MemOpChains.push_back(ArgStore);
2405 }
2406 }
2407}
2408
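// Tail-call optimization can only be guaranteed for the 'fast' calling
// convention.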
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002409static bool canGuaranteeTCO(CallingConv::ID CC) {
2410 return CC == CallingConv::Fast;
2411}
2412
2413/// Return true if we might ever do TCO for calls with this calling convention.
2414static bool mayTailCallThisCC(CallingConv::ID CC) {
2415 switch (CC) {
2416 case CallingConv::C:
2417 return true;
2418 default:
2419 return canGuaranteeTCO(CC);
2420 }
2421}
2422
2423bool SITargetLowering::isEligibleForTailCallOptimization(
2424 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2425 const SmallVectorImpl<ISD::OutputArg> &Outs,
2426 const SmallVectorImpl<SDValue> &OutVals,
2427 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2428 if (!mayTailCallThisCC(CalleeCC))
2429 return false;
2430
2431 MachineFunction &MF = DAG.getMachineFunction();
Matthias Braunf1caa282017-12-15 22:22:58 +00002432 const Function &CallerF = MF.getFunction();
2433 CallingConv::ID CallerCC = CallerF.getCallingConv();
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002434 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2435 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2436
2437 // Kernels aren't callable and don't have a live-in return address, so it
2438 // doesn't make sense to do a tail call with entry functions.
2439 if (!CallerPreserved)
2440 return false;
2441
2442 bool CCMatch = CallerCC == CalleeCC;
2443
2444 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2445 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2446 return true;
2447 return false;
2448 }
2449
2450 // TODO: Can we handle var args?
2451 if (IsVarArg)
2452 return false;
2453
Matthias Braunf1caa282017-12-15 22:22:58 +00002454 for (const Argument &Arg : CallerF.args()) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002455 if (Arg.hasByValAttr())
2456 return false;
2457 }
2458
2459 LLVMContext &Ctx = *DAG.getContext();
2460
2461 // Check that the call results are passed in the same way.
2462 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2463 CCAssignFnForCall(CalleeCC, IsVarArg),
2464 CCAssignFnForCall(CallerCC, IsVarArg)))
2465 return false;
2466
2467 // The callee has to preserve all registers the caller needs to preserve.
2468 if (!CCMatch) {
2469 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2470 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2471 return false;
2472 }
2473
2474 // Nothing more to check if the callee is taking no arguments.
2475 if (Outs.empty())
2476 return true;
2477
2478 SmallVector<CCValAssign, 16> ArgLocs;
2479 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2480
2481 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2482
2483 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2484 // If the stack arguments for this call do not fit into our own save area,
2485 // then the call cannot be made a tail call.
2486 // TODO: Is this really necessary?
2487 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2488 return false;
2489
2490 const MachineRegisterInfo &MRI = MF.getRegInfo();
2491 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2492}
2493
2494bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2495 if (!CI->isTailCall())
2496 return false;
2497
2498 const Function *ParentFn = CI->getParent()->getParent();
2499 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2500 return false;
2501
2502 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2503 return (Attr.getValueAsString() != "true");
2504}
2505
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002506// The wave scratch offset register is used as the global base pointer.
2507SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2508 SmallVectorImpl<SDValue> &InVals) const {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002509 SelectionDAG &DAG = CLI.DAG;
2510 const SDLoc &DL = CLI.DL;
2511 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2512 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2513 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2514 SDValue Chain = CLI.Chain;
2515 SDValue Callee = CLI.Callee;
2516 bool &IsTailCall = CLI.IsTailCall;
2517 CallingConv::ID CallConv = CLI.CallConv;
2518 bool IsVarArg = CLI.IsVarArg;
2519 bool IsSibCall = false;
2520 bool IsThisReturn = false;
2521 MachineFunction &MF = DAG.getMachineFunction();
2522
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002523 if (IsVarArg) {
2524 return lowerUnhandledCall(CLI, InVals,
2525 "unsupported call to variadic function ");
2526 }
2527
Matt Arsenault935f3b72018-08-08 16:58:39 +00002528 if (!CLI.CS.getInstruction())
2529 report_fatal_error("unsupported libcall legalization");
2530
Matt Arsenaulta176cc52017-08-03 23:32:41 +00002531 if (!CLI.CS.getCalledFunction()) {
2532 return lowerUnhandledCall(CLI, InVals,
2533 "unsupported indirect call to function ");
2534 }
2535
2536 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2537 return lowerUnhandledCall(CLI, InVals,
2538 "unsupported required tail call to function ");
2539 }
2540
Matt Arsenault1fb90132018-06-28 10:18:36 +00002541 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2542 // Note the issue is with the CC of the calling function, not of the call
2543 // itself.
2544 return lowerUnhandledCall(CLI, InVals,
2545 "unsupported call from graphics shader of function ");
2546 }
2547
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002548 if (IsTailCall) {
2549 IsTailCall = isEligibleForTailCallOptimization(
2550 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2551 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2552 report_fatal_error("failed to perform tail call elimination on a call "
2553 "site marked musttail");
2554 }
2555
2556 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2557
2558 // A sibling call is one where we're under the usual C ABI and not planning
2559 // to change that but can still do a tail call:
2560 if (!TailCallOpt && IsTailCall)
2561 IsSibCall = true;
2562
2563 if (IsTailCall)
2564 ++NumTailCalls;
2565 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002566
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002567 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2568
2569 // Analyze operands of the call, assigning locations to each operand.
2570 SmallVector<CCValAssign, 16> ArgLocs;
2571 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2572 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002573
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002574 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2575
2576 // Get a count of how many bytes are to be pushed on the stack.
2577 unsigned NumBytes = CCInfo.getNextStackOffset();
2578
2579 if (IsSibCall) {
2580 // Since we're not changing the ABI to make this a tail call, the memory
2581 // operands are already available in the caller's incoming argument space.
2582 NumBytes = 0;
2583 }
2584
2585 // FPDiff is the byte offset of the call's argument area from the callee's.
2586 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2587 // by this amount for a tail call. In a sibling call it must be 0 because the
2588 // caller will deallocate the entire stack and the callee still expects its
2589 // arguments to begin at SP+0. Completely unused for non-tail calls.
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002590 int32_t FPDiff = 0;
2591 MachineFrameInfo &MFI = MF.getFrameInfo();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002592 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2593
Matt Arsenault6efd0822017-09-14 17:14:57 +00002594 SDValue CallerSavedFP;
2595
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002596 // Adjust the stack pointer for the new arguments...
2597 // These operations are automatically eliminated by the prolog/epilog pass
2598 if (!IsSibCall) {
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002599 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002600
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002601 SmallVector<SDValue, 4> CopyFromChains;
2602
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002603 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2604
2605 // In the HSA case, this should be an identity copy.
2606 SDValue ScratchRSrcReg
2607 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2608 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002609 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002610
2611 // TODO: Don't hardcode these registers; get them from the callee function.
2612 SDValue ScratchWaveOffsetReg
2613 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2614 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002615 CopyFromChains.push_back(ScratchWaveOffsetReg.getValue(1));
Matt Arsenault6efd0822017-09-14 17:14:57 +00002616
2617 if (!Info->isEntryFunction()) {
2618 // Avoid clobbering this function's FP value. In the current convention
2619 // the callee will overwrite this, so do a save/restore around the call site.
2620 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2621 Info->getFrameOffsetReg(), MVT::i32);
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002622 CopyFromChains.push_back(CallerSavedFP.getValue(1));
Matt Arsenault6efd0822017-09-14 17:14:57 +00002623 }
Matt Arsenault99e6f4d2019-05-16 15:10:27 +00002624
2625 Chain = DAG.getTokenFactor(DL, CopyFromChains);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002626 }
2627
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002628 SmallVector<SDValue, 8> MemOpChains;
2629 MVT PtrVT = MVT::i32;
2630
2631 // Walk the register/memloc assignments, inserting copies/loads.
2632 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2633 ++i, ++realArgIdx) {
2634 CCValAssign &VA = ArgLocs[i];
2635 SDValue Arg = OutVals[realArgIdx];
2636
2637 // Promote the value if needed.
2638 switch (VA.getLocInfo()) {
2639 case CCValAssign::Full:
2640 break;
2641 case CCValAssign::BCvt:
2642 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2643 break;
2644 case CCValAssign::ZExt:
2645 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2646 break;
2647 case CCValAssign::SExt:
2648 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2649 break;
2650 case CCValAssign::AExt:
2651 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2652 break;
2653 case CCValAssign::FPExt:
2654 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2655 break;
2656 default:
2657 llvm_unreachable("Unknown loc info!");
2658 }
2659
2660 if (VA.isRegLoc()) {
2661 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2662 } else {
2663 assert(VA.isMemLoc());
2664
2665 SDValue DstAddr;
2666 MachinePointerInfo DstInfo;
2667
2668 unsigned LocMemOffset = VA.getLocMemOffset();
2669 int32_t Offset = LocMemOffset;
Matt Arsenaultb655fa92017-11-29 01:25:12 +00002670
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002671 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002672 unsigned Align = 0;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002673
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002674 if (IsTailCall) {
2675 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2676 unsigned OpSize = Flags.isByVal() ?
2677 Flags.getByValSize() : VA.getValVT().getStoreSize();
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002678
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002679 // FIXME: We can have better than the minimum byval required alignment.
2680 Align = Flags.isByVal() ? Flags.getByValAlign() :
2681 MinAlign(Subtarget->getStackAlignment(), Offset);
2682
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002683 Offset = Offset + FPDiff;
2684 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2685
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002686 DstAddr = DAG.getFrameIndex(FI, PtrVT);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002687 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2688
2689 // Make sure any stack arguments overlapping with where we're storing
2690 // are loaded before this eventual operation. Otherwise they'll be
2691 // clobbered.
2692
2693 // FIXME: Why is this really necessary? This seems to just result in a
2694 // lot of code to copy the stack and write them back to the same
2695 // locations, which are supposed to be immutable?
2696 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2697 } else {
2698 DstAddr = PtrOff;
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002699 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002700 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002701 }
2702
2703 if (Outs[i].Flags.isByVal()) {
2704 SDValue SizeNode =
2705 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2706 SDValue Cpy = DAG.getMemcpy(
2707 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2708 /*isVol = */ false, /*AlwaysInline = */ true,
Yaxun Liuc5962262017-11-22 16:13:35 +00002709 /*isTailCall = */ false, DstInfo,
2710 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
Matt Arsenault0da63502018-08-31 05:49:54 +00002711 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002712
2713 MemOpChains.push_back(Cpy);
2714 } else {
Matt Arsenaultff987ac2018-09-13 12:14:31 +00002715 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002716 MemOpChains.push_back(Store);
2717 }
2718 }
2719 }
2720
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002721 // Copy special input registers after user input arguments.
Matt Arsenaultbb8e64e2018-08-22 11:09:45 +00002722 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00002723
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002724 if (!MemOpChains.empty())
2725 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2726
2727 // Build a sequence of copy-to-reg nodes chained together with token chain
2728 // and flag operands which copy the outgoing args into the appropriate regs.
2729 SDValue InFlag;
2730 for (auto &RegToPass : RegsToPass) {
2731 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2732 RegToPass.second, InFlag);
2733 InFlag = Chain.getValue(1);
2734 }
2735
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002736
2737 SDValue PhysReturnAddrReg;
2738 if (IsTailCall) {
2739 // Since the return is being combined with the call, we need to pass on the
2740 // return address.
2741
2742 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2743 SDValue ReturnAddrReg = CreateLiveInRegister(
2744 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2745
2746 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2747 MVT::i64);
2748 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2749 InFlag = Chain.getValue(1);
2750 }
2751
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002752 // We don't usually want to end the call-sequence here because we would tidy
2753 // the frame up *after* the call. However, in the ABI-changing tail-call case
2754 // we've carefully laid out the parameters so that when sp is reset they'll be
2755 // in the correct location.
2756 if (IsTailCall && !IsSibCall) {
2757 Chain = DAG.getCALLSEQ_END(Chain,
2758 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2759 DAG.getTargetConstant(0, DL, MVT::i32),
2760 InFlag, DL);
2761 InFlag = Chain.getValue(1);
2762 }
2763
2764 std::vector<SDValue> Ops;
2765 Ops.push_back(Chain);
2766 Ops.push_back(Callee);
Scott Linderd19d1972019-02-04 20:00:07 +00002767 // Add a redundant copy of the callee global which will not be legalized, as
2768 // we need direct access to the callee later.
2769 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2770 const GlobalValue *GV = GSD->getGlobal();
2771 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002772
2773 if (IsTailCall) {
2774 // Each tail call may have to adjust the stack by a different amount, so
2775 // this information must travel along with the operation for eventual
2776 // consumption by emitEpilogue.
2777 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002778
2779 Ops.push_back(PhysReturnAddrReg);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002780 }
2781
2782 // Add argument registers to the end of the list so that they are known live
2783 // into the call.
2784 for (auto &RegToPass : RegsToPass) {
2785 Ops.push_back(DAG.getRegister(RegToPass.first,
2786 RegToPass.second.getValueType()));
2787 }
2788
2789 // Add a register mask operand representing the call-preserved registers.
2790
Tom Stellardc5a154d2018-06-28 23:47:12 +00002791 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002792 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2793 assert(Mask && "Missing call preserved mask for calling convention");
2794 Ops.push_back(DAG.getRegisterMask(Mask));
2795
2796 if (InFlag.getNode())
2797 Ops.push_back(InFlag);
2798
2799 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2800
2801 // If we're doing a tail call, use a TC_RETURN here rather than an
2802 // actual call instruction.
2803 if (IsTailCall) {
Matt Arsenault71bcbd42017-08-11 20:42:08 +00002804 MFI.setHasTailCall();
2805 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002806 }
2807
2808 // Returns a chain and a flag for retval copy to use.
2809 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2810 Chain = Call.getValue(0);
2811 InFlag = Call.getValue(1);
2812
Matt Arsenault6efd0822017-09-14 17:14:57 +00002813 if (CallerSavedFP) {
2814 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2815 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2816 InFlag = Chain.getValue(1);
2817 }
2818
Matt Arsenaultdefe3712017-09-14 17:37:40 +00002819 uint64_t CalleePopBytes = NumBytes;
2820 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00002821 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2822 InFlag, DL);
2823 if (!Ins.empty())
2824 InFlag = Chain.getValue(1);
2825
2826 // Handle result values, copying them out of physregs into vregs that we
2827 // return.
2828 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2829 InVals, IsThisReturn,
2830 IsThisReturn ? OutVals[0] : SDValue());
2831}
2832
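// Map register names used by the llvm.read_register / llvm.write_register
// intrinsics onto physical registers, rejecting names or types the subtarget
// can't support. As a rough illustration (not code from this file), reading
// EXEC from IR looks something like:
//   declare i64 @llvm.read_register.i64(metadata)
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}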
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002833unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2834 SelectionDAG &DAG) const {
2835 unsigned Reg = StringSwitch<unsigned>(RegName)
2836 .Case("m0", AMDGPU::M0)
2837 .Case("exec", AMDGPU::EXEC)
2838 .Case("exec_lo", AMDGPU::EXEC_LO)
2839 .Case("exec_hi", AMDGPU::EXEC_HI)
2840 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2841 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2842 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2843 .Default(AMDGPU::NoRegister);
2844
2845 if (Reg == AMDGPU::NoRegister) {
2846 report_fatal_error(Twine("invalid register name \""
2847 + StringRef(RegName) + "\"."));
2848
2849 }
2850
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00002851 if ((Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||
2852 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) &&
2853 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
Matt Arsenault9a10cea2016-01-26 04:29:24 +00002854 report_fatal_error(Twine("invalid register \""
2855 + StringRef(RegName) + "\" for subtarget."));
2856 }
2857
2858 switch (Reg) {
2859 case AMDGPU::M0:
2860 case AMDGPU::EXEC_LO:
2861 case AMDGPU::EXEC_HI:
2862 case AMDGPU::FLAT_SCR_LO:
2863 case AMDGPU::FLAT_SCR_HI:
2864 if (VT.getSizeInBits() == 32)
2865 return Reg;
2866 break;
2867 case AMDGPU::EXEC:
2868 case AMDGPU::FLAT_SCR:
2869 if (VT.getSizeInBits() == 64)
2870 return Reg;
2871 break;
2872 default:
2873 llvm_unreachable("missing register type checking");
2874 }
2875
2876 report_fatal_error(Twine("invalid type for register \""
2877 + StringRef(RegName) + "\"."));
2878}
2879
Matt Arsenault786724a2016-07-12 21:41:32 +00002880// If kill is not the last instruction, split the block so kill is always a
2881// proper terminator.
2882MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2883 MachineBasicBlock *BB) const {
2884 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2885
2886 MachineBasicBlock::iterator SplitPoint(&MI);
2887 ++SplitPoint;
2888
2889 if (SplitPoint == BB->end()) {
2890 // Don't bother with a new block.
Marek Olsakce76ea02017-10-24 10:27:13 +00002891 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002892 return BB;
2893 }
2894
2895 MachineFunction *MF = BB->getParent();
2896 MachineBasicBlock *SplitBB
2897 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2898
Matt Arsenault786724a2016-07-12 21:41:32 +00002899 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2900 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2901
Matt Arsenaultd40ded62016-07-22 17:01:15 +00002902 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
Matt Arsenault786724a2016-07-12 21:41:32 +00002903 BB->addSuccessor(SplitBB);
2904
Marek Olsakce76ea02017-10-24 10:27:13 +00002905 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
Matt Arsenault786724a2016-07-12 21:41:32 +00002906 return SplitBB;
2907}
2908
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002909// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2910// wavefront. If the value is uniform and just happens to be in a VGPR, this
2911// will only do one iteration. In the worst case, this will loop 64 times.
2912//
2913// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
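// Each iteration uses V_READFIRSTLANE_B32 to pick one index value, activates
// only the lanes holding that value, sets up the index (M0 or GPR-index
// mode), then clears those lanes from EXEC and loops while any remain.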
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002914static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2915 const SIInstrInfo *TII,
2916 MachineRegisterInfo &MRI,
2917 MachineBasicBlock &OrigBB,
2918 MachineBasicBlock &LoopBB,
2919 const DebugLoc &DL,
2920 const MachineOperand &IdxReg,
2921 unsigned InitReg,
2922 unsigned ResultReg,
2923 unsigned PhiReg,
2924 unsigned InitSaveExecReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002925 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002926 bool UseGPRIdxMode,
2927 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002928 MachineBasicBlock::iterator I = LoopBB.begin();
2929
2930 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2931 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2932 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2933 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2934
2935 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2936 .addReg(InitReg)
2937 .addMBB(&OrigBB)
2938 .addReg(ResultReg)
2939 .addMBB(&LoopBB);
2940
2941 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2942 .addReg(InitSaveExecReg)
2943 .addMBB(&OrigBB)
2944 .addReg(NewExec)
2945 .addMBB(&LoopBB);
2946
2947 // Read the next variant <- also loop target.
2948 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2949 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2950
2951 // Compare the just read M0 value to all possible Idx values.
2952 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2953 .addReg(CurrentIdxReg)
Matt Arsenaultf0ba86a2016-07-21 09:40:57 +00002954 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002955
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002956 // Update EXEC, save the original EXEC value into NewExec.
2957 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2958 .addReg(CondReg, RegState::Kill);
2959
2960 MRI.setSimpleHint(NewExec, CondReg);
2961
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002962 if (UseGPRIdxMode) {
2963 unsigned IdxReg;
2964 if (Offset == 0) {
2965 IdxReg = CurrentIdxReg;
2966 } else {
2967 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2968 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2969 .addReg(CurrentIdxReg, RegState::Kill)
2970 .addImm(Offset);
2971 }
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002972 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00002973 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Changpeng Fangda38b5f2018-02-16 16:31:30 +00002974 MachineInstr *SetOn =
2975 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2976 .addReg(IdxReg, RegState::Kill)
2977 .addImm(IdxMode);
2978 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002979 } else {
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00002980 // Move index from VCC into M0
2981 if (Offset == 0) {
2982 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2983 .addReg(CurrentIdxReg, RegState::Kill);
2984 } else {
2985 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2986 .addReg(CurrentIdxReg, RegState::Kill)
2987 .addImm(Offset);
2988 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002989 }
2990
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002991 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00002992 MachineInstr *InsertPt =
Scott Lindere2c58472019-02-05 19:50:32 +00002993 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00002994 .addReg(AMDGPU::EXEC)
2995 .addReg(NewExec);
2996
2997 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2998 // s_cbranch_scc0?
2999
3000 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3001 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3002 .addMBB(&LoopBB);
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003003
3004 return InsertPt->getIterator();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003005}
3006
3007// This has slightly sub-optimal regalloc when the source vector is killed by
3008// the read. The register allocator does not understand that the kill is
3009// per-workitem, so the vector is kept alive for the whole loop and we end up not re-using a
3010// subregister from it, using 1 more VGPR than necessary. This cost was avoided when
3011// this was expanded after register allocation.
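// Saves EXEC, splits the block around a new loop built by
// emitLoadM0FromVGPRLoop, and restores EXEC in the remainder block once all
// unique index values have been handled.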
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003012static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
3013 MachineBasicBlock &MBB,
3014 MachineInstr &MI,
3015 unsigned InitResultReg,
3016 unsigned PhiReg,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003017 int Offset,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003018 bool UseGPRIdxMode,
3019 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003020 MachineFunction *MF = MBB.getParent();
3021 MachineRegisterInfo &MRI = MF->getRegInfo();
3022 const DebugLoc &DL = MI.getDebugLoc();
3023 MachineBasicBlock::iterator I(&MI);
3024
3025 unsigned DstReg = MI.getOperand(0).getReg();
Matt Arsenault301162c2017-11-15 21:51:43 +00003026 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3027 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003028
3029 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3030
3031 // Save the EXEC mask
3032 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
3033 .addReg(AMDGPU::EXEC);
3034
3035 // To insert the loop we need to split the block. Move everything after this
3036 // point to a new block, and insert a new empty block between the two.
3037 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3038 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3039 MachineFunction::iterator MBBI(MBB);
3040 ++MBBI;
3041
3042 MF->insert(MBBI, LoopBB);
3043 MF->insert(MBBI, RemainderBB);
3044
3045 LoopBB->addSuccessor(LoopBB);
3046 LoopBB->addSuccessor(RemainderBB);
3047
3048 // Move the rest of the block into a new block.
Matt Arsenaultd40ded62016-07-22 17:01:15 +00003049 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003050 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3051
3052 MBB.addSuccessor(LoopBB);
3053
3054 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3055
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003056 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3057 InitResultReg, DstReg, PhiReg, TmpExec,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003058 Offset, UseGPRIdxMode, IsIndirectSrc);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003059
3060 MachineBasicBlock::iterator First = RemainderBB->begin();
3061 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3062 .addReg(SaveExec);
3063
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003064 return InsPt;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003065}
3066
3067// Returns subreg index, offset
3068static std::pair<unsigned, int>
3069computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3070 const TargetRegisterClass *SuperRC,
3071 unsigned VecReg,
3072 int Offset) {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003073 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003074
3075 // Skip out of bounds offsets, or else we would end up using an undefined
3076 // register.
3077 if (Offset >= NumElts || Offset < 0)
3078 return std::make_pair(AMDGPU::sub0, Offset);
3079
3080 return std::make_pair(AMDGPU::sub0 + Offset, 0);
3081}
3082
3083// Return true if the index is an SGPR and was set.
3084static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3085 MachineRegisterInfo &MRI,
3086 MachineInstr &MI,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003087 int Offset,
3088 bool UseGPRIdxMode,
3089 bool IsIndirectSrc) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003090 MachineBasicBlock *MBB = MI.getParent();
3091 const DebugLoc &DL = MI.getDebugLoc();
3092 MachineBasicBlock::iterator I(&MI);
3093
3094 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3095 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3096
3097 assert(Idx->getReg() != AMDGPU::NoRegister);
3098
3099 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3100 return false;
3101
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003102 if (UseGPRIdxMode) {
3103 unsigned IdxMode = IsIndirectSrc ?
Dmitry Preobrazhenskyef920352019-02-27 13:12:12 +00003104 AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003105 if (Offset == 0) {
3106 MachineInstr *SetOn =
Diana Picus116bbab2017-01-13 09:58:52 +00003107 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3108 .add(*Idx)
3109 .addImm(IdxMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003110
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003111 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003112 } else {
3113 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3114 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
Diana Picus116bbab2017-01-13 09:58:52 +00003115 .add(*Idx)
3116 .addImm(Offset);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003117 MachineInstr *SetOn =
3118 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3119 .addReg(Tmp, RegState::Kill)
3120 .addImm(IdxMode);
3121
Matt Arsenaultdac31db2016-10-13 12:45:16 +00003122 SetOn->getOperand(3).setIsUndef();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003123 }
3124
3125 return true;
3126 }
3127
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003128 if (Offset == 0) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003129 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3130 .add(*Idx);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003131 } else {
3132 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00003133 .add(*Idx)
3134 .addImm(Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003135 }
3136
3137 return true;
3138}
3139
3140// Control flow needs to be inserted if indexing with a VGPR.
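// A uniform (SGPR) index is handled in place via M0 or GPR-index mode; a
// divergent (VGPR) index falls back to the waterfall loop built by
// loadM0FromVGPR.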
3141static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3142 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003143 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003144 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003145 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3146 MachineFunction *MF = MBB.getParent();
3147 MachineRegisterInfo &MRI = MF->getRegInfo();
3148
3149 unsigned Dst = MI.getOperand(0).getReg();
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003150 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003151 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3152
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003153 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003154
3155 unsigned SubReg;
3156 std::tie(SubReg, Offset)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003157 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003158
Marek Olsake22fdb92017-03-21 17:00:32 +00003159 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003160
3161 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003162 MachineBasicBlock::iterator I(&MI);
3163 const DebugLoc &DL = MI.getDebugLoc();
3164
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003165 if (UseGPRIdxMode) {
3166 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3167 // to avoid interfering with other uses, so probably requires a new
3168 // optimization pass.
3169 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003170 .addReg(SrcReg, RegState::Undef, SubReg)
3171 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003172 .addReg(AMDGPU::M0, RegState::Implicit);
3173 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3174 } else {
3175 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003176 .addReg(SrcReg, RegState::Undef, SubReg)
3177 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003178 }
3179
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003180 MI.eraseFromParent();
3181
3182 return &MBB;
3183 }
3184
3185 const DebugLoc &DL = MI.getDebugLoc();
3186 MachineBasicBlock::iterator I(&MI);
3187
3188 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3189 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3190
3191 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3192
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003193 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3194 Offset, UseGPRIdxMode, true);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003195 MachineBasicBlock *LoopBB = InsPt->getParent();
3196
3197 if (UseGPRIdxMode) {
3198 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003199 .addReg(SrcReg, RegState::Undef, SubReg)
3200 .addReg(SrcReg, RegState::Implicit)
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003201 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003202 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003203 } else {
3204 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003205 .addReg(SrcReg, RegState::Undef, SubReg)
3206 .addReg(SrcReg, RegState::Implicit);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003207 }
3208
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003209 MI.eraseFromParent();
3210
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003211 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003212}
3213
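// Pick the V_MOVRELD_B32 pseudo matching the size of the vector register
// class, measured in 32-bit lanes.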
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003214static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3215 const TargetRegisterClass *VecRC) {
3216 switch (TRI.getRegSizeInBits(*VecRC)) {
3217 case 32: // 4 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003218 return AMDGPU::V_MOVRELD_B32_V1;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003219 case 64: // 8 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003220 return AMDGPU::V_MOVRELD_B32_V2;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003221 case 128: // 16 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003222 return AMDGPU::V_MOVRELD_B32_V4;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003223 case 256: // 32 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003224 return AMDGPU::V_MOVRELD_B32_V8;
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003225 case 512: // 64 bytes
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003226 return AMDGPU::V_MOVRELD_B32_V16;
3227 default:
3228 llvm_unreachable("unsupported size for MOVRELD pseudos");
3229 }
3230}
3231
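// Insert an element into a vector register at a given index. Mirrors
// emitIndirectSrc: a known constant index becomes a plain INSERT_SUBREG, an
// SGPR index goes through M0 or GPR-index mode, and a VGPR index needs the
// waterfall loop.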
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003232static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3233 MachineBasicBlock &MBB,
Tom Stellard5bfbae52018-07-11 20:59:01 +00003234 const GCNSubtarget &ST) {
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003235 const SIInstrInfo *TII = ST.getInstrInfo();
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003236 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3237 MachineFunction *MF = MBB.getParent();
3238 MachineRegisterInfo &MRI = MF->getRegInfo();
3239
3240 unsigned Dst = MI.getOperand(0).getReg();
3241 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3242 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3243 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3244 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3245 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3246
3247 // This can be an immediate, but will be folded later.
3248 assert(Val->getReg());
3249
3250 unsigned SubReg;
3251 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3252 SrcVec->getReg(),
3253 Offset);
Marek Olsake22fdb92017-03-21 17:00:32 +00003254 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003255
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003256 if (Idx->getReg() == AMDGPU::NoRegister) {
3257 MachineBasicBlock::iterator I(&MI);
3258 const DebugLoc &DL = MI.getDebugLoc();
3259
3260 assert(Offset == 0);
3261
3262 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
Diana Picus116bbab2017-01-13 09:58:52 +00003263 .add(*SrcVec)
3264 .add(*Val)
3265 .addImm(SubReg);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003266
3267 MI.eraseFromParent();
3268 return &MBB;
3269 }
3270
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003271 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003272 MachineBasicBlock::iterator I(&MI);
3273 const DebugLoc &DL = MI.getDebugLoc();
3274
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003275 if (UseGPRIdxMode) {
3276 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003277 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3278 .add(*Val)
3279 .addReg(Dst, RegState::ImplicitDefine)
3280 .addReg(SrcVec->getReg(), RegState::Implicit)
3281 .addReg(AMDGPU::M0, RegState::Implicit);
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003282
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003283 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3284 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003285 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003286
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003287 BuildMI(MBB, I, DL, MovRelDesc)
3288 .addReg(Dst, RegState::Define)
3289 .addReg(SrcVec->getReg())
Diana Picus116bbab2017-01-13 09:58:52 +00003290 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003291 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003292 }
3293
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003294 MI.eraseFromParent();
3295 return &MBB;
3296 }
3297
3298 if (Val->isReg())
3299 MRI.clearKillFlags(Val->getReg());
3300
3301 const DebugLoc &DL = MI.getDebugLoc();
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003302
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003303 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3304
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003305 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003306 Offset, UseGPRIdxMode, false);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003307 MachineBasicBlock *LoopBB = InsPt->getParent();
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003308
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003309 if (UseGPRIdxMode) {
3310 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
Diana Picus116bbab2017-01-13 09:58:52 +00003311 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3312 .add(*Val) // src0
3313 .addReg(Dst, RegState::ImplicitDefine)
3314 .addReg(PhiReg, RegState::Implicit)
3315 .addReg(AMDGPU::M0, RegState::Implicit);
Changpeng Fangda38b5f2018-02-16 16:31:30 +00003316 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003317 } else {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +00003318 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003319
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003320 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3321 .addReg(Dst, RegState::Define)
3322 .addReg(PhiReg)
Diana Picus116bbab2017-01-13 09:58:52 +00003323 .add(*Val)
Nicolai Haehnlea7852092016-10-24 14:56:02 +00003324 .addImm(SubReg - AMDGPU::sub0);
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003325 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003326
Nicolai Haehnlebd15c322016-10-14 09:03:04 +00003327 MI.eraseFromParent();
3328
Matt Arsenaultd486d3f2016-10-12 18:49:05 +00003329 return LoopBB;
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003330}
3331
Matt Arsenault786724a2016-07-12 21:41:32 +00003332MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3333 MachineInstr &MI, MachineBasicBlock *BB) const {
Tom Stellard244891d2016-12-20 15:52:17 +00003334
3335 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3336 MachineFunction *MF = BB->getParent();
3337 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3338
3339 if (TII->isMIMG(MI)) {
Matt Arsenault905f3512017-12-29 17:18:14 +00003340 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3341 report_fatal_error("missing mem operand from MIMG instruction");
3342 }
Tom Stellard244891d2016-12-20 15:52:17 +00003343 // Add a memoperand for mimg instructions so that they aren't assumed to
3344 // be ordered memory instructions.
3345
Tom Stellard244891d2016-12-20 15:52:17 +00003346 return BB;
3347 }
3348
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003349 switch (MI.getOpcode()) {
Matt Arsenault301162c2017-11-15 21:51:43 +00003350 case AMDGPU::S_ADD_U64_PSEUDO:
3351 case AMDGPU::S_SUB_U64_PSEUDO: {
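    // Split the 64-bit scalar add/sub into 32-bit halves, with the high half
    // consuming the carry/borrow produced by the low half. Schematically, for
    // the add case:
    //   s_add_u32  dst.sub0, src0.sub0, src1.sub0
    //   s_addc_u32 dst.sub1, src0.sub1, src1.sub1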
3352 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3353 const DebugLoc &DL = MI.getDebugLoc();
3354
3355 MachineOperand &Dest = MI.getOperand(0);
3356 MachineOperand &Src0 = MI.getOperand(1);
3357 MachineOperand &Src1 = MI.getOperand(2);
3358
3359 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3360 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3361
3362 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3363 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3364 &AMDGPU::SReg_32_XM0RegClass);
3365 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3366 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3367 &AMDGPU::SReg_32_XM0RegClass);
3368
3369 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3370 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3371 &AMDGPU::SReg_32_XM0RegClass);
3372 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3373 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3374 &AMDGPU::SReg_32_XM0RegClass);
3375
3376 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3377
3378 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3379 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3380 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3381 .add(Src0Sub0)
3382 .add(Src1Sub0);
3383 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3384 .add(Src0Sub1)
3385 .add(Src1Sub1);
3386 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3387 .addReg(DestSub0)
3388 .addImm(AMDGPU::sub0)
3389 .addReg(DestSub1)
3390 .addImm(AMDGPU::sub1);
3391 MI.eraseFromParent();
3392 return BB;
3393 }
3394 case AMDGPU::SI_INIT_M0: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003395 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
Matt Arsenault4ac341c2016-04-14 21:58:15 +00003396 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Diana Picus116bbab2017-01-13 09:58:52 +00003397 .add(MI.getOperand(0));
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003398 MI.eraseFromParent();
Matt Arsenault20711b72015-02-20 22:10:45 +00003399 return BB;
Matt Arsenault301162c2017-11-15 21:51:43 +00003400 }
Marek Olsak2d825902017-04-28 20:21:58 +00003401 case AMDGPU::SI_INIT_EXEC:
3402 // This should be before all vector instructions.
3403 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3404 AMDGPU::EXEC)
3405 .addImm(MI.getOperand(0).getImm());
3406 MI.eraseFromParent();
3407 return BB;
3408
3409 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3410 // Extract the thread count from an SGPR input and set EXEC accordingly.
3411 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3412 //
3413 // S_BFE_U32 count, input, {shift, 7}
3414 // S_BFM_B64 exec, count, 0
3415 // S_CMP_EQ_U32 count, 64
3416 // S_CMOV_B64 exec, -1
3417 MachineInstr *FirstMI = &*BB->begin();
3418 MachineRegisterInfo &MRI = MF->getRegInfo();
3419 unsigned InputReg = MI.getOperand(0).getReg();
3420 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3421 bool Found = false;
3422
3423 // Move the COPY of the input reg to the beginning, so that we can use it.
3424 for (auto I = BB->begin(); I != &MI; I++) {
3425 if (I->getOpcode() != TargetOpcode::COPY ||
3426 I->getOperand(0).getReg() != InputReg)
3427 continue;
3428
3429 if (I == FirstMI) {
3430 FirstMI = &*++BB->begin();
3431 } else {
3432 I->removeFromParent();
3433 BB->insert(FirstMI, &*I);
3434 }
3435 Found = true;
3436 break;
3437 }
3438 assert(Found);
Davide Italiano0dcc0152017-05-11 19:58:52 +00003439 (void)Found;
Marek Olsak2d825902017-04-28 20:21:58 +00003440
3441 // This should be before all vector instructions.
3442 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3443 .addReg(InputReg)
3444 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3445 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3446 AMDGPU::EXEC)
3447 .addReg(CountReg)
3448 .addImm(0);
3449 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3450 .addReg(CountReg, RegState::Kill)
3451 .addImm(64);
3452 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3453 AMDGPU::EXEC)
3454 .addImm(-1);
3455 MI.eraseFromParent();
3456 return BB;
3457 }
3458
Changpeng Fang01f60622016-03-15 17:28:44 +00003459 case AMDGPU::GET_GROUPSTATICSIZE: {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003460 DebugLoc DL = MI.getDebugLoc();
Matt Arsenault3c07c812016-07-22 17:01:33 +00003461 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
Diana Picus116bbab2017-01-13 09:58:52 +00003462 .add(MI.getOperand(0))
3463 .addImm(MFI->getLDSSize());
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00003464 MI.eraseFromParent();
Changpeng Fang01f60622016-03-15 17:28:44 +00003465 return BB;
3466 }
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003467 case AMDGPU::SI_INDIRECT_SRC_V1:
3468 case AMDGPU::SI_INDIRECT_SRC_V2:
3469 case AMDGPU::SI_INDIRECT_SRC_V4:
3470 case AMDGPU::SI_INDIRECT_SRC_V8:
3471 case AMDGPU::SI_INDIRECT_SRC_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003472 return emitIndirectSrc(MI, *BB, *getSubtarget());
Matt Arsenaultcb540bc2016-07-19 00:35:03 +00003473 case AMDGPU::SI_INDIRECT_DST_V1:
3474 case AMDGPU::SI_INDIRECT_DST_V2:
3475 case AMDGPU::SI_INDIRECT_DST_V4:
3476 case AMDGPU::SI_INDIRECT_DST_V8:
3477 case AMDGPU::SI_INDIRECT_DST_V16:
Matt Arsenaultdcf0cfc2016-10-04 01:41:05 +00003478 return emitIndirectDst(MI, *BB, *getSubtarget());
Marek Olsakce76ea02017-10-24 10:27:13 +00003479 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3480 case AMDGPU::SI_KILL_I1_PSEUDO:
Matt Arsenault786724a2016-07-12 21:41:32 +00003481 return splitKillBlock(MI, BB);
Matt Arsenault22e41792016-08-27 01:00:37 +00003482 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3483 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Matt Arsenault22e41792016-08-27 01:00:37 +00003484
3485 unsigned Dst = MI.getOperand(0).getReg();
3486 unsigned Src0 = MI.getOperand(1).getReg();
3487 unsigned Src1 = MI.getOperand(2).getReg();
3488 const DebugLoc &DL = MI.getDebugLoc();
3489 unsigned SrcCond = MI.getOperand(3).getReg();
3490
3491 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3492 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003493 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
Matt Arsenault22e41792016-08-27 01:00:37 +00003494
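    // Lower the 64-bit select by selecting each 32-bit half with V_CNDMASK_B32
    // on the same condition, then recombining the halves with a REG_SEQUENCE.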
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003495 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3496 .addReg(SrcCond);
Matt Arsenault22e41792016-08-27 01:00:37 +00003497 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003498 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003499 .addReg(Src0, 0, AMDGPU::sub0)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003500 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003501 .addReg(Src1, 0, AMDGPU::sub0)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003502 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003503 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003504 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003505 .addReg(Src0, 0, AMDGPU::sub1)
Tim Renouf2e94f6e2019-03-18 19:25:39 +00003506 .addImm(0)
Matt Arsenault22e41792016-08-27 01:00:37 +00003507 .addReg(Src1, 0, AMDGPU::sub1)
Nicolai Haehnlece4ddd02017-09-29 15:37:31 +00003508 .addReg(SrcCondCopy);
Matt Arsenault22e41792016-08-27 01:00:37 +00003509
3510 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3511 .addReg(DstLo)
3512 .addImm(AMDGPU::sub0)
3513 .addReg(DstHi)
3514 .addImm(AMDGPU::sub1);
3515 MI.eraseFromParent();
3516 return BB;
3517 }
Matt Arsenault327188a2016-12-15 21:57:11 +00003518 case AMDGPU::SI_BR_UNDEF: {
3519 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3520 const DebugLoc &DL = MI.getDebugLoc();
3521 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
Diana Picus116bbab2017-01-13 09:58:52 +00003522 .add(MI.getOperand(0));
Matt Arsenault327188a2016-12-15 21:57:11 +00003523 Br->getOperand(1).setIsUndef(true); // read undef SCC
3524 MI.eraseFromParent();
3525 return BB;
3526 }
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003527 case AMDGPU::ADJCALLSTACKUP:
3528 case AMDGPU::ADJCALLSTACKDOWN: {
3529 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3530 MachineInstrBuilder MIB(*MF, &MI);
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003531
3532 // Add an implicit use of the frame offset reg to prevent the restore copy
3533 // inserted after the call from being reordered after stack operations in
3534 // the caller's frame.
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003535 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
Matt Arsenaulte9f36792018-03-27 18:38:51 +00003536 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3537 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003538 return BB;
3539 }
Scott Linderd19d1972019-02-04 20:00:07 +00003540 case AMDGPU::SI_CALL_ISEL: {
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003541 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3542 const DebugLoc &DL = MI.getDebugLoc();
Scott Linderd19d1972019-02-04 20:00:07 +00003543
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003544 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003545
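    // Rewrite the pseudo into a real SI_CALL that defines the return address
    // register and carries over all of the original call operands.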
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003546 MachineInstrBuilder MIB;
Scott Linderd19d1972019-02-04 20:00:07 +00003547 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
Matt Arsenault71bcbd42017-08-11 20:42:08 +00003548
Scott Linderd19d1972019-02-04 20:00:07 +00003549 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003550 MIB.add(MI.getOperand(I));
Matt Arsenault6ed7b9b2017-08-02 01:31:28 +00003551
Chandler Carruthc73c0302018-08-16 21:30:05 +00003552 MIB.cloneMemRefs(MI);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00003553 MI.eraseFromParent();
3554 return BB;
3555 }
Stanislav Mekhanoshin64399da2019-05-02 04:26:35 +00003556 case AMDGPU::V_ADD_I32_e32:
3557 case AMDGPU::V_SUB_I32_e32:
3558 case AMDGPU::V_SUBREV_I32_e32: {
3559 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3560 const DebugLoc &DL = MI.getDebugLoc();
3561 unsigned Opc = MI.getOpcode();
3562
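    // If the e32 form cannot be encoded on this subtarget, fall back to the
    // VOP3 (e64) form, which takes an explicit VCC carry-out def and a clamp
    // operand.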
3563 bool NeedClampOperand = false;
3564 if (TII->pseudoToMCOpcode(Opc) == -1) {
3565 Opc = AMDGPU::getVOPe64(Opc);
3566 NeedClampOperand = true;
3567 }
3568
3569 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
3570 if (TII->isVOP3(*I)) {
3571 I.addReg(AMDGPU::VCC, RegState::Define);
3572 }
3573 I.add(MI.getOperand(1))
3574 .add(MI.getOperand(2));
3575 if (NeedClampOperand)
3576 I.addImm(0); // clamp bit for e64 encoding
3577
3578 TII->legalizeOperands(*I);
3579
3580 MI.eraseFromParent();
3581 return BB;
3582 }
Changpeng Fang01f60622016-03-15 17:28:44 +00003583 default:
3584 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
Tom Stellard75aadc22012-12-11 21:25:42 +00003585 }
Tom Stellard75aadc22012-12-11 21:25:42 +00003586}
3587
Matt Arsenaulte11d8ac2017-10-13 21:10:22 +00003588bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3589 return isTypeLegal(VT.getScalarType());
3590}
3591
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003592bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3593 // This currently forces unfolding various combinations of fsub into fma with
3594 // free fneg'd operands. As long as we have fast FMA (controlled by
3595 // isFMAFasterThanFMulAndFAdd), we should perform these.
3596
3597 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3598 // most of these combines appear to be cycle neutral but save on instruction
3599 // count / code size.
3600 return true;
3601}
3602
Mehdi Amini44ede332015-07-09 02:09:04 +00003603EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3604 EVT VT) const {
Tom Stellard83747202013-07-18 21:43:53 +00003605 if (!VT.isVector()) {
3606 return MVT::i1;
3607 }
Matt Arsenault8596f712014-11-28 22:51:38 +00003608 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
Tom Stellard75aadc22012-12-11 21:25:42 +00003609}
3610
Matt Arsenault94163282016-12-22 16:36:25 +00003611MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3612 // TODO: Should i16 always be used if legal? For now it would force VALU
3613 // shifts.
3614 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
Christian Konig082a14a2013-03-18 11:34:05 +00003615}
3616
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003617// Answering this is somewhat tricky and depends on the specific device, since
3618// different devices have different rates for fma and for f64 operations.
3619//
3620// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3621// regardless of which device (although the number of cycles differs between
3622// devices), so it is always profitable for f64.
3623//
3624// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3625// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3626// which we can always do even without fused FP ops since it returns the same
3627// result as the separate operations and since it is always full
3628// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3629// however does not support denormals, so we do report fma as faster if we have
3630// a fast fma device and require denormals.
3631//
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003632bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3633 VT = VT.getScalarType();
3634
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003635 switch (VT.getSimpleVT().SimpleTy) {
Matt Arsenault0084adc2018-04-30 19:08:16 +00003636 case MVT::f32: {
Matt Arsenault423bf3f2015-01-29 19:34:32 +00003637 // This is as fast on some subtargets. However, we always have full rate f32
3638 // mad available, which returns the same result as the separate operations
Matt Arsenault8d630032015-02-20 22:10:41 +00003639 // and which we should prefer over fma. We can't use mad if we need to support
3640 // denormals, so only report fma as faster in that case.
Matt Arsenault0084adc2018-04-30 19:08:16 +00003641 if (Subtarget->hasFP32Denormals())
3642 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3643
3644 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3645 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3646 }
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003647 case MVT::f64:
3648 return true;
Matt Arsenault9e22bc22016-12-22 03:21:48 +00003649 case MVT::f16:
3650 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
Niels Ole Salscheiderd3a039f2013-08-10 10:38:54 +00003651 default:
3652 break;
3653 }
3654
3655 return false;
3656}
3657
Tom Stellard75aadc22012-12-11 21:25:42 +00003658//===----------------------------------------------------------------------===//
3659// Custom DAG Lowering Operations
3660//===----------------------------------------------------------------------===//
3661
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003662// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3663// wider vector type is legal.
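// For example, a v4f16 fneg is split into two v2f16 fnegs whose results are
// concatenated back together.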
3664SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3665 SelectionDAG &DAG) const {
3666 unsigned Opc = Op.getOpcode();
3667 EVT VT = Op.getValueType();
3668 assert(VT == MVT::v4f16);
3669
3670 SDValue Lo, Hi;
3671 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3672
3673 SDLoc SL(Op);
3674 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3675 Op->getFlags());
3676 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3677 Op->getFlags());
3678
3679 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3680}
3681
3682// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3683// wider vector type is legal.
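// For example, (add v4i16 a, b) becomes the concatenation of
// (add v2i16 a.lo, b.lo) and (add v2i16 a.hi, b.hi).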
3684SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3685 SelectionDAG &DAG) const {
3686 unsigned Opc = Op.getOpcode();
3687 EVT VT = Op.getValueType();
3688 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3689
3690 SDValue Lo0, Hi0;
3691 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3692 SDValue Lo1, Hi1;
3693 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3694
3695 SDLoc SL(Op);
3696
3697 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3698 Op->getFlags());
3699 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3700 Op->getFlags());
3701
3702 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3703}
3704
Tom Stellard75aadc22012-12-11 21:25:42 +00003705SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3706 switch (Op.getOpcode()) {
3707 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
Tom Stellardf8794352012-12-19 22:10:31 +00003708 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
Aakanksha Patild5443f82019-05-29 18:20:11 +00003709 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
Tom Stellard35bb18c2013-08-26 15:06:04 +00003710 case ISD::LOAD: {
Tom Stellarde812f2f2014-07-21 15:45:06 +00003711 SDValue Result = LowerLOAD(Op, DAG);
3712 assert((!Result.getNode() ||
3713 Result.getNode()->getNumValues() == 2) &&
3714 "Load should return a value and a chain");
3715 return Result;
Tom Stellard35bb18c2013-08-26 15:06:04 +00003716 }
Tom Stellardaf775432013-10-23 00:44:32 +00003717
Matt Arsenaultad14ce82014-07-19 18:44:39 +00003718 case ISD::FSIN:
3719 case ISD::FCOS:
3720 return LowerTrig(Op, DAG);
Tom Stellard0ec134f2014-02-04 17:18:40 +00003721 case ISD::SELECT: return LowerSELECT(Op, DAG);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00003722 case ISD::FDIV: return LowerFDIV(Op, DAG);
Tom Stellard354a43c2016-04-01 18:27:37 +00003723 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
Tom Stellard81d871d2013-11-13 23:36:50 +00003724 case ISD::STORE: return LowerSTORE(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003725 case ISD::GlobalAddress: {
3726 MachineFunction &MF = DAG.getMachineFunction();
3727 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3728 return LowerGlobalAddress(MFI, Op, DAG);
Tom Stellard94593ee2013-06-03 17:40:18 +00003729 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003730 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00003731 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00003732 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00003733 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
Matt Arsenault3aef8092017-01-23 23:09:58 +00003734 case ISD::INSERT_VECTOR_ELT:
3735 return lowerINSERT_VECTOR_ELT(Op, DAG);
3736 case ISD::EXTRACT_VECTOR_ELT:
3737 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
Matt Arsenault67a98152018-05-16 11:47:30 +00003738 case ISD::BUILD_VECTOR:
3739 return lowerBUILD_VECTOR(Op, DAG);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00003740 case ISD::FP_ROUND:
3741 return lowerFP_ROUND(Op, DAG);
Matt Arsenault3e025382017-04-24 17:49:13 +00003742 case ISD::TRAP:
Matt Arsenault3e025382017-04-24 17:49:13 +00003743 return lowerTRAP(Op, DAG);
Tony Tye43259df2018-05-16 16:19:34 +00003744 case ISD::DEBUGTRAP:
3745 return lowerDEBUGTRAP(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003746 case ISD::FABS:
3747 case ISD::FNEG:
Matt Arsenault36cdcfa2018-08-02 13:43:42 +00003748 case ISD::FCANONICALIZE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003749 return splitUnaryVectorOp(Op, DAG);
Matt Arsenault687ec752018-10-22 16:27:27 +00003750 case ISD::FMINNUM:
3751 case ISD::FMAXNUM:
3752 return lowerFMINNUM_FMAXNUM(Op, DAG);
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003753 case ISD::SHL:
3754 case ISD::SRA:
3755 case ISD::SRL:
3756 case ISD::ADD:
3757 case ISD::SUB:
3758 case ISD::MUL:
3759 case ISD::SMIN:
3760 case ISD::SMAX:
3761 case ISD::UMIN:
3762 case ISD::UMAX:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003763 case ISD::FADD:
3764 case ISD::FMUL:
Matt Arsenault687ec752018-10-22 16:27:27 +00003765 case ISD::FMINNUM_IEEE:
3766 case ISD::FMAXNUM_IEEE:
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003767 return splitBinaryVectorOp(Op, DAG);
Tom Stellard75aadc22012-12-11 21:25:42 +00003768 }
3769 return SDValue();
3770}
3771
Matt Arsenault1349a042018-05-22 06:32:10 +00003772static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3773 const SDLoc &DL,
3774 SelectionDAG &DAG, bool Unpacked) {
3775 if (!LoadVT.isVector())
3776 return Result;
3777
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003778 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3779 // Truncate to v2i16/v4i16.
3780 EVT IntLoadVT = LoadVT.changeTypeToInteger();
Matt Arsenault1349a042018-05-22 06:32:10 +00003781
3782 // Workaround legalizer not scalarizing truncate after vector op
3783 // legalization byt not creating intermediate vector trunc.
3784 SmallVector<SDValue, 4> Elts;
3785 DAG.ExtractVectorElements(Result, Elts);
3786 for (SDValue &Elt : Elts)
3787 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3788
3789 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3790
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003791 // Bitcast to original type (v2f16/v4f16).
Matt Arsenault1349a042018-05-22 06:32:10 +00003792 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003793 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003794
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003795 // Cast back to the original packed type.
3796 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3797}
3798
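// On subtargets with unpacked D16 VMEM instructions, each 16-bit element of
// the result occupies its own 32-bit register, so the memory node is emitted
// with an i32-per-element type and the result is repacked to v2f16/v4f16
// afterwards.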
Matt Arsenault1349a042018-05-22 06:32:10 +00003799SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3800 MemSDNode *M,
3801 SelectionDAG &DAG,
Tim Renouf366a49d2018-08-02 23:33:01 +00003802 ArrayRef<SDValue> Ops,
Matt Arsenault1349a042018-05-22 06:32:10 +00003803 bool IsIntrinsic) const {
3804 SDLoc DL(M);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003805
3806 bool Unpacked = Subtarget->hasUnpackedD16VMem();
Matt Arsenault1349a042018-05-22 06:32:10 +00003807 EVT LoadVT = M->getValueType(0);
3808
Matt Arsenault1349a042018-05-22 06:32:10 +00003809 EVT EquivLoadVT = LoadVT;
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003810 if (Unpacked && LoadVT.isVector()) {
3811 EquivLoadVT = LoadVT.isVector() ?
3812 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3813 LoadVT.getVectorNumElements()) : LoadVT;
Matt Arsenault1349a042018-05-22 06:32:10 +00003814 }
3815
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003816 // Change from v4f16/v2f16 to EquivLoadVT.
3817 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3818
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003819 SDValue Load
3820 = DAG.getMemIntrinsicNode(
3821 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3822 VTList, Ops, M->getMemoryVT(),
3823 M->getMemOperand());
3824 if (!Unpacked) // Just adjusted the opcode.
3825 return Load;
Changpeng Fang4737e892018-01-18 22:08:53 +00003826
Matt Arsenault1349a042018-05-22 06:32:10 +00003827 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
Changpeng Fang4737e892018-01-18 22:08:53 +00003828
Matt Arsenault1349a042018-05-22 06:32:10 +00003829 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003830}
3831
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003832static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3833 SDNode *N, SelectionDAG &DAG) {
3834 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003835 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003836 int CondCode = CD->getSExtValue();
3837 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3838 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3839 return DAG.getUNDEF(VT);
3840
3841 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3842
3843
3844 SDValue LHS = N->getOperand(1);
3845 SDValue RHS = N->getOperand(2);
3846
3847 SDLoc DL(N);
3848
3849 EVT CmpVT = LHS.getValueType();
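  // If i16 is not legal on this subtarget, widen the compare operands to i32,
  // sign or zero extending to match the signedness of the predicate.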
3850 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3851 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3852 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3853 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3854 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3855 }
3856
3857 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3858
3859 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3860 DAG.getCondCode(CCOpcode));
3861}
3862
3863static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3864 SDNode *N, SelectionDAG &DAG) {
3865 EVT VT = N->getValueType(0);
Matt Arsenaultcaf13162019-03-12 21:02:54 +00003866 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00003867
3868 int CondCode = CD->getSExtValue();
3869 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3870 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3871 return DAG.getUNDEF(VT);
3872 }
3873
3874 SDValue Src0 = N->getOperand(1);
3875 SDValue Src1 = N->getOperand(2);
3876 EVT CmpVT = Src0.getValueType();
3877 SDLoc SL(N);
3878
3879 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3880 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3881 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3882 }
3883
3884 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3885 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3886 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3887 Src1, DAG.getCondCode(CCOpcode));
3888}
3889
Matt Arsenault3aef8092017-01-23 23:09:58 +00003890void SITargetLowering::ReplaceNodeResults(SDNode *N,
3891 SmallVectorImpl<SDValue> &Results,
3892 SelectionDAG &DAG) const {
3893 switch (N->getOpcode()) {
3894 case ISD::INSERT_VECTOR_ELT: {
3895 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3896 Results.push_back(Res);
3897 return;
3898 }
3899 case ISD::EXTRACT_VECTOR_ELT: {
3900 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3901 Results.push_back(Res);
3902 return;
3903 }
Matt Arsenault1f17c662017-02-22 00:27:34 +00003904 case ISD::INTRINSIC_WO_CHAIN: {
3905 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Marek Olsak13e47412018-01-31 20:18:04 +00003906 switch (IID) {
3907 case Intrinsic::amdgcn_cvt_pkrtz: {
Matt Arsenault1f17c662017-02-22 00:27:34 +00003908 SDValue Src0 = N->getOperand(1);
3909 SDValue Src1 = N->getOperand(2);
3910 SDLoc SL(N);
3911 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3912 Src0, Src1);
Matt Arsenault1f17c662017-02-22 00:27:34 +00003913 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3914 return;
3915 }
Marek Olsak13e47412018-01-31 20:18:04 +00003916 case Intrinsic::amdgcn_cvt_pknorm_i16:
3917 case Intrinsic::amdgcn_cvt_pknorm_u16:
3918 case Intrinsic::amdgcn_cvt_pk_i16:
3919 case Intrinsic::amdgcn_cvt_pk_u16: {
3920 SDValue Src0 = N->getOperand(1);
3921 SDValue Src1 = N->getOperand(2);
3922 SDLoc SL(N);
3923 unsigned Opcode;
3924
3925 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3926 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3927 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3928 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3929 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3930 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3931 else
3932 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3933
Matt Arsenault709374d2018-08-01 20:13:58 +00003934 EVT VT = N->getValueType(0);
3935 if (isTypeLegal(VT))
3936 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3937 else {
3938 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3939 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3940 }
Marek Olsak13e47412018-01-31 20:18:04 +00003941 return;
3942 }
3943 }
Simon Pilgrimd362d272017-07-08 19:50:03 +00003944 break;
Matt Arsenault1f17c662017-02-22 00:27:34 +00003945 }
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003946 case ISD::INTRINSIC_W_CHAIN: {
Matt Arsenault1349a042018-05-22 06:32:10 +00003947 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003948 Results.push_back(Res);
Matt Arsenault1349a042018-05-22 06:32:10 +00003949 Results.push_back(Res.getValue(1));
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003950 return;
3951 }
Matt Arsenault1349a042018-05-22 06:32:10 +00003952
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00003953 break;
3954 }
Matt Arsenault4a486232017-04-19 20:53:07 +00003955 case ISD::SELECT: {
3956 SDLoc SL(N);
3957 EVT VT = N->getValueType(0);
3958 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3959 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3960 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3961
3962 EVT SelectVT = NewVT;
3963 if (NewVT.bitsLT(MVT::i32)) {
3964 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3965 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3966 SelectVT = MVT::i32;
3967 }
3968
3969 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3970 N->getOperand(0), LHS, RHS);
3971
3972 if (NewVT != SelectVT)
3973 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3974 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3975 return;
3976 }
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003977 case ISD::FNEG: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003978 if (N->getValueType(0) != MVT::v2f16)
3979 break;
3980
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003981 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003982 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3983
3984 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3985 BC,
3986 DAG.getConstant(0x80008000, SL, MVT::i32));
3987 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3988 return;
3989 }
3990 case ISD::FABS: {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00003991 if (N->getValueType(0) != MVT::v2f16)
3992 break;
3993
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003994 SDLoc SL(N);
Matt Arsenaulte9524f12018-06-06 21:28:11 +00003995 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3996
3997 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3998 BC,
3999 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
4000 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
4001 return;
4002 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004003 default:
4004 break;
4005 }
4006}
4007
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00004008/// Helper function for LowerBRCOND
Tom Stellardf8794352012-12-19 22:10:31 +00004009static SDNode *findUser(SDValue Value, unsigned Opcode) {
Tom Stellard75aadc22012-12-11 21:25:42 +00004010
Tom Stellardf8794352012-12-19 22:10:31 +00004011 SDNode *Parent = Value.getNode();
4012 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
4013 I != E; ++I) {
4014
4015 if (I.getUse().get() != Value)
4016 continue;
4017
4018 if (I->getOpcode() == Opcode)
4019 return *I;
4020 }
Craig Topper062a2ba2014-04-25 05:30:21 +00004021 return nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004022}
4023
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004024unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Matt Arsenault6408c912016-09-16 22:11:18 +00004025 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
4026 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004027 case Intrinsic::amdgcn_if:
4028 return AMDGPUISD::IF;
4029 case Intrinsic::amdgcn_else:
4030 return AMDGPUISD::ELSE;
4031 case Intrinsic::amdgcn_loop:
4032 return AMDGPUISD::LOOP;
4033 case Intrinsic::amdgcn_end_cf:
4034 llvm_unreachable("should not occur");
Matt Arsenault6408c912016-09-16 22:11:18 +00004035 default:
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004036 return 0;
Matt Arsenault6408c912016-09-16 22:11:18 +00004037 }
Tom Stellardbc4497b2016-02-12 23:45:29 +00004038 }
Matt Arsenault6408c912016-09-16 22:11:18 +00004039
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004040 // break, if_break, else_break are all only used as inputs to loop, not
4041 // directly as branch conditions.
4042 return 0;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004043}
4044
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004045bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
4046 const Triple &TT = getTargetMachine().getTargetTriple();
Matt Arsenault0da63502018-08-31 05:49:54 +00004047 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4048 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004049 AMDGPU::shouldEmitConstantsToTextSection(TT);
4050}
4051
4052bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
Scott Linderd19d1972019-02-04 20:00:07 +00004053 // FIXME: Either avoid relying on address space here or change the default
4054 // address space for functions to avoid the explicit check.
4055 return (GV->getValueType()->isFunctionTy() ||
4056 GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004057 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4058 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004059 !shouldEmitFixup(GV) &&
4060 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4061}
4062
4063bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4064 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4065}
4066
Tom Stellardf8794352012-12-19 22:10:31 +00004067/// This transforms the control flow intrinsics to get the branch destination as
4068/// the last parameter, and also switches the branch target with BR if the need arises.
4069SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4070 SelectionDAG &DAG) const {
Andrew Trickef9de2a2013-05-25 02:42:55 +00004071 SDLoc DL(BRCOND);
Tom Stellardf8794352012-12-19 22:10:31 +00004072
4073 SDNode *Intr = BRCOND.getOperand(1).getNode();
4074 SDValue Target = BRCOND.getOperand(2);
Craig Topper062a2ba2014-04-25 05:30:21 +00004075 SDNode *BR = nullptr;
Tom Stellardbc4497b2016-02-12 23:45:29 +00004076 SDNode *SetCC = nullptr;
Tom Stellardf8794352012-12-19 22:10:31 +00004077
4078 if (Intr->getOpcode() == ISD::SETCC) {
4079 // As long as we negate the condition everything is fine
Tom Stellardbc4497b2016-02-12 23:45:29 +00004080 SetCC = Intr;
Tom Stellardf8794352012-12-19 22:10:31 +00004081 Intr = SetCC->getOperand(0).getNode();
4082
4083 } else {
4084 // Get the target from BR if we don't negate the condition
4085 BR = findUser(BRCOND, ISD::BR);
4086 Target = BR->getOperand(1);
4087 }
4088
Matt Arsenault6408c912016-09-16 22:11:18 +00004089 // FIXME: This changes the types of the intrinsics instead of introducing new
4090 // nodes with the correct types.
4091 // e.g. llvm.amdgcn.loop
4092
4093 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4094 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4095
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004096 unsigned CFNode = isCFIntrinsic(Intr);
4097 if (CFNode == 0) {
Tom Stellardbc4497b2016-02-12 23:45:29 +00004098 // This is a uniform branch so we don't need to legalize.
4099 return BRCOND;
4100 }
4101
Matt Arsenault6408c912016-09-16 22:11:18 +00004102 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4103 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4104
Tom Stellardbc4497b2016-02-12 23:45:29 +00004105 assert(!SetCC ||
4106 (SetCC->getConstantOperandVal(1) == 1 &&
Tom Stellardbc4497b2016-02-12 23:45:29 +00004107 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4108 ISD::SETNE));
Tom Stellardf8794352012-12-19 22:10:31 +00004109
Tom Stellardf8794352012-12-19 22:10:31 +00004110 // operands of the new intrinsic call
4111 SmallVector<SDValue, 4> Ops;
Matt Arsenault6408c912016-09-16 22:11:18 +00004112 if (HaveChain)
4113 Ops.push_back(BRCOND.getOperand(0));
4114
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004115 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Tom Stellardf8794352012-12-19 22:10:31 +00004116 Ops.push_back(Target);
4117
Matt Arsenault6408c912016-09-16 22:11:18 +00004118 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4119
Tom Stellardf8794352012-12-19 22:10:31 +00004120 // build the new intrinsic call
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00004121 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004122
Matt Arsenault6408c912016-09-16 22:11:18 +00004123 if (!HaveChain) {
4124 SDValue Ops[] = {
4125 SDValue(Result, 0),
4126 BRCOND.getOperand(0)
4127 };
4128
4129 Result = DAG.getMergeValues(Ops, DL).getNode();
4130 }
4131
Tom Stellardf8794352012-12-19 22:10:31 +00004132 if (BR) {
4133 // Give the branch instruction our target
4134 SDValue Ops[] = {
4135 BR->getOperand(0),
4136 BRCOND.getOperand(2)
4137 };
Chandler Carruth356665a2014-08-01 22:09:43 +00004138 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4139 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4140 BR = NewBR.getNode();
Tom Stellardf8794352012-12-19 22:10:31 +00004141 }
4142
4143 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4144
4145 // Copy the intrinsic results to registers
4146 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4147 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4148 if (!CopyToReg)
4149 continue;
4150
4151 Chain = DAG.getCopyToReg(
4152 Chain, DL,
4153 CopyToReg->getOperand(1),
4154 SDValue(Result, i - 1),
4155 SDValue());
4156
4157 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4158 }
4159
4160 // Remove the old intrinsic from the chain
4161 DAG.ReplaceAllUsesOfValueWith(
4162 SDValue(Intr, Intr->getNumValues() - 1),
4163 Intr->getOperand(0));
4164
4165 return Chain;
Tom Stellard75aadc22012-12-11 21:25:42 +00004166}
4167
Aakanksha Patild5443f82019-05-29 18:20:11 +00004168SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
4169 SelectionDAG &DAG) const {
4170 MVT VT = Op.getSimpleValueType();
4171 SDLoc DL(Op);
4172 // Checking the depth
4173 if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
4174 return DAG.getConstant(0, DL, VT);
4175
4176 MachineFunction &MF = DAG.getMachineFunction();
4177 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4178 // Check for kernel and shader functions
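  // Entry functions (kernels and shaders) are not called, so there is no
  // meaningful return address to report; just return 0.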
4179 if (Info->isEntryFunction())
4180 return DAG.getConstant(0, DL, VT);
4181
4182 MachineFrameInfo &MFI = MF.getFrameInfo();
4183 // There is a call to @llvm.returnaddress in this function
4184 MFI.setReturnAddressIsTaken(true);
4185
4186 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4187 // Get the return address reg and mark it as an implicit live-in
4188 unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
4189
4190 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
4191}
4192
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00004193SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4194 SDValue Op,
4195 const SDLoc &DL,
4196 EVT VT) const {
4197 return Op.getValueType().bitsLE(VT) ?
4198 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4199 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4200}
4201
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004202SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004203 assert(Op.getValueType() == MVT::f16 &&
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004204 "Do not know how to custom lower FP_ROUND for non-f16 type");
4205
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004206 SDValue Src = Op.getOperand(0);
4207 EVT SrcVT = Src.getValueType();
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004208 if (SrcVT != MVT::f64)
4209 return Op;
4210
4211 SDLoc DL(Op);
Matt Arsenaultafe614c2016-11-18 18:33:36 +00004212
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004213 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4214 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
Mandeep Singh Grang5e1697e2017-06-06 05:08:36 +00004215 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
Konstantin Zhuravlyovd709efb2016-11-17 04:28:37 +00004216}
4217
Matt Arsenault687ec752018-10-22 16:27:27 +00004218SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4219 SelectionDAG &DAG) const {
4220 EVT VT = Op.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00004221 const MachineFunction &MF = DAG.getMachineFunction();
4222 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4223 bool IsIEEEMode = Info->getMode().IEEE;
Matt Arsenault687ec752018-10-22 16:27:27 +00004224
4225 // FIXME: Assert during eslection that this is only selected for
4226 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4227 // mode functions, but this happens to be OK since it's only done in cases
4228 // where it is known that there are no sNaNs.
4229 if (IsIEEEMode)
4230 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4231
4232 if (VT == MVT::v4f16)
4233 return splitBinaryVectorOp(Op, DAG);
4234 return Op;
4235}
4236
Matt Arsenault3e025382017-04-24 17:49:13 +00004237SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4238 SDLoc SL(Op);
Matt Arsenault3e025382017-04-24 17:49:13 +00004239 SDValue Chain = Op.getOperand(0);
4240
Tom Stellard5bfbae52018-07-11 20:59:01 +00004241 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004242 !Subtarget->isTrapHandlerEnabled())
Matt Arsenault3e025382017-04-24 17:49:13 +00004243 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
Tony Tye43259df2018-05-16 16:19:34 +00004244
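  // The HSA trap handler expects the queue pointer in SGPR0/SGPR1, so copy it
  // there from the queue-pointer user SGPR before emitting the trap.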
4245 MachineFunction &MF = DAG.getMachineFunction();
4246 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4247 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4248 assert(UserSGPR != AMDGPU::NoRegister);
4249 SDValue QueuePtr = CreateLiveInRegister(
4250 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4251 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4252 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4253 QueuePtr, SDValue());
4254 SDValue Ops[] = {
4255 ToReg,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004256 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
Tony Tye43259df2018-05-16 16:19:34 +00004257 SGPR01,
4258 ToReg.getValue(1)
4259 };
4260 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4261}
4262
4263SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4264 SDLoc SL(Op);
4265 SDValue Chain = Op.getOperand(0);
4266 MachineFunction &MF = DAG.getMachineFunction();
4267
Tom Stellard5bfbae52018-07-11 20:59:01 +00004268 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
Tony Tye43259df2018-05-16 16:19:34 +00004269 !Subtarget->isTrapHandlerEnabled()) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004270 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
Matt Arsenault3e025382017-04-24 17:49:13 +00004271 "debugtrap handler not supported",
4272 Op.getDebugLoc(),
4273 DS_Warning);
Matthias Braunf1caa282017-12-15 22:22:58 +00004274 LLVMContext &Ctx = MF.getFunction().getContext();
Matt Arsenault3e025382017-04-24 17:49:13 +00004275 Ctx.diagnose(NoTrap);
4276 return Chain;
4277 }
Matt Arsenault3e025382017-04-24 17:49:13 +00004278
Tony Tye43259df2018-05-16 16:19:34 +00004279 SDValue Ops[] = {
4280 Chain,
Tom Stellard5bfbae52018-07-11 20:59:01 +00004281 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
Tony Tye43259df2018-05-16 16:19:34 +00004282 };
4283 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
Matt Arsenault3e025382017-04-24 17:49:13 +00004284}
4285
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004286SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
Matt Arsenault99c14522016-04-25 19:27:24 +00004287 SelectionDAG &DAG) const {
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004288 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4289 if (Subtarget->hasApertureRegs()) {
Matt Arsenault0da63502018-08-31 05:49:54 +00004290 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004291 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4292 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
Matt Arsenault0da63502018-08-31 05:49:54 +00004293 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004294 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4295 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4296 unsigned Encoding =
4297 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4298 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4299 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Matt Arsenaulte823d922017-02-18 18:29:53 +00004300
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004301 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4302 SDValue ApertureReg = SDValue(
4303 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4304 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4305 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
Matt Arsenaulte823d922017-02-18 18:29:53 +00004306 }
4307
Matt Arsenault99c14522016-04-25 19:27:24 +00004308 MachineFunction &MF = DAG.getMachineFunction();
4309 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004310 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4311 assert(UserSGPR != AMDGPU::NoRegister);
4312
Matt Arsenault99c14522016-04-25 19:27:24 +00004313 SDValue QueuePtr = CreateLiveInRegister(
Matt Arsenault3b2e2a52016-06-06 20:03:31 +00004314 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
Matt Arsenault99c14522016-04-25 19:27:24 +00004315
4316 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4317 // private_segment_aperture_base_hi.
Matt Arsenault0da63502018-08-31 05:49:54 +00004318 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
Matt Arsenault99c14522016-04-25 19:27:24 +00004319
Matt Arsenaultb655fa92017-11-29 01:25:12 +00004320 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
Matt Arsenault99c14522016-04-25 19:27:24 +00004321
4322 // TODO: Use custom target PseudoSourceValue.
4323 // TODO: We should use the value from the IR intrinsic call, but it might not
4324 // be available, and how would we get it?
4325 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
Matt Arsenault0da63502018-08-31 05:49:54 +00004326 AMDGPUAS::CONSTANT_ADDRESS));
Matt Arsenault99c14522016-04-25 19:27:24 +00004327
4328 MachinePointerInfo PtrInfo(V, StructOffset);
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004329 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
Justin Lebar9c375812016-07-15 18:27:10 +00004330 MinAlign(64, StructOffset),
Justin Lebaradbf09e2016-09-11 01:38:58 +00004331 MachineMemOperand::MODereferenceable |
4332 MachineMemOperand::MOInvariant);
Matt Arsenault99c14522016-04-25 19:27:24 +00004333}
4334
4335SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4336 SelectionDAG &DAG) const {
4337 SDLoc SL(Op);
4338 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4339
4340 SDValue Src = ASC->getOperand(0);
Matt Arsenault99c14522016-04-25 19:27:24 +00004341 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4342
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004343 const AMDGPUTargetMachine &TM =
4344 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4345
Matt Arsenault99c14522016-04-25 19:27:24 +00004346 // flat -> local/private
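  // A non-null flat pointer is cast by truncating it to the 32-bit segment
  // offset; a flat null pointer maps to the segment's null value.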
Matt Arsenault0da63502018-08-31 05:49:54 +00004347 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004348 unsigned DestAS = ASC->getDestAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004349
Matt Arsenault0da63502018-08-31 05:49:54 +00004350 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4351 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004352 unsigned NullVal = TM.getNullPointerValue(DestAS);
4353 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault99c14522016-04-25 19:27:24 +00004354 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4355 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4356
4357 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4358 NonNull, Ptr, SegmentNullPtr);
4359 }
4360 }
4361
4362 // local/private -> flat
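  // A non-null 32-bit segment offset is widened to 64 bits by pairing it with
  // the segment's aperture base; the segment null value maps back to flat null.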
Matt Arsenault0da63502018-08-31 05:49:54 +00004363 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenault971c85e2017-03-13 19:47:31 +00004364 unsigned SrcAS = ASC->getSrcAddressSpace();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00004365
Matt Arsenault0da63502018-08-31 05:49:54 +00004366 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4367 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenault747bf8a2017-03-13 20:18:14 +00004368 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4369 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
Matt Arsenault971c85e2017-03-13 19:47:31 +00004370
Matt Arsenault99c14522016-04-25 19:27:24 +00004371 SDValue NonNull
4372 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4373
Konstantin Zhuravlyov4b3847e2017-04-06 23:02:33 +00004374 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
Matt Arsenault99c14522016-04-25 19:27:24 +00004375 SDValue CvtPtr
4376 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4377
4378 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4379 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4380 FlatNullPtr);
4381 }
4382 }
4383
4384 // global <-> flat are no-ops and never emitted.
4385
4386 const MachineFunction &MF = DAG.getMachineFunction();
4387 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Matthias Braunf1caa282017-12-15 22:22:58 +00004388 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
Matt Arsenault99c14522016-04-25 19:27:24 +00004389 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4390
4391 return DAG.getUNDEF(ASC->getValueType(0));
4392}
4393
Matt Arsenault3aef8092017-01-23 23:09:58 +00004394SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4395 SelectionDAG &DAG) const {
Matt Arsenault67a98152018-05-16 11:47:30 +00004396 SDValue Vec = Op.getOperand(0);
4397 SDValue InsVal = Op.getOperand(1);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004398 SDValue Idx = Op.getOperand(2);
Matt Arsenault67a98152018-05-16 11:47:30 +00004399 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004400 EVT EltVT = VecVT.getVectorElementType();
4401 unsigned VecSize = VecVT.getSizeInBits();
4402 unsigned EltSize = EltVT.getSizeInBits();
Matt Arsenault67a98152018-05-16 11:47:30 +00004403
Matt Arsenault9224c002018-06-05 19:52:46 +00004404
4405 assert(VecSize <= 64);
Matt Arsenault67a98152018-05-16 11:47:30 +00004406
4407 unsigned NumElts = VecVT.getVectorNumElements();
4408 SDLoc SL(Op);
4409 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4410
Matt Arsenault9224c002018-06-05 19:52:46 +00004411 if (NumElts == 4 && EltSize == 16 && KIdx) {
Matt Arsenault67a98152018-05-16 11:47:30 +00004412 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4413
4414 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4415 DAG.getConstant(0, SL, MVT::i32));
4416 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4417 DAG.getConstant(1, SL, MVT::i32));
4418
4419 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4420 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4421
4422 unsigned Idx = KIdx->getZExtValue();
4423 bool InsertLo = Idx < 2;
4424 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4425 InsertLo ? LoVec : HiVec,
4426 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4427 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4428
4429 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4430
4431 SDValue Concat = InsertLo ?
4432 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4433 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4434
4435 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4436 }
4437
Matt Arsenault3aef8092017-01-23 23:09:58 +00004438 if (isa<ConstantSDNode>(Idx))
4439 return SDValue();
4440
Matt Arsenault9224c002018-06-05 19:52:46 +00004441 MVT IntVT = MVT::getIntegerVT(VecSize);
Matt Arsenault67a98152018-05-16 11:47:30 +00004442
Matt Arsenault3aef8092017-01-23 23:09:58 +00004443 // Avoid stack access for dynamic indexing.
Matt Arsenault3aef8092017-01-23 23:09:58 +00004444 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
Tim Corringhamfa3e4e52019-02-01 16:51:09 +00004445
4446 // Create a congruent vector with the target value in each element so that
4447 // the required element can be masked and ORed into the target vector.
4448 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4449 DAG.getSplatBuildVector(VecVT, SL, InsVal));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004450
Matt Arsenault9224c002018-06-05 19:52:46 +00004451 assert(isPowerOf2_32(EltSize));
4452 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4453
Matt Arsenault3aef8092017-01-23 23:09:58 +00004454 // Convert vector index to bit-index.
Matt Arsenault9224c002018-06-05 19:52:46 +00004455 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004456
Matt Arsenault67a98152018-05-16 11:47:30 +00004457 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4458 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4459 DAG.getConstant(0xffff, SL, IntVT),
Matt Arsenault3aef8092017-01-23 23:09:58 +00004460 ScaledIdx);
4461
Matt Arsenault67a98152018-05-16 11:47:30 +00004462 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4463 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4464 DAG.getNOT(SL, BFM, IntVT), BCVec);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004465
Matt Arsenault67a98152018-05-16 11:47:30 +00004466 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4467 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004468}
4469
4470SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4471 SelectionDAG &DAG) const {
4472 SDLoc SL(Op);
4473
4474 EVT ResultVT = Op.getValueType();
4475 SDValue Vec = Op.getOperand(0);
4476 SDValue Idx = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004477 EVT VecVT = Vec.getValueType();
Matt Arsenault9224c002018-06-05 19:52:46 +00004478 unsigned VecSize = VecVT.getSizeInBits();
4479 EVT EltVT = VecVT.getVectorElementType();
4480 assert(VecSize <= 64);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004481
Matt Arsenault98f29462017-05-17 20:30:58 +00004482 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4483
Hiroshi Inoue372ffa12018-04-13 11:37:06 +00004484 // Make sure we do any optimizations that will make it easier to fold
Matt Arsenault98f29462017-05-17 20:30:58 +00004485 // source modifiers before obscuring it with bit operations.
4486
4487 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4488 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4489 return Combined;
4490
Matt Arsenault9224c002018-06-05 19:52:46 +00004491 unsigned EltSize = EltVT.getSizeInBits();
4492 assert(isPowerOf2_32(EltSize));
Matt Arsenault3aef8092017-01-23 23:09:58 +00004493
Matt Arsenault9224c002018-06-05 19:52:46 +00004494 MVT IntVT = MVT::getIntegerVT(VecSize);
4495 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4496
4497 // Convert vector index to bit-index (* EltSize)
4498 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004499
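  // Treat the vector as one wide integer, shift the selected element down to
  // the low bits, then truncate / bitcast to the result type.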
Matt Arsenault67a98152018-05-16 11:47:30 +00004500 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4501 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004502
Matt Arsenault67a98152018-05-16 11:47:30 +00004503 if (ResultVT == MVT::f16) {
4504 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4505 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4506 }
Matt Arsenault3aef8092017-01-23 23:09:58 +00004507
Matt Arsenault67a98152018-05-16 11:47:30 +00004508 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4509}
4510
4511SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4512 SelectionDAG &DAG) const {
4513 SDLoc SL(Op);
4514 EVT VT = Op.getValueType();
Matt Arsenault67a98152018-05-16 11:47:30 +00004515
Matt Arsenault02dc7e12018-06-15 15:15:46 +00004516 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4517 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4518
4519 // Turn into pair of packed build_vectors.
4520 // TODO: Special case for constants that can be materialized with s_mov_b64.
4521 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4522 { Op.getOperand(0), Op.getOperand(1) });
4523 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4524 { Op.getOperand(2), Op.getOperand(3) });
4525
4526 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4527 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4528
4529 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4530 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4531 }
4532
Matt Arsenault1349a042018-05-22 06:32:10 +00004533 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004534 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
Matt Arsenault67a98152018-05-16 11:47:30 +00004535
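  // Without packed (VOP3P) instructions, build the v2x16 value manually: zero
  // extend the halves to i32, shift the high half left by 16 and OR them
  // together, taking shortcuts when either half is undef.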
Matt Arsenault1349a042018-05-22 06:32:10 +00004536 SDValue Lo = Op.getOperand(0);
4537 SDValue Hi = Op.getOperand(1);
Matt Arsenault67a98152018-05-16 11:47:30 +00004538
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004539 // Avoid adding defined bits with the zero_extend.
4540 if (Hi.isUndef()) {
4541 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4542 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4543 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4544 }
Matt Arsenault67a98152018-05-16 11:47:30 +00004545
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004546 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004547 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4548
4549 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4550 DAG.getConstant(16, SL, MVT::i32));
Matt Arsenault3ead7d72018-08-12 08:42:46 +00004551 if (Lo.isUndef())
4552 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4553
4554 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4555 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Matt Arsenault1349a042018-05-22 06:32:10 +00004556
4557 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
Matt Arsenault1349a042018-05-22 06:32:10 +00004558 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
Matt Arsenault3aef8092017-01-23 23:09:58 +00004559}
4560
Tom Stellard418beb72016-07-13 14:23:33 +00004561bool
4562SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4563 // We can fold offsets for anything that doesn't require a GOT relocation.
Matt Arsenault0da63502018-08-31 05:49:54 +00004564 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4565 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004567 !shouldEmitGOTReloc(GA->getGlobal());
Tom Stellard418beb72016-07-13 14:23:33 +00004568}
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004569
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004570static SDValue
4571buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4572 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4573 unsigned GAFlags = SIInstrInfo::MO_NONE) {
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004574 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4575 // lowered to the following code sequence:
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004576 //
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004577 // For constant address space:
4578 // s_getpc_b64 s[0:1]
4579 // s_add_u32 s0, s0, $symbol
4580 // s_addc_u32 s1, s1, 0
4581 //
4582 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4583 // a fixup or relocation is emitted to replace $symbol with a literal
4584 // constant, which is a pc-relative offset from the encoding of the $symbol
4585 // operand to the global variable.
4586 //
4587 // For global address space:
4588 // s_getpc_b64 s[0:1]
4589 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4590 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4591 //
4592 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4593 // fixups or relocations are emitted to replace $symbol@*@lo and
4594 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4595 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4596 // operand to the global variable.
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004597 //
4598 // What we want here is an offset from the value returned by s_getpc
4599 // (which is the address of the s_add_u32 instruction) to the global
4600 // variable, but since the encoding of $symbol starts 4 bytes after the start
4601 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4602 // small. This requires us to add 4 to the global variable offset in order to
4603 // compute the correct address.
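  //
  // For example, if the global variable lies N bytes after the s_add_u32, the
  // fixup value written over $symbol is measured from the literal's own
  // encoding and therefore resolves to N - 4; passing Offset + 4 below makes
  // the final result the desired N.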
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004604 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4605 GAFlags);
4606 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4607 GAFlags == SIInstrInfo::MO_NONE ?
4608 GAFlags : GAFlags + 1);
4609 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
Tom Stellardbf3e6e52016-06-14 20:29:59 +00004610}
4611
Tom Stellard418beb72016-07-13 14:23:33 +00004612SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4613 SDValue Op,
4614 SelectionDAG &DAG) const {
4615 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
Matt Arsenaultb62a4eb2017-08-01 19:54:18 +00004616 const GlobalValue *GV = GSD->getGlobal();
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004617 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
Tom Stellard418beb72016-07-13 14:23:33 +00004620 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4621
4622 SDLoc DL(GSD);
Tom Stellard418beb72016-07-13 14:23:33 +00004623 EVT PtrVT = Op.getValueType();
4624
Matt Arsenaultd1f45712018-09-10 12:16:11 +00004625 // FIXME: Should not make address space based decisions here.
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004626 if (shouldEmitFixup(GV))
Tom Stellard418beb72016-07-13 14:23:33 +00004627 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
Konstantin Zhuravlyov08326b62016-10-20 18:12:38 +00004628 else if (shouldEmitPCReloc(GV))
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004629 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4630 SIInstrInfo::MO_REL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004631
4632 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
Konstantin Zhuravlyovc96b5d72016-10-14 04:37:34 +00004633 SIInstrInfo::MO_GOTPCREL32);
Tom Stellard418beb72016-07-13 14:23:33 +00004634
4635 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
Matt Arsenault0da63502018-08-31 05:49:54 +00004636 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
Tom Stellard418beb72016-07-13 14:23:33 +00004637 const DataLayout &DataLayout = DAG.getDataLayout();
4638 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
Matt Arsenaultd77fcc22018-09-10 02:23:39 +00004639 MachinePointerInfo PtrInfo
4640 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
Tom Stellard418beb72016-07-13 14:23:33 +00004641
Justin Lebar9c375812016-07-15 18:27:10 +00004642 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
Justin Lebaradbf09e2016-09-11 01:38:58 +00004643 MachineMemOperand::MODereferenceable |
4644 MachineMemOperand::MOInvariant);
Tom Stellard418beb72016-07-13 14:23:33 +00004645}
4646
Benjamin Kramerbdc49562016-06-12 15:39:02 +00004647SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4648 const SDLoc &DL, SDValue V) const {
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004649 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4650 // the destination register.
4651 //
Tom Stellardfc92e772015-05-12 14:18:14 +00004652 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4653 // so we will end up with redundant moves to m0.
4654 //
Matt Arsenault4ac341c2016-04-14 21:58:15 +00004655 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4656
4657 // A Null SDValue creates a glue result.
4658 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4659 V, Chain);
4660 return SDValue(M0, 0);
Tom Stellardfc92e772015-05-12 14:18:14 +00004661}
4662
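// Load a 32-bit kernel argument that is known to be zero-extended from a
// narrower type (e.g. the 16-bit local size values) and wrap it in an
// AssertZext so later combines can rely on the upper bits being zero.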
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004663SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4664 SDValue Op,
4665 MVT VT,
4666 unsigned Offset) const {
4667 SDLoc SL(Op);
Matt Arsenaulte622dc32017-04-11 22:29:24 +00004668 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
Matt Arsenault7b4826e2018-05-30 16:17:51 +00004669 DAG.getEntryNode(), Offset, 4, false);
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00004670 // The local size values will have the hi 16-bits as zero.
4671 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4672 DAG.getValueType(VT));
4673}
4674
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004675static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4676 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004677 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004678 "non-hsa intrinsic with hsa target",
4679 DL.getDebugLoc());
4680 DAG.getContext()->diagnose(BadIntrin);
4681 return DAG.getUNDEF(VT);
4682}
4683
Benjamin Kramer061f4a52017-01-13 14:39:03 +00004684static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4685 EVT VT) {
Matthias Braunf1caa282017-12-15 22:22:58 +00004686 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00004687 "intrinsic not supported on subtarget",
4688 DL.getDebugLoc());
Matt Arsenaulte0132462016-01-30 05:19:45 +00004689 DAG.getContext()->diagnose(BadIntrin);
4690 return DAG.getUNDEF(VT);
4691}
4692
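// Pack a list of operands into a single vector value made of f32 lanes.
// Non-f32 elements are bitcast to f32, and the result is padded with undef
// lanes up to the next supported width (1, 2, 4, 8 or 16 dwords).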
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004693static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4694 ArrayRef<SDValue> Elts) {
4695 assert(!Elts.empty());
4696 MVT Type;
4697 unsigned NumElts;
4698
4699 if (Elts.size() == 1) {
4700 Type = MVT::f32;
4701 NumElts = 1;
4702 } else if (Elts.size() == 2) {
4703 Type = MVT::v2f32;
4704 NumElts = 2;
4705 } else if (Elts.size() <= 4) {
4706 Type = MVT::v4f32;
4707 NumElts = 4;
4708 } else if (Elts.size() <= 8) {
4709 Type = MVT::v8f32;
4710 NumElts = 8;
4711 } else {
4712 assert(Elts.size() <= 16);
4713 Type = MVT::v16f32;
4714 NumElts = 16;
4715 }
4716
4717 SmallVector<SDValue, 16> VecElts(NumElts);
4718 for (unsigned i = 0; i < Elts.size(); ++i) {
4719 SDValue Elt = Elts[i];
4720 if (Elt.getValueType() != MVT::f32)
4721 Elt = DAG.getBitcast(MVT::f32, Elt);
4722 VecElts[i] = Elt;
4723 }
4724 for (unsigned i = Elts.size(); i < NumElts; ++i)
4725 VecElts[i] = DAG.getUNDEF(MVT::f32);
4726
4727 if (NumElts == 1)
4728 return VecElts[0];
4729 return DAG.getBuildVector(Type, DL, VecElts);
4730}
4731
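// Decode the immediate cachepolicy operand of an image intrinsic: glc is
// bit 0, slc is bit 1 and (when requested) dlc is bit 2. Returns true only
// if no unknown bits remain set afterwards.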
4732static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004733 SDValue *GLC, SDValue *SLC, SDValue *DLC) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004734 auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004735
4736 uint64_t Value = CachePolicyConst->getZExtValue();
4737 SDLoc DL(CachePolicy);
4738 if (GLC) {
4739 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4740 Value &= ~(uint64_t)0x1;
4741 }
4742 if (SLC) {
4743 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4744 Value &= ~(uint64_t)0x2;
4745 }
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004746 if (DLC) {
4747 *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
4748 Value &= ~(uint64_t)0x4;
4749 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004750
4751 return Value == 0;
4752}
4753
David Stuttardf77079f2019-01-14 11:55:24 +00004754// Reconstruct the required return value for an image load intrinsic.
4755// This is more complicated due to the optional use of TexFailCtrl, which means
4756// the required return type is an aggregate.
4757static SDValue constructRetValue(SelectionDAG &DAG,
4758 MachineSDNode *Result,
4759 ArrayRef<EVT> ResultTypes,
4760 bool IsTexFail, bool Unpacked, bool IsD16,
4761 int DMaskPop, int NumVDataDwords,
4762 const SDLoc &DL, LLVMContext &Context) {
4763 // Determine the required return type. This is the same regardless of the IsTexFail flag.
4764 EVT ReqRetVT = ResultTypes[0];
4765 EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4766 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4767 EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4768 EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4769 : AdjEltVT
4770 : ReqRetVT;
4771
4772 // Extract the data part of the result.
4773 // Bitcast the result to the same type as the required return type.
4774 int NumElts;
4775 if (IsD16 && !Unpacked)
4776 NumElts = NumVDataDwords << 1;
4777 else
4778 NumElts = NumVDataDwords;
4779
4780 EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4781 : AdjEltVT;
4782
Tim Renouf6f0191a2019-03-22 15:21:11 +00004783 // Special case for v6f16. Rather than add support for this, use v3i32 to
David Stuttardf77079f2019-01-14 11:55:24 +00004784 // extract the data elements
Tim Renouf6f0191a2019-03-22 15:21:11 +00004785 bool V6F16Special = false;
4786 if (NumElts == 6) {
4787 CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
David Stuttardf77079f2019-01-14 11:55:24 +00004788 DMaskPop >>= 1;
4789 ReqRetNumElts >>= 1;
Tim Renouf6f0191a2019-03-22 15:21:11 +00004790 V6F16Special = true;
David Stuttardf77079f2019-01-14 11:55:24 +00004791 AdjVT = MVT::v2i32;
4792 }
4793
4794 SDValue N = SDValue(Result, 0);
4795 SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4796
4797 // Iterate over the result
4798 SmallVector<SDValue, 4> BVElts;
4799
4800 if (CastVT.isVector()) {
4801 DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4802 } else {
4803 BVElts.push_back(CastRes);
4804 }
4805 int ExtraElts = ReqRetNumElts - DMaskPop;
4806 while(ExtraElts--)
4807 BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4808
4809 SDValue PreTFCRes;
4810 if (ReqRetNumElts > 1) {
4811 SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4812 if (IsD16 && Unpacked)
4813 PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4814 else
4815 PreTFCRes = NewVec;
4816 } else {
4817 PreTFCRes = BVElts[0];
4818 }
4819
Tim Renouf6f0191a2019-03-22 15:21:11 +00004820 if (V6F16Special)
David Stuttardf77079f2019-01-14 11:55:24 +00004821 PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4822
4823 if (!IsTexFail) {
4824 if (Result->getNumValues() > 1)
4825 return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4826 else
4827 return PreTFCRes;
4828 }
4829
4830 // Extract the TexFail result and insert into aggregate return
4831 SmallVector<SDValue, 1> TFCElt;
4832 DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4833 SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4834 return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4835}
4836
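// Decode the immediate texfailctrl operand: tfe is bit 0 and lwe is bit 1.
// IsTexFail is set if any bit was set; returns true only if no unknown bits
// remain afterwards.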
4837static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4838 SDValue *LWE, bool &IsTexFail) {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004839 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
David Stuttardf77079f2019-01-14 11:55:24 +00004840
4841 uint64_t Value = TexFailCtrlConst->getZExtValue();
4842 if (Value) {
4843 IsTexFail = true;
4844 }
4845
4846 SDLoc DL(TexFailCtrlConst);
4847 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4848 Value &= ~(uint64_t)0x1;
4849 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4850 Value &= ~(uint64_t)0x2;
4851
4852 return Value == 0;
4853}
4854
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004855SDValue SITargetLowering::lowerImage(SDValue Op,
4856 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4857 SelectionDAG &DAG) const {
4858 SDLoc DL(Op);
Ryan Taylor1f334d02018-08-28 15:07:30 +00004859 MachineFunction &MF = DAG.getMachineFunction();
4860 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004861 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4862 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4863 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004864 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4865 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4866 unsigned IntrOpcode = Intr->BaseOpcode;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00004867 bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004868
David Stuttardf77079f2019-01-14 11:55:24 +00004869 SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4870 SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004871 bool IsD16 = false;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004872 bool IsA16 = false;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004873 SDValue VData;
4874 int NumVDataDwords;
David Stuttardf77079f2019-01-14 11:55:24 +00004875 bool AdjustRetType = false;
4876
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004877 unsigned AddrIdx; // Index of first address argument
4878 unsigned DMask;
David Stuttardf77079f2019-01-14 11:55:24 +00004879 unsigned DMaskLanes = 0;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004880
4881 if (BaseOpcode->Atomic) {
4882 VData = Op.getOperand(2);
4883
4884 bool Is64Bit = VData.getValueType() == MVT::i64;
4885 if (BaseOpcode->AtomicX2) {
4886 SDValue VData2 = Op.getOperand(3);
4887 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4888 {VData, VData2});
4889 if (Is64Bit)
4890 VData = DAG.getBitcast(MVT::v4i32, VData);
4891
4892 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4893 DMask = Is64Bit ? 0xf : 0x3;
4894 NumVDataDwords = Is64Bit ? 4 : 2;
4895 AddrIdx = 4;
4896 } else {
4897 DMask = Is64Bit ? 0x3 : 0x1;
4898 NumVDataDwords = Is64Bit ? 2 : 1;
4899 AddrIdx = 3;
4900 }
4901 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004902 unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
Matt Arsenaultcaf13162019-03-12 21:02:54 +00004903 auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
David Stuttardf77079f2019-01-14 11:55:24 +00004904 DMask = DMaskConst->getZExtValue();
4905 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
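    // For example, dmask = 0b1011 requests three components, so DMaskLanes
    // is 3; gather4 always returns four components regardless of the dmask.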
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004906
4907 if (BaseOpcode->Store) {
4908 VData = Op.getOperand(2);
4909
4910 MVT StoreVT = VData.getSimpleValueType();
4911 if (StoreVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004912 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004913 !BaseOpcode->HasD16)
4914 return Op; // D16 is unsupported for this instruction
4915
4916 IsD16 = true;
4917 VData = handleD16VData(VData, DAG);
4918 }
4919
4920 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004921 } else {
David Stuttardf77079f2019-01-14 11:55:24 +00004922 // Work out the number of dwords based on the dmask popcount, the underlying
4923 // type and whether packing is supported.
4924 MVT LoadVT = ResultTypes[0].getSimpleVT();
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004925 if (LoadVT.getScalarType() == MVT::f16) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00004926 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004927 !BaseOpcode->HasD16)
4928 return Op; // D16 is unsupported for this instruction
4929
4930 IsD16 = true;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004931 }
4932
David Stuttardf77079f2019-01-14 11:55:24 +00004933 // Confirm that the return type is large enough for the dmask specified
4934 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4935 (!LoadVT.isVector() && DMaskLanes > 1))
4936 return Op;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004937
David Stuttardf77079f2019-01-14 11:55:24 +00004938 if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4939 NumVDataDwords = (DMaskLanes + 1) / 2;
4940 else
4941 NumVDataDwords = DMaskLanes;
4942
4943 AdjustRetType = true;
4944 }
David Stuttardc6603862018-11-29 20:14:17 +00004945
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004946 AddrIdx = DMaskIdx + 1;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004947 }
4948
Ryan Taylor1f334d02018-08-28 15:07:30 +00004949 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4950 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4951 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4952 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4953 NumCoords + NumLCM;
4954 unsigned NumMIVAddrs = NumVAddrs;
4955
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00004956 SmallVector<SDValue, 4> VAddrs;
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004957
4958 // Optimize _L to _LZ when the constant lod operand is zero (or negative).
4959 if (LZMappingInfo) {
4960 if (auto ConstantLod =
Ryan Taylor1f334d02018-08-28 15:07:30 +00004961 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004962 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4963 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
Ryan Taylor1f334d02018-08-28 15:07:30 +00004964 NumMIVAddrs--; // remove 'lod'
Ryan Taylor894c8fd2018-08-01 12:12:01 +00004965 }
4966 }
4967 }
4968
Ryan Taylor1f334d02018-08-28 15:07:30 +00004969 // Check for 16-bit addresses and pack them if found.
4970 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4971 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
Neil Henning63718b22018-10-31 10:34:48 +00004972 const MVT VAddrScalarVT = VAddrVT.getScalarType();
4973 if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
Ryan Taylor1f334d02018-08-28 15:07:30 +00004974 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4975 IsA16 = true;
Neil Henning63718b22018-10-31 10:34:48 +00004976 const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
Ryan Taylor1f334d02018-08-28 15:07:30 +00004977 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4978 SDValue AddrLo, AddrHi;
4979 // Push back extra arguments.
4980 if (i < DimIdx) {
4981 AddrLo = Op.getOperand(i);
4982 } else {
4983 AddrLo = Op.getOperand(i);
4984 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4985 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4986 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
Matt Arsenault0da63502018-08-31 05:49:54 +00004987 ((NumGradients / 2) % 2 == 1 &&
4988 (i == DimIdx + (NumGradients / 2) - 1 ||
Ryan Taylor1f334d02018-08-28 15:07:30 +00004989 i == DimIdx + NumGradients - 1))) {
4990 AddrHi = DAG.getUNDEF(MVT::f16);
4991 } else {
4992 AddrHi = Op.getOperand(i + 1);
4993 i++;
4994 }
Neil Henning63718b22018-10-31 10:34:48 +00004995 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
Ryan Taylor1f334d02018-08-28 15:07:30 +00004996 {AddrLo, AddrHi});
4997 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4998 }
4999 VAddrs.push_back(AddrLo);
5000 }
5001 } else {
5002 for (unsigned i = 0; i < NumMIVAddrs; ++i)
5003 VAddrs.push_back(Op.getOperand(AddrIdx + i));
5004 }
5005
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005006 // If the register allocator cannot place the address registers contiguously
5007 // without introducing moves, then using the non-sequential address encoding
5008 // is always preferable, since it saves VALU instructions and is usually a
5009 // wash in terms of code size or even better.
5010 //
5011 // However, we currently have no way of hinting to the register allocator that
5012 // MIMG addresses should be placed contiguously when it is possible to do so,
5013 // so force non-NSA for the common 2-address case as a heuristic.
5014 //
5015 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5016 // allocation when possible.
5017 bool UseNSA =
5018 ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
5019 SDValue VAddr;
5020 if (!UseNSA)
5021 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005022
5023 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
5024 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
5025 unsigned CtrlIdx; // Index of texfailctrl argument
5026 SDValue Unorm;
5027 if (!BaseOpcode->Sampler) {
5028 Unorm = True;
5029 CtrlIdx = AddrIdx + NumVAddrs + 1;
5030 } else {
5031 auto UnormConst =
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005032 cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005033
5034 Unorm = UnormConst->getZExtValue() ? True : False;
5035 CtrlIdx = AddrIdx + NumVAddrs + 3;
5036 }
5037
David Stuttardf77079f2019-01-14 11:55:24 +00005038 SDValue TFE;
5039 SDValue LWE;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005040 SDValue TexFail = Op.getOperand(CtrlIdx);
David Stuttardf77079f2019-01-14 11:55:24 +00005041 bool IsTexFail = false;
5042 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005043 return Op;
5044
David Stuttardf77079f2019-01-14 11:55:24 +00005045 if (IsTexFail) {
5046 if (!DMaskLanes) {
5047 // Expecting to get an error flag since TFC is on and dmask is 0.
5048 // Force dmask to be at least 1, otherwise the instruction will fail.
5049 DMask = 0x1;
5050 DMaskLanes = 1;
5051 NumVDataDwords = 1;
5052 }
5053 NumVDataDwords += 1;
5054 AdjustRetType = true;
5055 }
5056
5057 // Something earlier has tagged that the return type needs adjusting.
5058 // This happens if the instruction is a load or has TexFailCtrl flags set.
5059 if (AdjustRetType) {
5060 // NumVDataDwords reflects the true number of dwords required in the return type
5061 if (DMaskLanes == 0 && !BaseOpcode->Store) {
5062 // This is a no-op load. It can be eliminated.
5063 SDValue Undef = DAG.getUNDEF(Op.getValueType());
5064 if (isa<MemSDNode>(Op))
5065 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
5066 return Undef;
5067 }
5068
David Stuttardf77079f2019-01-14 11:55:24 +00005069 EVT NewVT = NumVDataDwords > 1 ?
5070 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
5071 : MVT::f32;
5072
5073 ResultTypes[0] = NewVT;
5074 if (ResultTypes.size() == 3) {
5075 // Original result was aggregate type used for TexFailCtrl results
5076 // The actual instruction returns as a vector type which has now been
5077 // created. Remove the aggregate result.
5078 ResultTypes.erase(&ResultTypes[1]);
5079 }
5080 }
5081
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005082 SDValue GLC;
5083 SDValue SLC;
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005084 SDValue DLC;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005085 if (BaseOpcode->Atomic) {
5086 GLC = True; // TODO no-return optimization
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005087 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
5088 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005089 return Op;
5090 } else {
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005091 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
5092 IsGFX10 ? &DLC : nullptr))
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005093 return Op;
5094 }
5095
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005096 SmallVector<SDValue, 26> Ops;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005097 if (BaseOpcode->Store || BaseOpcode->Atomic)
5098 Ops.push_back(VData); // vdata
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005099 if (UseNSA) {
5100 for (const SDValue &Addr : VAddrs)
5101 Ops.push_back(Addr);
5102 } else {
5103 Ops.push_back(VAddr);
5104 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005105 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
5106 if (BaseOpcode->Sampler)
5107 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
5108 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005109 if (IsGFX10)
5110 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005111 Ops.push_back(Unorm);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005112 if (IsGFX10)
5113 Ops.push_back(DLC);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005114 Ops.push_back(GLC);
5115 Ops.push_back(SLC);
Ryan Taylor1f334d02018-08-28 15:07:30 +00005116 Ops.push_back(IsA16 && // a16 or r128
5117 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
David Stuttardf77079f2019-01-14 11:55:24 +00005118 Ops.push_back(TFE); // tfe
5119 Ops.push_back(LWE); // lwe
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005120 if (!IsGFX10)
5121 Ops.push_back(DimInfo->DA ? True : False);
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005122 if (BaseOpcode->HasD16)
5123 Ops.push_back(IsD16 ? True : False);
5124 if (isa<MemSDNode>(Op))
5125 Ops.push_back(Op.getOperand(0)); // chain
5126
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005127 int NumVAddrDwords =
5128 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005129 int Opcode = -1;
5130
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005131 if (IsGFX10) {
5132 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
5133 UseNSA ? AMDGPU::MIMGEncGfx10NSA
5134 : AMDGPU::MIMGEncGfx10Default,
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005135 NumVDataDwords, NumVAddrDwords);
Stanislav Mekhanoshin692560d2019-05-01 16:32:58 +00005136 } else {
5137 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5138 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
5139 NumVDataDwords, NumVAddrDwords);
5140 if (Opcode == -1)
5141 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
5142 NumVDataDwords, NumVAddrDwords);
5143 }
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005144 assert(Opcode != -1);
5145
5146 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
5147 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
Chandler Carruth66654b72018-08-14 23:30:32 +00005148 MachineMemOperand *MemRef = MemOp->getMemOperand();
5149 DAG.setNodeMemRefs(NewNode, {MemRef});
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005150 }
5151
5152 if (BaseOpcode->AtomicX2) {
5153 SmallVector<SDValue, 1> Elt;
5154 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5155 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
David Stuttardf77079f2019-01-14 11:55:24 +00005156 } else if (!BaseOpcode->Store) {
5157 return constructRetValue(DAG, NewNode,
5158 OrigResultTypes, IsTexFail,
5159 Subtarget->hasUnpackedD16VMem(), IsD16,
5160 DMaskLanes, NumVDataDwords, DL,
5161 *DAG.getContext());
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005162 }
5163
5164 return SDValue(NewNode, 0);
5165}
5166
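// Lower an s_buffer_load-style access. A uniform (non-divergent) offset maps
// directly onto the scalar SBUFFER_LOAD node; a divergent offset is instead
// expanded into one or more MUBUF buffer loads, assuming an unswizzled buffer.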
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005167SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5168 SDValue Offset, SDValue GLC,
5169 SelectionDAG &DAG) const {
5170 MachineFunction &MF = DAG.getMachineFunction();
5171 MachineMemOperand *MMO = MF.getMachineMemOperand(
5172 MachinePointerInfo(),
5173 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5174 MachineMemOperand::MOInvariant,
5175 VT.getStoreSize(), VT.getStoreSize());
5176
5177 if (!Offset->isDivergent()) {
5178 SDValue Ops[] = {
5179 Rsrc,
5180 Offset, // Offset
5181 GLC // glc
5182 };
5183 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5184 DAG.getVTList(VT), Ops, VT, MMO);
5185 }
5186
5187 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5188 // assume that the buffer is unswizzled.
5189 SmallVector<SDValue, 4> Loads;
5190 unsigned NumLoads = 1;
5191 MVT LoadVT = VT.getSimpleVT();
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005192 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
Simon Pilgrim44dfd812018-12-07 21:44:25 +00005193 assert((LoadVT.getScalarType() == MVT::i32 ||
5194 LoadVT.getScalarType() == MVT::f32) &&
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005195 isPowerOf2_32(NumElts));
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005196
Matt Arsenaultce2e0532018-12-07 18:41:39 +00005197 if (NumElts == 8 || NumElts == 16) {
5198 NumLoads = NumElts == 16 ? 4 : 2;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005199 LoadVT = MVT::v4i32;
5200 }
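  // For example, a v8i32 result is assembled from two v4i32 loads at byte
  // offsets 0 and 16 and rejoined with CONCAT_VECTORS below.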
5201
5202 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5203 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5204 SDValue Ops[] = {
5205 DAG.getEntryNode(), // Chain
5206 Rsrc, // rsrc
5207 DAG.getConstant(0, DL, MVT::i32), // vindex
5208 {}, // voffset
5209 {}, // soffset
5210 {}, // offset
5211 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5212 DAG.getConstant(0, DL, MVT::i1), // idxen
5213 };
5214
5215 // Use the alignment to ensure that the required offsets will fit into the
5216 // immediate offset fields.
5217 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5218
5219 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5220 for (unsigned i = 0; i < NumLoads; ++i) {
5221 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5222 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5223 Ops, LoadVT, MMO));
5224 }
5225
5226 if (VT == MVT::v8i32 || VT == MVT::v16i32)
5227 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5228
5229 return Loads[0];
5230}
5231
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005232SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5233 SelectionDAG &DAG) const {
5234 MachineFunction &MF = DAG.getMachineFunction();
Tom Stellarddcb9f092015-07-09 21:20:37 +00005235 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005236
5237 EVT VT = Op.getValueType();
5238 SDLoc DL(Op);
5239 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5240
Sanjay Patela2607012015-09-16 16:31:21 +00005241 // TODO: Should this propagate fast-math-flags?
5242
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005243 switch (IntrinsicID) {
Tom Stellard2f3f9852017-01-25 01:25:13 +00005244 case Intrinsic::amdgcn_implicit_buffer_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005245 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
Matt Arsenault10fc0622017-06-26 03:01:31 +00005246 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005247 return getPreloadedValue(DAG, *MFI, VT,
5248 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
Tom Stellard2f3f9852017-01-25 01:25:13 +00005249 }
Tom Stellard48f29f22015-11-26 00:43:29 +00005250 case Intrinsic::amdgcn_dispatch_ptr:
Matt Arsenault48ab5262016-04-25 19:27:18 +00005251 case Intrinsic::amdgcn_queue_ptr: {
Konstantin Zhuravlyovaa067cb2018-10-04 21:02:16 +00005252 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005253 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005254 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
Oliver Stannard7e7d9832016-02-02 13:52:43 +00005255 DL.getDebugLoc());
Matt Arsenault800fecf2016-01-11 21:18:33 +00005256 DAG.getContext()->diagnose(BadIntrin);
5257 return DAG.getUNDEF(VT);
5258 }
5259
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005260 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5261 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5262 return getPreloadedValue(DAG, *MFI, VT, RegID);
Matt Arsenault48ab5262016-04-25 19:27:18 +00005263 }
Jan Veselyfea814d2016-06-21 20:46:20 +00005264 case Intrinsic::amdgcn_implicitarg_ptr: {
Matt Arsenault9166ce82017-07-28 15:52:08 +00005265 if (MFI->isEntryFunction())
5266 return getImplicitArgPtr(DAG, DL);
Matt Arsenault817c2532017-08-03 23:12:44 +00005267 return getPreloadedValue(DAG, *MFI, VT,
5268 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
Jan Veselyfea814d2016-06-21 20:46:20 +00005269 }
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005270 case Intrinsic::amdgcn_kernarg_segment_ptr: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005271 return getPreloadedValue(DAG, *MFI, VT,
5272 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Matt Arsenaultdc4ebad2016-04-29 21:16:52 +00005273 }
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005274 case Intrinsic::amdgcn_dispatch_id: {
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005275 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
Matt Arsenault8d718dc2016-07-22 17:01:30 +00005276 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005277 case Intrinsic::amdgcn_rcp:
5278 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5279 case Intrinsic::amdgcn_rsq:
5280 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005281 case Intrinsic::amdgcn_rsq_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005282 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005283 return emitRemovedIntrinsicError(DAG, DL, VT);
5284
5285 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
Eugene Zelenko66203762017-01-21 00:53:49 +00005286 case Intrinsic::amdgcn_rcp_legacy:
Tom Stellard5bfbae52018-07-11 20:59:01 +00005287 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault32fc5272016-07-26 16:45:45 +00005288 return emitRemovedIntrinsicError(DAG, DL, VT);
5289 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
Matt Arsenault09b2c4a2016-07-15 21:26:52 +00005290 case Intrinsic::amdgcn_rsq_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005291 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenault79963e82016-02-13 01:03:00 +00005292 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Tom Stellard48f29f22015-11-26 00:43:29 +00005293
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005294 Type *Type = VT.getTypeForEVT(*DAG.getContext());
5295 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5296 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5297
5298 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5299 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5300 DAG.getConstantFP(Max, DL, VT));
5301 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5302 DAG.getConstantFP(Min, DL, VT));
5303 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005304 case Intrinsic::r600_read_ngroups_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005305 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005306 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005307
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005308 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005309 SI::KernelInputOffsets::NGROUPS_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005310 case Intrinsic::r600_read_ngroups_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005311 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005312 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005313
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005314 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005315 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005316 case Intrinsic::r600_read_ngroups_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005317 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005318 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005319
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005320 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005321 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005322 case Intrinsic::r600_read_global_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005323 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005324 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005325
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005326 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005327 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005328 case Intrinsic::r600_read_global_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005329 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005330 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005331
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005332 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005333 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005334 case Intrinsic::r600_read_global_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005335 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005336 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005337
Matt Arsenaulte622dc32017-04-11 22:29:24 +00005338 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
Matt Arsenault7b4826e2018-05-30 16:17:51 +00005339 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005340 case Intrinsic::r600_read_local_size_x:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005341 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005342 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005343
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005344 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5345 SI::KernelInputOffsets::LOCAL_SIZE_X);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005346 case Intrinsic::r600_read_local_size_y:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005347 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005348 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005349
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005350 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5351 SI::KernelInputOffsets::LOCAL_SIZE_Y);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005352 case Intrinsic::r600_read_local_size_z:
Matt Arsenaulte0132462016-01-30 05:19:45 +00005353 if (Subtarget->isAmdHsaOS())
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00005354 return emitNonHSAIntrinsicError(DAG, DL, VT);
Matt Arsenaulte0132462016-01-30 05:19:45 +00005355
Matt Arsenaultff6da2f2015-11-30 21:15:45 +00005356 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5357 SI::KernelInputOffsets::LOCAL_SIZE_Z);
Matt Arsenault43976df2016-01-30 04:25:19 +00005358 case Intrinsic::amdgcn_workgroup_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005359 case Intrinsic::r600_read_tgid_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005360 return getPreloadedValue(DAG, *MFI, VT,
5361 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Matt Arsenault43976df2016-01-30 04:25:19 +00005362 case Intrinsic::amdgcn_workgroup_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005363 case Intrinsic::r600_read_tgid_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005364 return getPreloadedValue(DAG, *MFI, VT,
5365 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
Matt Arsenault43976df2016-01-30 04:25:19 +00005366 case Intrinsic::amdgcn_workgroup_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005367 case Intrinsic::r600_read_tgid_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005368 return getPreloadedValue(DAG, *MFI, VT,
5369 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Reid Kleckner4dc0b1a2018-11-01 19:54:45 +00005370 case Intrinsic::amdgcn_workitem_id_x:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005371 case Intrinsic::r600_read_tidig_x:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005372 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5373 SDLoc(DAG.getEntryNode()),
5374 MFI->getArgInfo().WorkItemIDX);
Matt Arsenault43976df2016-01-30 04:25:19 +00005375 case Intrinsic::amdgcn_workitem_id_y:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005376 case Intrinsic::r600_read_tidig_y:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005377 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5378 SDLoc(DAG.getEntryNode()),
5379 MFI->getArgInfo().WorkItemIDY);
Matt Arsenault43976df2016-01-30 04:25:19 +00005380 case Intrinsic::amdgcn_workitem_id_z:
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005381 case Intrinsic::r600_read_tidig_z:
Matt Arsenault8623e8d2017-08-03 23:00:29 +00005382 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5383 SDLoc(DAG.getEntryNode()),
5384 MFI->getArgInfo().WorkItemIDZ);
Tim Renouf904343f2018-08-25 14:53:17 +00005385 case Intrinsic::amdgcn_s_buffer_load: {
5386 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00005387 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5388 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005389 }
Matt Arsenaultc5b641a2017-03-17 20:41:45 +00005390 case Intrinsic::amdgcn_fdiv_fast:
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00005391 return lowerFDIV_FAST(Op, DAG);
Tom Stellard2187bb82016-12-06 23:52:13 +00005392 case Intrinsic::amdgcn_interp_mov: {
5393 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5394 SDValue Glue = M0.getValue(1);
5395 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5396 Op.getOperand(2), Op.getOperand(3), Glue);
5397 }
Tom Stellardad7d03d2015-12-15 17:02:49 +00005398 case Intrinsic::amdgcn_interp_p1: {
5399 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5400 SDValue Glue = M0.getValue(1);
5401 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5402 Op.getOperand(2), Op.getOperand(3), Glue);
5403 }
5404 case Intrinsic::amdgcn_interp_p2: {
5405 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5406 SDValue Glue = SDValue(M0.getNode(), 1);
5407 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5408 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5409 Glue);
5410 }
Tim Corringham824ca3f2019-01-28 13:48:59 +00005411 case Intrinsic::amdgcn_interp_p1_f16: {
5412 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5413 SDValue Glue = M0.getValue(1);
5414 if (getSubtarget()->getLDSBankCount() == 16) {
5415 // 16 bank LDS
5416 SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5417 DAG.getConstant(2, DL, MVT::i32), // P0
5418 Op.getOperand(2), // Attrchan
5419 Op.getOperand(3), // Attr
5420 Glue);
5421 SDValue Ops[] = {
5422 Op.getOperand(1), // Src0
5423 Op.getOperand(2), // Attrchan
5424 Op.getOperand(3), // Attr
5425 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5426 S, // Src2 - holds two f16 values selected by high
5427 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5428 Op.getOperand(4), // high
5429 DAG.getConstant(0, DL, MVT::i1), // $clamp
5430 DAG.getConstant(0, DL, MVT::i32) // $omod
5431 };
5432 return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5433 } else {
5434 // 32 bank LDS
5435 SDValue Ops[] = {
5436 Op.getOperand(1), // Src0
5437 Op.getOperand(2), // Attrchan
5438 Op.getOperand(3), // Attr
5439 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5440 Op.getOperand(4), // high
5441 DAG.getConstant(0, DL, MVT::i1), // $clamp
5442 DAG.getConstant(0, DL, MVT::i32), // $omod
5443 Glue
5444 };
5445 return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5446 }
5447 }
5448 case Intrinsic::amdgcn_interp_p2_f16: {
5449 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5450 SDValue Glue = SDValue(M0.getNode(), 1);
5451 SDValue Ops[] = {
5452 Op.getOperand(2), // Src0
5453 Op.getOperand(3), // Attrchan
5454 Op.getOperand(4), // Attr
5455 DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5456 Op.getOperand(1), // Src2
5457 DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5458 Op.getOperand(5), // high
5459 DAG.getConstant(0, DL, MVT::i1), // $clamp
5460 Glue
5461 };
5462 return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5463 }
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005464 case Intrinsic::amdgcn_sin:
5465 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5466
5467 case Intrinsic::amdgcn_cos:
5468 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5469
5470 case Intrinsic::amdgcn_log_clamp: {
Tom Stellard5bfbae52018-07-11 20:59:01 +00005471 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005472 return SDValue();
5473
5474 DiagnosticInfoUnsupported BadIntrin(
Matthias Braunf1caa282017-12-15 22:22:58 +00005475 MF.getFunction(), "intrinsic not supported on subtarget",
Matt Arsenaultce56a0e2016-02-13 01:19:56 +00005476 DL.getDebugLoc());
5477 DAG.getContext()->diagnose(BadIntrin);
5478 return DAG.getUNDEF(VT);
5479 }
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005480 case Intrinsic::amdgcn_ldexp:
5481 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5482 Op.getOperand(1), Op.getOperand(2));
Matt Arsenault74015162016-05-28 00:19:52 +00005483
5484 case Intrinsic::amdgcn_fract:
5485 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5486
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005487 case Intrinsic::amdgcn_class:
5488 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5489 Op.getOperand(1), Op.getOperand(2));
5490 case Intrinsic::amdgcn_div_fmas:
5491 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5492 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5493 Op.getOperand(4));
5494
5495 case Intrinsic::amdgcn_div_fixup:
5496 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5497 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5498
5499 case Intrinsic::amdgcn_trig_preop:
5500 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5501 Op.getOperand(1), Op.getOperand(2));
5502 case Intrinsic::amdgcn_div_scale: {
Matt Arsenaultcaf13162019-03-12 21:02:54 +00005503 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
Matt Arsenaultf75257a2016-01-23 05:32:20 +00005504
5505 // Translate to the operands expected by the machine instruction. The
5506 // first parameter must be the same as the first instruction.
5507 SDValue Numerator = Op.getOperand(1);
5508 SDValue Denominator = Op.getOperand(2);
5509
5510 // Note this order is the opposite of the machine instruction's operand
5511 // order, which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5512 // intrinsic has the numerator as the first operand to match a normal
5513 // division operation.
5514
5515 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5516
5517 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5518 Denominator, Numerator);
5519 }
Wei Ding07e03712016-07-28 16:42:13 +00005520 case Intrinsic::amdgcn_icmp: {
Marek Olsak33eb4d92019-01-15 02:13:18 +00005521 // There is a Pat that handles this variant, so return it as-is.
5522 if (Op.getOperand(1).getValueType() == MVT::i1 &&
5523 Op.getConstantOperandVal(2) == 0 &&
5524 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5525 return Op;
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005526 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005527 }
5528 case Intrinsic::amdgcn_fcmp: {
Matt Arsenaultb3a80e52018-08-15 21:25:20 +00005529 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
Wei Ding07e03712016-07-28 16:42:13 +00005530 }
Matt Arsenaultf84e5d92017-01-31 03:07:46 +00005531 case Intrinsic::amdgcn_fmed3:
5532 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5533 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Farhana Aleenc370d7b2018-07-16 18:19:59 +00005534 case Intrinsic::amdgcn_fdot2:
5535 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00005536 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5537 Op.getOperand(4));
Matt Arsenault32fc5272016-07-26 16:45:45 +00005538 case Intrinsic::amdgcn_fmul_legacy:
5539 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5540 Op.getOperand(1), Op.getOperand(2));
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005541 case Intrinsic::amdgcn_sffbh:
Matt Arsenaultc96e1de2016-07-18 18:35:05 +00005542 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
Matt Arsenaultf5262252017-02-22 23:04:58 +00005543 case Intrinsic::amdgcn_sbfe:
5544 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5545 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5546 case Intrinsic::amdgcn_ubfe:
5547 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5548 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Marek Olsak13e47412018-01-31 20:18:04 +00005549 case Intrinsic::amdgcn_cvt_pkrtz:
5550 case Intrinsic::amdgcn_cvt_pknorm_i16:
5551 case Intrinsic::amdgcn_cvt_pknorm_u16:
5552 case Intrinsic::amdgcn_cvt_pk_i16:
5553 case Intrinsic::amdgcn_cvt_pk_u16: {
5554 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
Matt Arsenault1f17c662017-02-22 00:27:34 +00005555 EVT VT = Op.getValueType();
Marek Olsak13e47412018-01-31 20:18:04 +00005556 unsigned Opcode;
5557
5558 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5559 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5560 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5561 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5562 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5563 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5564 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5565 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5566 else
5567 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5568
Matt Arsenault709374d2018-08-01 20:13:58 +00005569 if (isTypeLegal(VT))
5570 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5571
Marek Olsak13e47412018-01-31 20:18:04 +00005572 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Matt Arsenault1f17c662017-02-22 00:27:34 +00005573 Op.getOperand(1), Op.getOperand(2));
5574 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5575 }
Connor Abbott8c217d02017-08-04 18:36:49 +00005576 case Intrinsic::amdgcn_wqm: {
5577 SDValue Src = Op.getOperand(1);
5578 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5579 0);
5580 }
Connor Abbott92638ab2017-08-04 18:36:52 +00005581 case Intrinsic::amdgcn_wwm: {
5582 SDValue Src = Op.getOperand(1);
5583 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5584 0);
5585 }
Stanislav Mekhanoshindacda792018-06-26 20:04:19 +00005586 case Intrinsic::amdgcn_fmad_ftz:
5587 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5588 Op.getOperand(2), Op.getOperand(3));
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005589 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00005590 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5591 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5592 return lowerImage(Op, ImageDimIntr, DAG);
5593
Matt Arsenault754dd3e2017-04-03 18:08:08 +00005594 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00005595 }
5596}
5597
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005598SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5599 SelectionDAG &DAG) const {
5600 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Tom Stellard6f9ef142016-12-20 17:19:44 +00005601 SDLoc DL(Op);
David Stuttard70e8bc12017-06-22 16:29:22 +00005602
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005603 switch (IntrID) {
Marek Olsakc5cec5e2019-01-16 15:43:53 +00005604 case Intrinsic::amdgcn_ds_ordered_add:
5605 case Intrinsic::amdgcn_ds_ordered_swap: {
5606 MemSDNode *M = cast<MemSDNode>(Op);
5607 SDValue Chain = M->getOperand(0);
5608 SDValue M0 = M->getOperand(2);
5609 SDValue Value = M->getOperand(3);
5610 unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5611 unsigned WaveRelease = M->getConstantOperandVal(8);
5612 unsigned WaveDone = M->getConstantOperandVal(9);
5613 unsigned ShaderType;
5614 unsigned Instruction;
5615
5616 switch (IntrID) {
5617 case Intrinsic::amdgcn_ds_ordered_add:
5618 Instruction = 0;
5619 break;
5620 case Intrinsic::amdgcn_ds_ordered_swap:
5621 Instruction = 1;
5622 break;
5623 }
5624
5625 if (WaveDone && !WaveRelease)
5626 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5627
5628 switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5629 case CallingConv::AMDGPU_CS:
5630 case CallingConv::AMDGPU_KERNEL:
5631 ShaderType = 0;
5632 break;
5633 case CallingConv::AMDGPU_PS:
5634 ShaderType = 1;
5635 break;
5636 case CallingConv::AMDGPU_VS:
5637 ShaderType = 2;
5638 break;
5639 case CallingConv::AMDGPU_GS:
5640 ShaderType = 3;
5641 break;
5642 default:
5643 report_fatal_error("ds_ordered_count unsupported for this calling conv");
5644 }
5645
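    // Pack the ds_ordered_count offset field:
    //   bits [7:0]   index * 4
    //   bit  [8]     wave_release
    //   bit  [9]     wave_done
    //   bits [11:10] shader type
    //   bit  [12]    instruction (0 = add, 1 = swap)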
5646 unsigned Offset0 = OrderedCountIndex << 2;
5647 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5648 (Instruction << 4);
5649 unsigned Offset = Offset0 | (Offset1 << 8);
5650
5651 SDValue Ops[] = {
5652 Chain,
5653 Value,
5654 DAG.getTargetConstant(Offset, DL, MVT::i16),
5655 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5656 };
5657 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5658 M->getVTList(), Ops, M->getMemoryVT(),
5659 M->getMemOperand());
5660 }
Matt Arsenaulta5840c32019-01-22 18:36:06 +00005661 case Intrinsic::amdgcn_ds_fadd: {
5662 MemSDNode *M = cast<MemSDNode>(Op);
5663 unsigned Opc;
5664 switch (IntrID) {
5665 case Intrinsic::amdgcn_ds_fadd:
5666 Opc = ISD::ATOMIC_LOAD_FADD;
5667 break;
5668 }
5669
5670 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5671 M->getOperand(0), M->getOperand(2), M->getOperand(3),
5672 M->getMemOperand());
5673 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005674 case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005675 case Intrinsic::amdgcn_atomic_dec:
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005676 case Intrinsic::amdgcn_ds_fmin:
5677 case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005678 MemSDNode *M = cast<MemSDNode>(Op);
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005679 unsigned Opc;
5680 switch (IntrID) {
5681 case Intrinsic::amdgcn_atomic_inc:
5682 Opc = AMDGPUISD::ATOMIC_INC;
5683 break;
5684 case Intrinsic::amdgcn_atomic_dec:
5685 Opc = AMDGPUISD::ATOMIC_DEC;
5686 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005687 case Intrinsic::amdgcn_ds_fmin:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005688 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5689 break;
Daniil Fukalov6e1dc682018-01-26 11:09:38 +00005690 case Intrinsic::amdgcn_ds_fmax:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00005691 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5692 break;
5693 default:
5694 llvm_unreachable("Unknown intrinsic!");
5695 }
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00005696 SDValue Ops[] = {
5697 M->getOperand(0), // Chain
5698 M->getOperand(2), // Ptr
5699 M->getOperand(3) // Value
5700 };
5701
5702 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5703 M->getMemoryVT(), M->getMemOperand());
5704 }
Tom Stellard6f9ef142016-12-20 17:19:44 +00005705 case Intrinsic::amdgcn_buffer_load:
5706 case Intrinsic::amdgcn_buffer_load_format: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005707 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5708 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5709 unsigned IdxEn = 1;
5710 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5711 IdxEn = Idx->getZExtValue() != 0;
Tom Stellard6f9ef142016-12-20 17:19:44 +00005712 SDValue Ops[] = {
5713 Op.getOperand(0), // Chain
5714 Op.getOperand(2), // rsrc
5715 Op.getOperand(3), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005716 SDValue(), // voffset -- will be set by setBufferOffsets
5717 SDValue(), // soffset -- will be set by setBufferOffsets
5718 SDValue(), // offset -- will be set by setBufferOffsets
5719 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5720 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Tom Stellard6f9ef142016-12-20 17:19:44 +00005721 };
Tom Stellard6f9ef142016-12-20 17:19:44 +00005722
Tim Renouf4f703f52018-08-21 11:07:10 +00005723 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005724 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5725 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
Tim Renouf4f703f52018-08-21 11:07:10 +00005726
5727 EVT VT = Op.getValueType();
5728 EVT IntVT = VT.changeTypeToInteger();
5729 auto *M = cast<MemSDNode>(Op);
5730 EVT LoadVT = Op.getValueType();
5731
5732 if (LoadVT.getScalarType() == MVT::f16)
5733 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5734 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005735
5736 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5737 if (LoadVT.getScalarType() == MVT::i8 ||
5738 LoadVT.getScalarType() == MVT::i16)
5739 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5740
Tim Renouf677387d2019-03-22 14:58:02 +00005741 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5742 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005743 }
5744 case Intrinsic::amdgcn_raw_buffer_load:
5745 case Intrinsic::amdgcn_raw_buffer_load_format: {
5746 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5747 SDValue Ops[] = {
5748 Op.getOperand(0), // Chain
5749 Op.getOperand(2), // rsrc
5750 DAG.getConstant(0, DL, MVT::i32), // vindex
5751 Offsets.first, // voffset
5752 Op.getOperand(4), // soffset
5753 Offsets.second, // offset
5754 Op.getOperand(5), // cachepolicy
5755 DAG.getConstant(0, DL, MVT::i1), // idxen
5756 };
5757
5758 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5759 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5760
5761 EVT VT = Op.getValueType();
5762 EVT IntVT = VT.changeTypeToInteger();
5763 auto *M = cast<MemSDNode>(Op);
5764 EVT LoadVT = Op.getValueType();
5765
5766 if (LoadVT.getScalarType() == MVT::f16)
5767 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5768 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005769
5770 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5771 if (LoadVT.getScalarType() == MVT::i8 ||
5772 LoadVT.getScalarType() == MVT::i16)
5773 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5774
Tim Renouf677387d2019-03-22 14:58:02 +00005775 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5776 M->getMemOperand(), DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00005777 }
5778 case Intrinsic::amdgcn_struct_buffer_load:
5779 case Intrinsic::amdgcn_struct_buffer_load_format: {
5780 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5781 SDValue Ops[] = {
5782 Op.getOperand(0), // Chain
5783 Op.getOperand(2), // rsrc
5784 Op.getOperand(3), // vindex
5785 Offsets.first, // voffset
5786 Op.getOperand(5), // soffset
5787 Offsets.second, // offset
5788 Op.getOperand(6), // cachepolicy
5789 DAG.getConstant(1, DL, MVT::i1), // idxen
5790 };
5791
5792 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5793 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5794
Tom Stellard6f9ef142016-12-20 17:19:44 +00005795 EVT VT = Op.getValueType();
5796 EVT IntVT = VT.changeTypeToInteger();
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005797 auto *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005798 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005799
Tim Renouf366a49d2018-08-02 23:33:01 +00005800 if (LoadVT.getScalarType() == MVT::f16)
5801 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5802 M, DAG, Ops);
Ryan Taylor00e063a2019-03-19 16:07:00 +00005803
5804 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5805 if (LoadVT.getScalarType() == MVT::i8 ||
5806 LoadVT.getScalarType() == MVT::i16)
5807 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5808
Tim Renouf677387d2019-03-22 14:58:02 +00005809 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5810 M->getMemOperand(), DAG);
Tom Stellard6f9ef142016-12-20 17:19:44 +00005811 }
David Stuttard70e8bc12017-06-22 16:29:22 +00005812 case Intrinsic::amdgcn_tbuffer_load: {
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005813 MemSDNode *M = cast<MemSDNode>(Op);
Matt Arsenault1349a042018-05-22 06:32:10 +00005814 EVT LoadVT = Op.getValueType();
Matt Arsenault1349a042018-05-22 06:32:10 +00005815
Tim Renouf35484c92018-08-21 11:06:05 +00005816 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5817 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5818 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5819 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5820 unsigned IdxEn = 1;
5821 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5822 IdxEn = Idx->getZExtValue() != 0;
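// The legacy intrinsic passes dfmt and nfmt as separate operands; they are
// packed here into the single combined format operand of the target node.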
David Stuttard70e8bc12017-06-22 16:29:22 +00005823 SDValue Ops[] = {
5824 Op.getOperand(0), // Chain
5825 Op.getOperand(2), // rsrc
5826 Op.getOperand(3), // vindex
5827 Op.getOperand(4), // voffset
5828 Op.getOperand(5), // soffset
5829 Op.getOperand(6), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00005830 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5831 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5832 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5833 };
5834
5835 if (LoadVT.getScalarType() == MVT::f16)
5836 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5837 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005838 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5839 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5840 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00005841 }
5842 case Intrinsic::amdgcn_raw_tbuffer_load: {
5843 MemSDNode *M = cast<MemSDNode>(Op);
5844 EVT LoadVT = Op.getValueType();
5845 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5846
5847 SDValue Ops[] = {
5848 Op.getOperand(0), // Chain
5849 Op.getOperand(2), // rsrc
5850 DAG.getConstant(0, DL, MVT::i32), // vindex
5851 Offsets.first, // voffset
5852 Op.getOperand(4), // soffset
5853 Offsets.second, // offset
5854 Op.getOperand(5), // format
5855 Op.getOperand(6), // cachepolicy
5856 DAG.getConstant(0, DL, MVT::i1), // idxen
5857 };
5858
5859 if (LoadVT.getScalarType() == MVT::f16)
5860 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5861 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005862 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5863 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5864 DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00005865 }
5866 case Intrinsic::amdgcn_struct_tbuffer_load: {
5867 MemSDNode *M = cast<MemSDNode>(Op);
5868 EVT LoadVT = Op.getValueType();
5869 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5870
5871 SDValue Ops[] = {
5872 Op.getOperand(0), // Chain
5873 Op.getOperand(2), // rsrc
5874 Op.getOperand(3), // vindex
5875 Offsets.first, // voffset
5876 Op.getOperand(5), // soffset
5877 Offsets.second, // offset
5878 Op.getOperand(6), // format
5879 Op.getOperand(7), // cachepolicy
5880 DAG.getConstant(1, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00005881 };
5882
Tim Renouf366a49d2018-08-02 23:33:01 +00005883 if (LoadVT.getScalarType() == MVT::f16)
5884 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5885 M, DAG, Ops);
Tim Renouf677387d2019-03-22 14:58:02 +00005886 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5887 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5888 DAG);
David Stuttard70e8bc12017-06-22 16:29:22 +00005889 }
Marek Olsak5cec6412017-11-09 01:52:48 +00005890 case Intrinsic::amdgcn_buffer_atomic_swap:
5891 case Intrinsic::amdgcn_buffer_atomic_add:
5892 case Intrinsic::amdgcn_buffer_atomic_sub:
5893 case Intrinsic::amdgcn_buffer_atomic_smin:
5894 case Intrinsic::amdgcn_buffer_atomic_umin:
5895 case Intrinsic::amdgcn_buffer_atomic_smax:
5896 case Intrinsic::amdgcn_buffer_atomic_umax:
5897 case Intrinsic::amdgcn_buffer_atomic_and:
5898 case Intrinsic::amdgcn_buffer_atomic_or:
5899 case Intrinsic::amdgcn_buffer_atomic_xor: {
Tim Renouf4f703f52018-08-21 11:07:10 +00005900 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5901 unsigned IdxEn = 1;
5902 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5903 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00005904 SDValue Ops[] = {
5905 Op.getOperand(0), // Chain
5906 Op.getOperand(2), // vdata
5907 Op.getOperand(3), // rsrc
5908 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00005909 SDValue(), // voffset -- will be set by setBufferOffsets
5910 SDValue(), // soffset -- will be set by setBufferOffsets
5911 SDValue(), // offset -- will be set by setBufferOffsets
5912 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5913 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00005914 };
Tim Renouf4f703f52018-08-21 11:07:10 +00005915 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005916 EVT VT = Op.getValueType();
5917
5918 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00005919 unsigned Opcode = 0;
5920
5921 switch (IntrID) {
5922 case Intrinsic::amdgcn_buffer_atomic_swap:
5923 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5924 break;
5925 case Intrinsic::amdgcn_buffer_atomic_add:
5926 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5927 break;
5928 case Intrinsic::amdgcn_buffer_atomic_sub:
5929 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5930 break;
5931 case Intrinsic::amdgcn_buffer_atomic_smin:
5932 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5933 break;
5934 case Intrinsic::amdgcn_buffer_atomic_umin:
5935 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5936 break;
5937 case Intrinsic::amdgcn_buffer_atomic_smax:
5938 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5939 break;
5940 case Intrinsic::amdgcn_buffer_atomic_umax:
5941 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5942 break;
5943 case Intrinsic::amdgcn_buffer_atomic_and:
5944 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5945 break;
5946 case Intrinsic::amdgcn_buffer_atomic_or:
5947 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5948 break;
5949 case Intrinsic::amdgcn_buffer_atomic_xor:
5950 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5951 break;
5952 default:
5953 llvm_unreachable("unhandled atomic opcode");
5954 }
5955
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00005956 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5957 M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00005958 }
Tim Renouf4f703f52018-08-21 11:07:10 +00005959 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5960 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5961 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5962 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5963 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5964 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5965 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5966 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5967 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5968 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5969 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5970 SDValue Ops[] = {
5971 Op.getOperand(0), // Chain
5972 Op.getOperand(2), // vdata
5973 Op.getOperand(3), // rsrc
5974 DAG.getConstant(0, DL, MVT::i32), // vindex
5975 Offsets.first, // voffset
5976 Op.getOperand(5), // soffset
5977 Offsets.second, // offset
5978 Op.getOperand(6), // cachepolicy
5979 DAG.getConstant(0, DL, MVT::i1), // idxen
5980 };
5981 EVT VT = Op.getValueType();
Marek Olsak5cec6412017-11-09 01:52:48 +00005982
Tim Renouf4f703f52018-08-21 11:07:10 +00005983 auto *M = cast<MemSDNode>(Op);
5984 unsigned Opcode = 0;
5985
5986 switch (IntrID) {
5987 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5988 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5989 break;
5990 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5991 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5992 break;
5993 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5994 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5995 break;
5996 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5997 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5998 break;
5999 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6000 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6001 break;
6002 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6003 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6004 break;
6005 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6006 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6007 break;
6008 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6009 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6010 break;
6011 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6012 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6013 break;
6014 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6015 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6016 break;
6017 default:
6018 llvm_unreachable("unhandled atomic opcode");
6019 }
6020
6021 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6022 M->getMemOperand());
6023 }
6024 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6025 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6026 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6027 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6028 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6029 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6030 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6031 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6032 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6033 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
6034 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6035 SDValue Ops[] = {
6036 Op.getOperand(0), // Chain
6037 Op.getOperand(2), // vdata
6038 Op.getOperand(3), // rsrc
6039 Op.getOperand(4), // vindex
6040 Offsets.first, // voffset
6041 Op.getOperand(6), // soffset
6042 Offsets.second, // offset
6043 Op.getOperand(7), // cachepolicy
6044 DAG.getConstant(1, DL, MVT::i1), // idxen
6045 };
6046 EVT VT = Op.getValueType();
6047
6048 auto *M = cast<MemSDNode>(Op);
6049 unsigned Opcode = 0;
6050
6051 switch (IntrID) {
6052 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6053 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6054 break;
6055 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6056 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6057 break;
6058 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6059 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6060 break;
6061 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6062 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6063 break;
6064 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6065 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6066 break;
6067 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6068 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6069 break;
6070 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6071 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6072 break;
6073 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6074 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6075 break;
6076 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6077 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6078 break;
6079 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6080 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6081 break;
6082 default:
6083 llvm_unreachable("unhandled atomic opcode");
6084 }
6085
6086 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6087 M->getMemOperand());
6088 }
Marek Olsak5cec6412017-11-09 01:52:48 +00006089 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
Tim Renouf4f703f52018-08-21 11:07:10 +00006090 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6091 unsigned IdxEn = 1;
6092 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
6093 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006094 SDValue Ops[] = {
6095 Op.getOperand(0), // Chain
6096 Op.getOperand(2), // src
6097 Op.getOperand(3), // cmp
6098 Op.getOperand(4), // rsrc
6099 Op.getOperand(5), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006100 SDValue(), // voffset -- will be set by setBufferOffsets
6101 SDValue(), // soffset -- will be set by setBufferOffsets
6102 SDValue(), // offset -- will be set by setBufferOffsets
6103 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6104 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6105 };
6106 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
6107 EVT VT = Op.getValueType();
6108 auto *M = cast<MemSDNode>(Op);
6109
6110 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6111 Op->getVTList(), Ops, VT, M->getMemOperand());
6112 }
6113 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
6114 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6115 SDValue Ops[] = {
6116 Op.getOperand(0), // Chain
6117 Op.getOperand(2), // src
6118 Op.getOperand(3), // cmp
6119 Op.getOperand(4), // rsrc
6120 DAG.getConstant(0, DL, MVT::i32), // vindex
6121 Offsets.first, // voffset
6122 Op.getOperand(6), // soffset
6123 Offsets.second, // offset
6124 Op.getOperand(7), // cachepolicy
6125 DAG.getConstant(0, DL, MVT::i1), // idxen
6126 };
6127 EVT VT = Op.getValueType();
6128 auto *M = cast<MemSDNode>(Op);
6129
6130 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6131 Op->getVTList(), Ops, VT, M->getMemOperand());
6132 }
6133 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
6134 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
6135 SDValue Ops[] = {
6136 Op.getOperand(0), // Chain
6137 Op.getOperand(2), // src
6138 Op.getOperand(3), // cmp
6139 Op.getOperand(4), // rsrc
6140 Op.getOperand(5), // vindex
6141 Offsets.first, // voffset
6142 Op.getOperand(7), // soffset
6143 Offsets.second, // offset
6144 Op.getOperand(8), // cachepolicy
6145 DAG.getConstant(1, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006146 };
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006147 EVT VT = Op.getValueType();
6148 auto *M = cast<MemSDNode>(Op);
Marek Olsak5cec6412017-11-09 01:52:48 +00006149
6150 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Matt Arsenaulte19bc2e2017-12-29 17:18:21 +00006151 Op->getVTList(), Ops, VT, M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006152 }
6153
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006154 default:
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006155 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6156 AMDGPU::getImageDimIntrinsicInfo(IntrID))
6157 return lowerImage(Op, ImageDimIntr, DAG);
Matt Arsenault1349a042018-05-22 06:32:10 +00006158
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00006159 return SDValue();
6160 }
6161}
6162
Tim Renouf677387d2019-03-22 14:58:02 +00006163// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
6164// dwordx4 if on SI.
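// The widened dwordx4 result is trimmed back to the original dwordx3 type
// with an EXTRACT_SUBVECTOR before being returned to the caller.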
6165SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
6166 SDVTList VTList,
6167 ArrayRef<SDValue> Ops, EVT MemVT,
6168 MachineMemOperand *MMO,
6169 SelectionDAG &DAG) const {
6170 EVT VT = VTList.VTs[0];
6171 EVT WidenedVT = VT;
6172 EVT WidenedMemVT = MemVT;
6173 if (!Subtarget->hasDwordx3LoadStores() &&
6174 (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
6175 WidenedVT = EVT::getVectorVT(*DAG.getContext(),
6176 WidenedVT.getVectorElementType(), 4);
6177 WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
6178 WidenedMemVT.getVectorElementType(), 4);
6179 MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
6180 }
6181
6182 assert(VTList.NumVTs == 2);
6183 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
6184
6185 auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
6186 WidenedMemVT, MMO);
6187 if (WidenedVT != VT) {
6188 auto Extract = DAG.getNode(
6189 ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
6190 DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
6191 NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
6192 }
6193 return NewOp;
6194}
6195
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006196SDValue SITargetLowering::handleD16VData(SDValue VData,
6197 SelectionDAG &DAG) const {
6198 EVT StoreVT = VData.getValueType();
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006199
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006200 // No change for f16 and legal vector D16 types.
Matt Arsenault1349a042018-05-22 06:32:10 +00006201 if (!StoreVT.isVector())
6202 return VData;
6203
6204 SDLoc DL(VData);
6205 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
6206
6207 if (Subtarget->hasUnpackedD16VMem()) {
6208 // We need to unpack the packed data to store.
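    // For example (illustrative): a v2f16 store value is bitcast to v2i16 and
    // zero-extended to v2i32, so each 32-bit element carries one 16-bit
    // component, as expected by unpacked D16 memory instructions.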
6209 EVT IntStoreVT = StoreVT.changeTypeToInteger();
6210 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6211
6212 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6213 StoreVT.getVectorNumElements());
6214 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6215 return DAG.UnrollVectorOp(ZExt.getNode());
6216 }
6217
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006218 assert(isTypeLegal(StoreVT));
6219 return VData;
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006220}
6221
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006222SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6223 SelectionDAG &DAG) const {
Tom Stellardfc92e772015-05-12 14:18:14 +00006224 SDLoc DL(Op);
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006225 SDValue Chain = Op.getOperand(0);
6226 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
David Stuttard70e8bc12017-06-22 16:29:22 +00006227 MachineFunction &MF = DAG.getMachineFunction();
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006228
6229 switch (IntrinsicID) {
Matt Arsenault7d6b71d2017-02-21 22:50:41 +00006230 case Intrinsic::amdgcn_exp: {
Matt Arsenault4165efd2017-01-17 07:26:53 +00006231 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6232 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6233 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6234 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6235
6236 const SDValue Ops[] = {
6237 Chain,
6238 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6239 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6240 Op.getOperand(4), // src0
6241 Op.getOperand(5), // src1
6242 Op.getOperand(6), // src2
6243 Op.getOperand(7), // src3
6244 DAG.getTargetConstant(0, DL, MVT::i1), // compr
6245 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6246 };
6247
6248 unsigned Opc = Done->isNullValue() ?
6249 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6250 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6251 }
6252 case Intrinsic::amdgcn_exp_compr: {
6253 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6254 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6255 SDValue Src0 = Op.getOperand(4);
6256 SDValue Src1 = Op.getOperand(5);
6257 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6258 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6259
6260 SDValue Undef = DAG.getUNDEF(MVT::f32);
6261 const SDValue Ops[] = {
6262 Chain,
6263 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6264 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6265 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6266 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6267 Undef, // src2
6268 Undef, // src3
6269 DAG.getTargetConstant(1, DL, MVT::i1), // compr
6270 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6271 };
6272
6273 unsigned Opc = Done->isNullValue() ?
6274 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6275 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6276 }
6277 case Intrinsic::amdgcn_s_sendmsg:
Matt Arsenaultd3e5cb72017-02-16 02:01:17 +00006278 case Intrinsic::amdgcn_s_sendmsghalt: {
6279 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6280 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Tom Stellardfc92e772015-05-12 14:18:14 +00006281 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6282 SDValue Glue = Chain.getValue(1);
Matt Arsenaulta78ca622017-02-15 22:17:09 +00006283 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Jan Veselyd48445d2017-01-04 18:06:55 +00006284 Op.getOperand(2), Glue);
6285 }
Marek Olsak2d825902017-04-28 20:21:58 +00006286 case Intrinsic::amdgcn_init_exec: {
6287 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6288 Op.getOperand(2));
6289 }
6290 case Intrinsic::amdgcn_init_exec_from_input: {
6291 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6292 Op.getOperand(2), Op.getOperand(3));
6293 }
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006294 case Intrinsic::amdgcn_s_barrier: {
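    // When optimizing and the whole workgroup fits in a single wave, the
    // barrier is not needed for correctness; keep only a WAVE_BARRIER pseudo
    // as a code-motion barrier.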
6295 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00006296 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Matthias Braunf1caa282017-12-15 22:22:58 +00006297 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
Stanislav Mekhanoshinea57c382017-04-06 16:48:30 +00006298 if (WGSize <= ST.getWavefrontSize())
6299 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6300 Op.getOperand(0)), 0);
6301 }
6302 return SDValue();
6303 }
David Stuttard70e8bc12017-06-22 16:29:22 +00006304 case Intrinsic::amdgcn_tbuffer_store: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006305 SDValue VData = Op.getOperand(2);
6306 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6307 if (IsD16)
6308 VData = handleD16VData(VData, DAG);
Tim Renouf35484c92018-08-21 11:06:05 +00006309 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6310 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6311 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6312 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6313 unsigned IdxEn = 1;
6314 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6315 IdxEn = Idx->getZExtValue() != 0;
David Stuttard70e8bc12017-06-22 16:29:22 +00006316 SDValue Ops[] = {
6317 Chain,
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006318 VData, // vdata
David Stuttard70e8bc12017-06-22 16:29:22 +00006319 Op.getOperand(3), // rsrc
6320 Op.getOperand(4), // vindex
6321 Op.getOperand(5), // voffset
6322 Op.getOperand(6), // soffset
6323 Op.getOperand(7), // offset
Tim Renouf35484c92018-08-21 11:06:05 +00006324 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6325 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6326 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6327 };
6328 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6329 AMDGPUISD::TBUFFER_STORE_FORMAT;
6330 MemSDNode *M = cast<MemSDNode>(Op);
6331 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6332 M->getMemoryVT(), M->getMemOperand());
6333 }
6334
6335 case Intrinsic::amdgcn_struct_tbuffer_store: {
6336 SDValue VData = Op.getOperand(2);
6337 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6338 if (IsD16)
6339 VData = handleD16VData(VData, DAG);
6340 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6341 SDValue Ops[] = {
6342 Chain,
6343 VData, // vdata
6344 Op.getOperand(3), // rsrc
6345 Op.getOperand(4), // vindex
6346 Offsets.first, // voffset
6347 Op.getOperand(6), // soffset
6348 Offsets.second, // offset
6349 Op.getOperand(7), // format
6350 Op.getOperand(8), // cachepolicy
6351 DAG.getConstant(1, DL, MVT::i1), // idxen
6352 };
6353 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6354 AMDGPUISD::TBUFFER_STORE_FORMAT;
6355 MemSDNode *M = cast<MemSDNode>(Op);
6356 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6357 M->getMemoryVT(), M->getMemOperand());
6358 }
6359
6360 case Intrinsic::amdgcn_raw_tbuffer_store: {
6361 SDValue VData = Op.getOperand(2);
6362 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6363 if (IsD16)
6364 VData = handleD16VData(VData, DAG);
6365 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6366 SDValue Ops[] = {
6367 Chain,
6368 VData, // vdata
6369 Op.getOperand(3), // rsrc
6370 DAG.getConstant(0, DL, MVT::i32), // vindex
6371 Offsets.first, // voffset
6372 Op.getOperand(5), // soffset
6373 Offsets.second, // offset
6374 Op.getOperand(6), // format
6375 Op.getOperand(7), // cachepolicy
6376 DAG.getConstant(0, DL, MVT::i1), // idxen
David Stuttard70e8bc12017-06-22 16:29:22 +00006377 };
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006378 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6379 AMDGPUISD::TBUFFER_STORE_FORMAT;
6380 MemSDNode *M = cast<MemSDNode>(Op);
6381 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6382 M->getMemoryVT(), M->getMemOperand());
David Stuttard70e8bc12017-06-22 16:29:22 +00006383 }
6384
Marek Olsak5cec6412017-11-09 01:52:48 +00006385 case Intrinsic::amdgcn_buffer_store:
6386 case Intrinsic::amdgcn_buffer_store_format: {
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006387 SDValue VData = Op.getOperand(2);
6388 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6389 if (IsD16)
6390 VData = handleD16VData(VData, DAG);
Tim Renouf4f703f52018-08-21 11:07:10 +00006391 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6392 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6393 unsigned IdxEn = 1;
6394 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6395 IdxEn = Idx->getZExtValue() != 0;
Marek Olsak5cec6412017-11-09 01:52:48 +00006396 SDValue Ops[] = {
6397 Chain,
Tim Renouf4f703f52018-08-21 11:07:10 +00006398 VData,
Marek Olsak5cec6412017-11-09 01:52:48 +00006399 Op.getOperand(3), // rsrc
6400 Op.getOperand(4), // vindex
Tim Renouf4f703f52018-08-21 11:07:10 +00006401 SDValue(), // voffset -- will be set by setBufferOffsets
6402 SDValue(), // soffset -- will be set by setBufferOffsets
6403 SDValue(), // offset -- will be set by setBufferOffsets
6404 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6405 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
Marek Olsak5cec6412017-11-09 01:52:48 +00006406 };
Tim Renouf4f703f52018-08-21 11:07:10 +00006407 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006408 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6409 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6410 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6411 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006412
6413 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6414 EVT VDataType = VData.getValueType().getScalarType();
6415 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6416 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6417
Changpeng Fang44dfa1d2018-01-12 21:12:19 +00006418 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6419 M->getMemoryVT(), M->getMemOperand());
Marek Olsak5cec6412017-11-09 01:52:48 +00006420 }
Tim Renouf4f703f52018-08-21 11:07:10 +00006421
6422 case Intrinsic::amdgcn_raw_buffer_store:
6423 case Intrinsic::amdgcn_raw_buffer_store_format: {
6424 SDValue VData = Op.getOperand(2);
6425 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6426 if (IsD16)
6427 VData = handleD16VData(VData, DAG);
6428 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6429 SDValue Ops[] = {
6430 Chain,
6431 VData,
6432 Op.getOperand(3), // rsrc
6433 DAG.getConstant(0, DL, MVT::i32), // vindex
6434 Offsets.first, // voffset
6435 Op.getOperand(5), // soffset
6436 Offsets.second, // offset
6437 Op.getOperand(6), // cachepolicy
6438 DAG.getConstant(0, DL, MVT::i1), // idxen
6439 };
6440 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6441 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6442 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6443 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006444
6445 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6446 EVT VDataType = VData.getValueType().getScalarType();
6447 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6448 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6449
Tim Renouf4f703f52018-08-21 11:07:10 +00006450 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6451 M->getMemoryVT(), M->getMemOperand());
6452 }
6453
6454 case Intrinsic::amdgcn_struct_buffer_store:
6455 case Intrinsic::amdgcn_struct_buffer_store_format: {
6456 SDValue VData = Op.getOperand(2);
6457 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6458 if (IsD16)
6459 VData = handleD16VData(VData, DAG);
6460 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6461 SDValue Ops[] = {
6462 Chain,
6463 VData,
6464 Op.getOperand(3), // rsrc
6465 Op.getOperand(4), // vindex
6466 Offsets.first, // voffset
6467 Op.getOperand(6), // soffset
6468 Offsets.second, // offset
6469 Op.getOperand(7), // cachepolicy
6470 DAG.getConstant(1, DL, MVT::i1), // idxen
6471 };
6472 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6473 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6474 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6475 MemSDNode *M = cast<MemSDNode>(Op);
Ryan Taylor00e063a2019-03-19 16:07:00 +00006476
6477 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6478 EVT VDataType = VData.getValueType().getScalarType();
6479 if (VDataType == MVT::i8 || VDataType == MVT::i16)
6480 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6481
Tim Renouf4f703f52018-08-21 11:07:10 +00006482 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6483 M->getMemoryVT(), M->getMemOperand());
6484 }
6485
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006486 default: {
Nicolai Haehnle7a9c03f2018-06-21 13:36:57 +00006487 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6488 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6489 return lowerImage(Op, ImageDimIntr, DAG);
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006490
Matt Arsenault754dd3e2017-04-03 18:08:08 +00006491 return Op;
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006492 }
Nicolai Haehnle2f5a7382018-04-04 10:58:54 +00006493 }
Matt Arsenaulta5789bb2014-07-26 06:23:37 +00006494}
6495
Tim Renouf4f703f52018-08-21 11:07:10 +00006496// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6497// offset (the offset that is included in bounds checking and swizzling, to be
6498// split between the instruction's voffset and immoffset fields) and soffset
6499// (the offset that is excluded from bounds checking and swizzling, to go in
6500// the instruction's soffset field). This function takes the first kind of
6501// offset and figures out how to split it between voffset and immoffset.
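// For example (illustrative): a constant combined offset of 5000 becomes a
// voffset contribution of 4096 plus an immoffset of 904, since the immediate
// field only holds values up to MaxImm (4095) below.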
Tim Renouf35484c92018-08-21 11:06:05 +00006502std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6503 SDValue Offset, SelectionDAG &DAG) const {
6504 SDLoc DL(Offset);
6505 const unsigned MaxImm = 4095;
6506 SDValue N0 = Offset;
6507 ConstantSDNode *C1 = nullptr;
Piotr Sobczak378131b2019-01-02 09:47:41 +00006508
6509 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
Tim Renouf35484c92018-08-21 11:06:05 +00006510 N0 = SDValue();
Piotr Sobczak378131b2019-01-02 09:47:41 +00006511 else if (DAG.isBaseWithConstantOffset(N0)) {
6512 C1 = cast<ConstantSDNode>(N0.getOperand(1));
6513 N0 = N0.getOperand(0);
6514 }
Tim Renouf35484c92018-08-21 11:06:05 +00006515
6516 if (C1) {
6517 unsigned ImmOffset = C1->getZExtValue();
6518 // If the immediate value is too big for the immoffset field, keep only its
Tim Renoufa37679d2018-10-03 10:29:43 +00006519 // low 12 bits there, so that the value that is copied/added
Tim Renouf35484c92018-08-21 11:06:05 +00006520 // for the voffset field is a multiple of 4096, and it stands more chance
6521 // of being CSEd with the copy/add for another similar load/store.
Tim Renoufa37679d2018-10-03 10:29:43 +00006522 // However, do not do that rounding down to a multiple of 4096 if that is a
6523 // negative number, as it appears to be illegal to have a negative offset
6524 // in the vgpr, even if adding the immediate offset makes it positive.
Tim Renouf35484c92018-08-21 11:06:05 +00006525 unsigned Overflow = ImmOffset & ~MaxImm;
6526 ImmOffset -= Overflow;
Tim Renoufa37679d2018-10-03 10:29:43 +00006527 if ((int32_t)Overflow < 0) {
6528 Overflow += ImmOffset;
6529 ImmOffset = 0;
6530 }
Tim Renouf35484c92018-08-21 11:06:05 +00006531 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6532 if (Overflow) {
6533 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6534 if (!N0)
6535 N0 = OverflowVal;
6536 else {
6537 SDValue Ops[] = { N0, OverflowVal };
6538 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6539 }
6540 }
6541 }
6542 if (!N0)
6543 N0 = DAG.getConstant(0, DL, MVT::i32);
6544 if (!C1)
6545 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6546 return {N0, SDValue(C1, 0)};
6547}
6548
Tim Renouf4f703f52018-08-21 11:07:10 +00006549// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6550// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6551// pointed to by Offsets.
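// Constant offsets are split between soffset and instoffset via
// AMDGPU::splitMUBUFOffset where possible; anything that cannot be proven to
// fit is left in the voffset slot.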
6552void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006553 SelectionDAG &DAG, SDValue *Offsets,
6554 unsigned Align) const {
Tim Renouf4f703f52018-08-21 11:07:10 +00006555 SDLoc DL(CombinedOffset);
6556 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6557 uint32_t Imm = C->getZExtValue();
6558 uint32_t SOffset, ImmOffset;
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006559 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006560 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6561 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6562 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6563 return;
6564 }
6565 }
6566 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6567 SDValue N0 = CombinedOffset.getOperand(0);
6568 SDValue N1 = CombinedOffset.getOperand(1);
6569 uint32_t SOffset, ImmOffset;
6570 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
Nicolai Haehnlea7b00052018-11-30 22:55:38 +00006571 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6572 Subtarget, Align)) {
Tim Renouf4f703f52018-08-21 11:07:10 +00006573 Offsets[0] = N0;
6574 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6575 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6576 return;
6577 }
6578 }
6579 Offsets[0] = CombinedOffset;
6580 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6581 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6582}
6583
Ryan Taylor00e063a2019-03-19 16:07:00 +00006584// Handle 8 bit and 16 bit buffer loads
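// The hardware returns a 32-bit value for these, so emit
// BUFFER_LOAD_UBYTE/USHORT and truncate the result back to i8/i16.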
6585SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
6586 EVT LoadVT, SDLoc DL,
6587 ArrayRef<SDValue> Ops,
6588 MemSDNode *M) const {
6589 EVT IntVT = LoadVT.changeTypeToInteger();
6590 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
6591 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
6592
6593 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
6594 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
6595 Ops, IntVT,
6596 M->getMemOperand());
6597 SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
6598 LoadVT.getScalarType(), BufferLoad);
6599 return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
6600}
6601
6602// Handle 8 bit and 16 bit buffer stores
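// The data operand is any-extended to i32 and stored with
// BUFFER_STORE_BYTE/SHORT, which write only the low bits.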
6603SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
6604 EVT VDataType, SDLoc DL,
6605 SDValue Ops[],
6606 MemSDNode *M) const {
6607 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
6608 Ops[1] = BufferStoreExt;
6609 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
6610 AMDGPUISD::BUFFER_STORE_SHORT;
6611 ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
6612 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
6613 M->getMemOperand());
6614}
6615
Matt Arsenault90083d32018-06-07 09:54:49 +00006616static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6617 ISD::LoadExtType ExtType, SDValue Op,
6618 const SDLoc &SL, EVT VT) {
6619 if (VT.bitsLT(Op.getValueType()))
6620 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6621
6622 switch (ExtType) {
6623 case ISD::SEXTLOAD:
6624 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6625 case ISD::ZEXTLOAD:
6626 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6627 case ISD::EXTLOAD:
6628 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6629 case ISD::NON_EXTLOAD:
6630 return Op;
6631 }
6632
6633 llvm_unreachable("invalid ext type");
6634}
6635
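// Widen a uniform, sufficiently aligned sub-dword load from constant (or
// invariant global) memory to a full 32-bit load, then truncate or extend the
// result back to the original type.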
6636SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6637 SelectionDAG &DAG = DCI.DAG;
6638 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6639 return SDValue();
6640
6641 // FIXME: Constant loads should all be marked invariant.
6642 unsigned AS = Ld->getAddressSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00006643 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6644 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Matt Arsenault90083d32018-06-07 09:54:49 +00006645 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6646 return SDValue();
6647
6648 // Don't do this early, since it may interfere with adjacent load merging for
6649 // illegal types. We can avoid losing alignment information for exotic types
6650 // pre-legalize.
6651 EVT MemVT = Ld->getMemoryVT();
6652 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6653 MemVT.getSizeInBits() >= 32)
6654 return SDValue();
6655
6656 SDLoc SL(Ld);
6657
6658 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6659 "unexpected vector extload");
6660
6661 // TODO: Drop only high part of range.
6662 SDValue Ptr = Ld->getBasePtr();
6663 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6664 MVT::i32, SL, Ld->getChain(), Ptr,
6665 Ld->getOffset(),
6666 Ld->getPointerInfo(), MVT::i32,
6667 Ld->getAlignment(),
6668 Ld->getMemOperand()->getFlags(),
6669 Ld->getAAInfo(),
6670 nullptr); // Drop ranges
6671
6672 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6673 if (MemVT.isFloatingPoint()) {
6674 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6675 "unexpected fp extload");
6676 TruncVT = MemVT.changeTypeToInteger();
6677 }
6678
6679 SDValue Cvt = NewLoad;
6680 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6681 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6682 DAG.getValueType(TruncVT));
6683 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6684 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6685 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6686 } else {
6687 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6688 }
6689
6690 EVT VT = Ld->getValueType(0);
6691 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6692
6693 DCI.AddToWorklist(Cvt.getNode());
6694
6695 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6696 // the appropriate extension from the 32-bit load.
6697 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6698 DCI.AddToWorklist(Cvt.getNode());
6699
6700 // Handle conversion back to floating point if necessary.
6701 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6702
6703 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6704}
6705
Tom Stellard81d871d2013-11-13 23:36:50 +00006706SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6707 SDLoc DL(Op);
6708 LoadSDNode *Load = cast<LoadSDNode>(Op);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006709 ISD::LoadExtType ExtType = Load->getExtensionType();
Matt Arsenaulta1436412016-02-10 18:21:45 +00006710 EVT MemVT = Load->getMemoryVT();
Matt Arsenault6dfda962016-02-10 18:21:39 +00006711
Matt Arsenaulta1436412016-02-10 18:21:45 +00006712 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
Matt Arsenault65ca292a2017-09-07 05:37:34 +00006713 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6714 return SDValue();
6715
Matt Arsenault6dfda962016-02-10 18:21:39 +00006716 // FIXME: Copied from PPC
6717 // First, load into 32 bits, then truncate to 1 bit.
6718
6719 SDValue Chain = Load->getChain();
6720 SDValue BasePtr = Load->getBasePtr();
6721 MachineMemOperand *MMO = Load->getMemOperand();
6722
Tom Stellard115a6152016-11-10 16:02:37 +00006723 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6724
Matt Arsenault6dfda962016-02-10 18:21:39 +00006725 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
Tom Stellard115a6152016-11-10 16:02:37 +00006726 BasePtr, RealMemVT, MMO);
Matt Arsenault6dfda962016-02-10 18:21:39 +00006727
Tim Renouf361b5b22019-03-21 12:01:21 +00006728 if (!MemVT.isVector()) {
6729 SDValue Ops[] = {
6730 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6731 NewLD.getValue(1)
6732 };
6733
6734 return DAG.getMergeValues(Ops, DL);
6735 }
6736
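    // Vector case (in practice <N x i1>): element I is bit I of the loaded
    // 32-bit value, so shift it down and truncate it to i1.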
6737 SmallVector<SDValue, 3> Elts;
6738 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
6739 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
6740 DAG.getConstant(I, DL, MVT::i32));
6741
6742 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
6743 }
6744
Matt Arsenault6dfda962016-02-10 18:21:39 +00006745 SDValue Ops[] = {
Tim Renouf361b5b22019-03-21 12:01:21 +00006746 DAG.getBuildVector(MemVT, DL, Elts),
Matt Arsenault6dfda962016-02-10 18:21:39 +00006747 NewLD.getValue(1)
6748 };
6749
6750 return DAG.getMergeValues(Ops, DL);
6751 }
Tom Stellard81d871d2013-11-13 23:36:50 +00006752
Matt Arsenaulta1436412016-02-10 18:21:45 +00006753 if (!MemVT.isVector())
6754 return SDValue();
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006755
Matt Arsenaulta1436412016-02-10 18:21:45 +00006756 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6757 "Custom lowering for non-i32 vectors hasn't been implemented.");
Matt Arsenault4d801cd2015-11-24 12:05:03 +00006758
Farhana Aleen89196642018-03-07 17:09:18 +00006759 unsigned Alignment = Load->getAlignment();
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006760 unsigned AS = Load->getAddressSpace();
6761 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
Farhana Aleen89196642018-03-07 17:09:18 +00006762 AS, Alignment)) {
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006763 SDValue Ops[2];
6764 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6765 return DAG.getMergeValues(Ops, DL);
6766 }
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00006767 if (Subtarget->hasLDSMisalignedBug() &&
6768 AS == AMDGPUAS::FLAT_ADDRESS &&
6769 Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
6770 return SplitVectorLoad(Op, DAG);
6771 }
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006772
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006773 MachineFunction &MF = DAG.getMachineFunction();
6774 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6775 // If there is a possibility that flat instructions access scratch memory
6776 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00006777 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006778 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00006779 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00006780
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006781 unsigned NumElements = MemVT.getVectorNumElements();
Matt Arsenault6c041a32018-03-29 19:59:28 +00006782
Matt Arsenault0da63502018-08-31 05:49:54 +00006783 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6784 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
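    // Uniform, sufficiently aligned constant loads: pow2-sized vectors are
    // already fine as-is, v3 is widened to v4, and other sizes are split.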
Tim Renouf361b5b22019-03-21 12:01:21 +00006785 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
6786 if (MemVT.isPow2VectorType())
6787 return SDValue();
6788 if (NumElements == 3)
6789 return WidenVectorLoad(Op, DAG);
6790 return SplitVectorLoad(Op, DAG);
6791 }
Matt Arsenaulta1436412016-02-10 18:21:45 +00006792 // Non-uniform loads will be selected to MUBUF instructions, so they
Alexander Timofeev18009562016-12-08 17:28:47 +00006793 // have the same legalization requirements as global and private
Matt Arsenaulta1436412016-02-10 18:21:45 +00006794 // loads.
6795 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006796 }
Matt Arsenault6c041a32018-03-29 19:59:28 +00006797
Matt Arsenault0da63502018-08-31 05:49:54 +00006798 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6799 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6800 AS == AMDGPUAS::GLOBAL_ADDRESS) {
Alexander Timofeev2e5eece2018-03-05 15:12:21 +00006801 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Farhana Aleen89196642018-03-07 17:09:18 +00006802 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
Tim Renouf361b5b22019-03-21 12:01:21 +00006803 Alignment >= 4 && NumElements < 32) {
6804 if (MemVT.isPow2VectorType())
6805 return SDValue();
6806 if (NumElements == 3)
6807 return WidenVectorLoad(Op, DAG);
6808 return SplitVectorLoad(Op, DAG);
6809 }
Alexander Timofeev18009562016-12-08 17:28:47 +00006810 // Non-uniform loads will be selected to MUBUF instructions, so they
6811 // have the same legalization requirements as global and private
6812 // loads.
6813 //
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006814 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006815 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6816 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6817 AS == AMDGPUAS::GLOBAL_ADDRESS ||
6818 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006819 if (NumElements > 4)
Matt Arsenaulta1436412016-02-10 18:21:45 +00006820 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00006821 // v3 loads not supported on SI.
6822 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6823 return WidenVectorLoad(Op, DAG);
6824 // v3 and v4 loads are supported for private and global memory.
Matt Arsenaulta1436412016-02-10 18:21:45 +00006825 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006826 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006827 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006828 // Depending on the setting of the private_element_size field in the
6829 // resource descriptor, we can only make private accesses up to a certain
6830 // size.
6831 switch (Subtarget->getMaxPrivateElementSize()) {
6832 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00006833 return scalarizeVectorLoad(Load, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006834 case 8:
6835 if (NumElements > 2)
6836 return SplitVectorLoad(Op, DAG);
6837 return SDValue();
6838 case 16:
6839 // Same as global/flat
6840 if (NumElements > 4)
6841 return SplitVectorLoad(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00006842 // v3 loads not supported on SI.
6843 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6844 return WidenVectorLoad(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00006845 return SDValue();
6846 default:
6847 llvm_unreachable("unsupported private_element_size");
6848 }
Matt Arsenault0da63502018-08-31 05:49:54 +00006849 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleena7cb3112018-03-09 17:41:39 +00006850 // Use ds_read_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00006851 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
Farhana Aleena7cb3112018-03-09 17:41:39 +00006852 MemVT.getStoreSize() == 16)
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00006853 return SDValue();
6854
Farhana Aleena7cb3112018-03-09 17:41:39 +00006855 if (NumElements > 2)
6856 return SplitVectorLoad(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00006857
6858 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6859 // address is negative, then the instruction is incorrectly treated as
6860 // out-of-bounds even if base + offsets is in bounds. Split vectorized
6861 // loads here to avoid emitting ds_read2_b32. We may re-combine the
6862 // load later in the SILoadStoreOptimizer.
6863 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6864 NumElements == 2 && MemVT.getStoreSize() == 8 &&
6865 Load->getAlignment() < 8) {
6866 return SplitVectorLoad(Op, DAG);
6867 }
Tom Stellarde9373602014-01-22 19:24:14 +00006868 }
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00006869 return SDValue();
Tom Stellard81d871d2013-11-13 23:36:50 +00006870}
6871
Tom Stellard0ec134f2014-02-04 17:18:40 +00006872SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006873 EVT VT = Op.getValueType();
6874 assert(VT.getSizeInBits() == 64);
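  // Lower the 64-bit select as two 32-bit selects over the low and high
  // halves, then recombine the halves with a build_vector and bitcast.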
Tom Stellard0ec134f2014-02-04 17:18:40 +00006875
6876 SDLoc DL(Op);
6877 SDValue Cond = Op.getOperand(0);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006878
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00006879 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6880 SDValue One = DAG.getConstant(1, DL, MVT::i32);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006881
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006882 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6883 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6884
6885 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6886 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006887
6888 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6889
Tom Stellard7ea3d6d2014-03-31 14:01:55 +00006890 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6891 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006892
6893 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6894
Ahmed Bougacha128f8732016-04-26 21:15:30 +00006895 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
Matt Arsenault02dc7e12018-06-15 15:15:46 +00006896 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
Tom Stellard0ec134f2014-02-04 17:18:40 +00006897}
6898
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006899// Catch division cases where we can use shortcuts with rcp and rsq
6900// instructions.
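// Bails out for f32 when denormals are enabled and unsafe math is off;
// otherwise 1.0/x, -1.0/x and 1.0/sqrt(x) map to rcp/rsq, and under unsafe
// math a general x/y becomes x * rcp(y).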
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00006901SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6902 SelectionDAG &DAG) const {
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006903 SDLoc SL(Op);
6904 SDValue LHS = Op.getOperand(0);
6905 SDValue RHS = Op.getOperand(1);
6906 EVT VT = Op.getValueType();
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006907 const SDNodeFlags Flags = Op->getFlags();
Michael Berg7acc81b2018-05-04 18:48:20 +00006908 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006909
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006910 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6911 return SDValue();
6912
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006913 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
Konstantin Zhuravlyovc4b18e72017-04-21 19:25:33 +00006914 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
Matt Arsenault979902b2016-08-02 22:25:04 +00006915 if (CLHS->isExactlyValue(1.0)) {
6916 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6917 // the CI documentation they have a worst case error of 1 ulp.
6918 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6919 // use it as long as we aren't trying to use denormals.
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006920 //
6921 // v_rcp_f16 and v_rsq_f16 DO support denormals.
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006922
Matt Arsenault979902b2016-08-02 22:25:04 +00006923 // 1.0 / sqrt(x) -> rsq(x)
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006924
Matt Arsenault979902b2016-08-02 22:25:04 +00006925 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6926 // error seems really high at 2^29 ULP.
6927 if (RHS.getOpcode() == ISD::FSQRT)
6928 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6929
6930 // 1.0 / x -> rcp(x)
6931 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6932 }
6933
6934 // Same as for 1.0, but expand the sign out of the constant.
6935 if (CLHS->isExactlyValue(-1.0)) {
6936 // -1.0 / x -> rcp (fneg x)
6937 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6939 }
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006940 }
6941 }
6942
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006943 if (Unsafe) {
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006944 // Turn into multiply by the reciprocal.
6945 // x / y -> x * (1.0 / y)
6946 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Stanislav Mekhanoshin9d7b1c92017-07-06 20:34:21 +00006947 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
Matt Arsenault22ca3f82014-07-15 23:50:10 +00006948 }
6949
6950 return SDValue();
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00006951}
6952
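// Helpers that emit an FMUL/FMA either as a plain node or, when the incoming
// value carries a chain and glue, as the FMUL_W_CHAIN/FMA_W_CHAIN variants so
// the operation stays ordered with the denormal-mode setreg nodes below.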
Tom Stellard8485fa02016-12-07 02:42:15 +00006953static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6954 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6955 if (GlueChain->getNumValues() <= 1) {
6956 return DAG.getNode(Opcode, SL, VT, A, B);
6957 }
6958
6959 assert(GlueChain->getNumValues() == 3);
6960
6961 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6962 switch (Opcode) {
6963 default: llvm_unreachable("no chain equivalent for opcode");
6964 case ISD::FMUL:
6965 Opcode = AMDGPUISD::FMUL_W_CHAIN;
6966 break;
6967 }
6968
6969 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6970 GlueChain.getValue(2));
6971}
6972
6973static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6974 EVT VT, SDValue A, SDValue B, SDValue C,
6975 SDValue GlueChain) {
6976 if (GlueChain->getNumValues() <= 1) {
6977 return DAG.getNode(Opcode, SL, VT, A, B, C);
6978 }
6979
6980 assert(GlueChain->getNumValues() == 3);
6981
6982 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6983 switch (Opcode) {
6984 default: llvm_unreachable("no chain equivalent for opcode");
6985 case ISD::FMA:
6986 Opcode = AMDGPUISD::FMA_W_CHAIN;
6987 break;
6988 }
6989
6990 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6991 GlueChain.getValue(2));
6992}
6993
Matt Arsenault4052a572016-12-22 03:05:41 +00006994SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaultcdff21b2016-12-22 03:05:44 +00006995 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6996 return FastLowered;
6997
Matt Arsenault4052a572016-12-22 03:05:41 +00006998 SDLoc SL(Op);
6999 SDValue Src0 = Op.getOperand(0);
7000 SDValue Src1 = Op.getOperand(1);
7001
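  // f16 fdiv is lowered through f32: extend the operands, form x * rcp(y),
  // round back to f16, and let div_fixup handle the special cases.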
7002 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7003 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7004
7005 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
7006 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
7007
7008 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
7009 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
7010
7011 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
7012}
7013
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007014// Faster 2.5 ULP division that does not support denormals.
7015SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
7016 SDLoc SL(Op);
7017 SDValue LHS = Op.getOperand(1);
7018 SDValue RHS = Op.getOperand(2);
7019
7020 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
7021
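  // K0 is 2^96 and K1 is 2^-32 as float bit patterns: if |RHS| is larger than
  // 2^96, pre-scale it by 2^-32 so rcp stays in range, and multiply the result
  // by the same factor afterwards to compensate.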
7022 const APFloat K0Val(BitsToFloat(0x6f800000));
7023 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
7024
7025 const APFloat K1Val(BitsToFloat(0x2f800000));
7026 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
7027
7028 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
7029
7030 EVT SetCCVT =
7031 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
7032
7033 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
7034
7035 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
7036
7037 // TODO: Should this propagate fast-math-flags?
7038 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
7039
7040 // rcp does not support denormals.
7041 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
7042
7043 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
7044
7045 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
7046}
7047
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007048SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007049 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
Eric Christopher538d09d02016-06-07 20:27:12 +00007050 return FastLowered;
Matt Arsenault22ca3f82014-07-15 23:50:10 +00007051
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007052 SDLoc SL(Op);
7053 SDValue LHS = Op.getOperand(0);
7054 SDValue RHS = Op.getOperand(1);
7055
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007056 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007057
Wei Dinged0f97f2016-06-09 19:17:15 +00007058 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007059
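  // div_scale pre-scales the numerator and denominator so the Newton-Raphson
  // iteration below avoids denormal and overflow intermediates; the scale flag
  // it produces is consumed by div_fmas, and div_fixup forms the final result.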
Tom Stellard8485fa02016-12-07 02:42:15 +00007060 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7061 RHS, RHS, LHS);
7062 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7063 LHS, RHS, LHS);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007064
Matt Arsenaultdfec5ce2016-07-09 07:48:11 +00007065 // Denominator is scaled to not be denormal, so using rcp is ok.
Tom Stellard8485fa02016-12-07 02:42:15 +00007066 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
7067 DenominatorScaled);
7068 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
7069 DenominatorScaled);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007070
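  // Encodes hwreg(HW_REG_MODE, 4, 2), i.e. the two FP32 denormal-control bits
  // of the MODE register, for the setreg nodes that toggle denormals below.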
Tom Stellard8485fa02016-12-07 02:42:15 +00007071 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
7072 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
7073 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007074
Tom Stellard8485fa02016-12-07 02:42:15 +00007075 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007076
Tom Stellard8485fa02016-12-07 02:42:15 +00007077 if (!Subtarget->hasFP32Denormals()) {
7078 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
7079 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
7080 SL, MVT::i32);
7081 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
7082 DAG.getEntryNode(),
7083 EnableDenormValue, BitField);
7084 SDValue Ops[3] = {
7085 NegDivScale0,
7086 EnableDenorm.getValue(0),
7087 EnableDenorm.getValue(1)
7088 };
Matt Arsenault37fefd62016-06-10 02:18:02 +00007089
Tom Stellard8485fa02016-12-07 02:42:15 +00007090 NegDivScale0 = DAG.getMergeValues(Ops, SL);
7091 }
7092
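  // One Newton-Raphson step refines the reciprocal (Fma0, Fma1); Mul is the
  // initial quotient, and Fma2..Fma4 refine it and compute the residual that
  // div_fmas consumes below.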
7093 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
7094 ApproxRcp, One, NegDivScale0);
7095
7096 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
7097 ApproxRcp, Fma0);
7098
7099 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
7100 Fma1, Fma1);
7101
7102 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
7103 NumeratorScaled, Mul);
7104
7105 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
7106
7107 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
7108 NumeratorScaled, Fma3);
7109
7110 if (!Subtarget->hasFP32Denormals()) {
7111 const SDValue DisableDenormValue =
7112 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
7113 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
7114 Fma4.getValue(1),
7115 DisableDenormValue,
7116 BitField,
7117 Fma4.getValue(2));
7118
7119 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
7120 DisableDenorm, DAG.getRoot());
7121 DAG.setRoot(OutputChain);
7122 }
Matt Arsenault37fefd62016-06-10 02:18:02 +00007123
Wei Dinged0f97f2016-06-09 19:17:15 +00007124 SDValue Scale = NumeratorScaled.getValue(1);
Tom Stellard8485fa02016-12-07 02:42:15 +00007125 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
7126 Fma4, Fma1, Fma3, Scale);
Matt Arsenault37fefd62016-06-10 02:18:02 +00007127
Wei Dinged0f97f2016-06-09 19:17:15 +00007128 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007129}
7130
7131SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007132 if (DAG.getTarget().Options.UnsafeFPMath)
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +00007133 return lowerFastUnsafeFDIV(Op, DAG);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007134
7135 SDLoc SL(Op);
7136 SDValue X = Op.getOperand(0);
7137 SDValue Y = Op.getOperand(1);
7138
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007139 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007140
7141 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
7142
7143 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
7144
7145 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
7146
7147 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
7148
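  // Two Newton-Raphson steps refine the f64 reciprocal (Fma0..Fma3); Mul is
  // the scaled quotient and Fma4 its residual, both consumed by div_fmas and
  // div_fixup below.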
7149 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
7150
7151 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
7152
7153 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
7154
7155 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
7156
7157 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
7158 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
7159
7160 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
7161 NegDivScale0, Mul, DivScale1);
7162
7163 SDValue Scale;
7164
Tom Stellard5bfbae52018-07-11 20:59:01 +00007165 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007166 // Work around a hardware bug on SI where the condition output from div_scale
7167 // is not usable.
7168
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007169 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
Matt Arsenault0bbcd8b2015-02-14 04:30:08 +00007170
7171 // Figure out which scale to use for div_fmas.
7172 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
7173 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
7174 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
7175 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
7176
7177 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
7178 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
7179
7180 SDValue Scale0Hi
7181 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
7182 SDValue Scale1Hi
7183 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
7184
7185 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
7186 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
7187 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
7188 } else {
7189 Scale = DivScale1.getValue(1);
7190 }
7191
7192 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
7193 Fma4, Fma3, Mul, Scale);
7194
7195 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007196}
7197
7198SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
7199 EVT VT = Op.getValueType();
7200
7201 if (VT == MVT::f32)
7202 return LowerFDIV32(Op, DAG);
7203
7204 if (VT == MVT::f64)
7205 return LowerFDIV64(Op, DAG);
7206
Matt Arsenault4052a572016-12-22 03:05:41 +00007207 if (VT == MVT::f16)
7208 return LowerFDIV16(Op, DAG);
7209
Matt Arsenaulte9fa3b82014-07-15 20:18:31 +00007210 llvm_unreachable("Unexpected type for fdiv");
7211}
7212
Tom Stellard81d871d2013-11-13 23:36:50 +00007213SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
7214 SDLoc DL(Op);
7215 StoreSDNode *Store = cast<StoreSDNode>(Op);
7216 EVT VT = Store->getMemoryVT();
7217
Matt Arsenault95245662016-02-11 05:32:46 +00007218 if (VT == MVT::i1) {
7219 return DAG.getTruncStore(Store->getChain(), DL,
7220 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
7221 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Tom Stellardb02094e2014-07-21 15:45:01 +00007222 }
7223
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007224 assert(VT.isVector() &&
7225 Store->getValue().getValueType().getScalarType() == MVT::i32);
7226
7227 unsigned AS = Store->getAddressSpace();
7228 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
7229 AS, Store->getAlignment())) {
7230 return expandUnalignedStore(Store, DAG);
7231 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007232
Stanislav Mekhanoshina224f682019-05-01 16:11:11 +00007233 if (Subtarget->hasLDSMisalignedBug() &&
7234 AS == AMDGPUAS::FLAT_ADDRESS &&
7235 Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
7236 return SplitVectorStore(Op, DAG);
7237 }
7238
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007239 MachineFunction &MF = DAG.getMachineFunction();
7240 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7241 // If there is a possibility that flat instructions access scratch memory
7242 // then we need to use the same legalization rules we use for private.
Matt Arsenault0da63502018-08-31 05:49:54 +00007243 if (AS == AMDGPUAS::FLAT_ADDRESS)
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007244 AS = MFI->hasFlatScratchInit() ?
Matt Arsenault0da63502018-08-31 05:49:54 +00007245 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
Tom Stellardf8e6eaf2016-10-26 14:38:47 +00007246
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007247 unsigned NumElements = VT.getVectorNumElements();
Matt Arsenault0da63502018-08-31 05:49:54 +00007248 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
7249 AS == AMDGPUAS::FLAT_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007250 if (NumElements > 4)
7251 return SplitVectorStore(Op, DAG);
Tim Renouf361b5b22019-03-21 12:01:21 +00007252 // v3 stores not supported on SI.
7253 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7254 return SplitVectorStore(Op, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007255 return SDValue();
Matt Arsenault0da63502018-08-31 05:49:54 +00007256 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007257 switch (Subtarget->getMaxPrivateElementSize()) {
7258 case 4:
Matt Arsenault9c499c32016-04-14 23:31:26 +00007259 return scalarizeVectorStore(Store, DAG);
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007260 case 8:
7261 if (NumElements > 2)
7262 return SplitVectorStore(Op, DAG);
7263 return SDValue();
7264 case 16:
Tim Renouf361b5b22019-03-21 12:01:21 +00007265 if (NumElements > 4 || NumElements == 3)
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007266 return SplitVectorStore(Op, DAG);
7267 return SDValue();
7268 default:
7269 llvm_unreachable("unsupported private_element_size");
7270 }
Matt Arsenault0da63502018-08-31 05:49:54 +00007271 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007272 // Use ds_write_b128 if possible.
Marek Olsaka9a58fa2018-04-10 22:48:23 +00007273 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
Tim Renouf361b5b22019-03-21 12:01:21 +00007274 VT.getStoreSize() == 16 && NumElements != 3)
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007275 return SDValue();
7276
Matt Arsenaultbcdfee72016-05-02 20:13:51 +00007277 if (NumElements > 2)
7278 return SplitVectorStore(Op, DAG);
Nicolai Haehnle48219372018-10-17 15:37:48 +00007279
7280 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7281 // address is negative, then the instruction is incorrectly treated as
7282 // out-of-bounds even if base + offsets is in bounds. Split vectorized
7283 // stores here to avoid emitting ds_write2_b32. We may re-combine the
7284 // store later in the SILoadStoreOptimizer.
7285 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
7286 NumElements == 2 && VT.getStoreSize() == 8 &&
7287 Store->getAlignment() < 8) {
7288 return SplitVectorStore(Op, DAG);
7289 }
7290
Farhana Aleenc6c9dc82018-03-16 18:12:00 +00007291 return SDValue();
Yaxun Liu1a14bfa2017-03-27 14:04:01 +00007292 } else {
Matt Arsenaultf2ddbf02016-02-13 04:18:53 +00007293 llvm_unreachable("unhandled address space");
Matt Arsenault95245662016-02-11 05:32:46 +00007294 }
Tom Stellard81d871d2013-11-13 23:36:50 +00007295}
7296
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007297SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007298 SDLoc DL(Op);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007299 EVT VT = Op.getValueType();
7300 SDValue Arg = Op.getOperand(0);
David Stuttard20de3e92018-09-14 10:27:19 +00007301 SDValue TrigVal;
7302
Sanjay Patela2607012015-09-16 16:31:21 +00007303 // TODO: Should this propagate fast-math-flags?
David Stuttard20de3e92018-09-14 10:27:19 +00007304
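  // The hardware sin/cos expect an argument pre-scaled by 1/(2*pi); subtargets
  // with a reduced valid input range also need the scaled argument wrapped into
  // [0, 1) with fract first.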
7305 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7306
7307 if (Subtarget->hasTrigReducedRange()) {
7308 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7309 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7310 } else {
7311 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7312 }
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007313
7314 switch (Op.getOpcode()) {
7315 case ISD::FCOS:
David Stuttard20de3e92018-09-14 10:27:19 +00007316 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007317 case ISD::FSIN:
David Stuttard20de3e92018-09-14 10:27:19 +00007318 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
Matt Arsenaultad14ce82014-07-19 18:44:39 +00007319 default:
7320 llvm_unreachable("Wrong trig opcode");
7321 }
7322}
7323
Tom Stellard354a43c2016-04-01 18:27:37 +00007324SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7325 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7326 assert(AtomicNode->isCompareAndSwap());
7327 unsigned AS = AtomicNode->getAddressSpace();
7328
7329 // No custom lowering required for local address space
Matt Arsenault0da63502018-08-31 05:49:54 +00007330 if (!isFlatGlobalAddrSpace(AS))
Tom Stellard354a43c2016-04-01 18:27:37 +00007331 return Op;
7332
7333 // Non-local address space requires custom lowering for atomic compare
7334 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
7335 SDLoc DL(Op);
7336 SDValue ChainIn = Op.getOperand(0);
7337 SDValue Addr = Op.getOperand(1);
7338 SDValue Old = Op.getOperand(2);
7339 SDValue New = Op.getOperand(3);
7340 EVT VT = Op.getValueType();
7341 MVT SimpleVT = VT.getSimpleVT();
7342 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7343
Ahmed Bougacha128f8732016-04-26 21:15:30 +00007344 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
Tom Stellard354a43c2016-04-01 18:27:37 +00007345 SDValue Ops[] = { ChainIn, Addr, NewOld };
Matt Arsenault88701812016-06-09 23:42:48 +00007346
7347 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7348 Ops, VT, AtomicNode->getMemOperand());
Tom Stellard354a43c2016-04-01 18:27:37 +00007349}
7350
Tom Stellard75aadc22012-12-11 21:25:42 +00007351//===----------------------------------------------------------------------===//
7352// Custom DAG optimizations
7353//===----------------------------------------------------------------------===//
7354
Matt Arsenault364a6742014-06-11 17:50:44 +00007355SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
Matt Arsenaulte6986632015-01-14 01:35:22 +00007356 DAGCombinerInfo &DCI) const {
Matt Arsenault364a6742014-06-11 17:50:44 +00007357 EVT VT = N->getValueType(0);
7358 EVT ScalarVT = VT.getScalarType();
7359 if (ScalarVT != MVT::f32)
7360 return SDValue();
7361
7362 SelectionDAG &DAG = DCI.DAG;
7363 SDLoc DL(N);
7364
7365 SDValue Src = N->getOperand(0);
7366 EVT SrcVT = Src.getValueType();
7367
7368 // TODO: We could try to match extracting the higher bytes, which would be
7369 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7370 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7371 // about in practice.
Craig Topper80d3bb32018-03-06 19:44:52 +00007372 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
Matt Arsenault364a6742014-06-11 17:50:44 +00007373 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7374 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7375 DCI.AddToWorklist(Cvt.getNode());
7376 return Cvt;
7377 }
7378 }
7379
Matt Arsenault364a6742014-06-11 17:50:44 +00007380 return SDValue();
7381}
7382
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007383// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7384
7385// This is a variant of
7386// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7387//
7388// The normal DAG combiner will do this, but only if the add has one use since
7389// that would increase the number of instructions.
7390//
7391// This prevents us from seeing a constant offset that can be folded into a
7392// memory instruction's addressing mode. If we know the resulting add offset of
7393// a pointer can be folded into an addressing offset, we can replace the pointer
7394// operand with the add of new constant offset. This eliminates one of the uses,
7395// and may allow the remaining use to also be simplified.
7396//
7397SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7398 unsigned AddrSpace,
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007399 EVT MemVT,
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007400 DAGCombinerInfo &DCI) const {
7401 SDValue N0 = N->getOperand(0);
7402 SDValue N1 = N->getOperand(1);
7403
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007404 // We only do this to handle cases where it's profitable when there are
7405 // multiple uses of the add, so defer to the standard combine.
Matt Arsenaultc8903122017-11-14 23:46:42 +00007406 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7407 N0->hasOneUse())
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007408 return SDValue();
7409
7410 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7411 if (!CN1)
7412 return SDValue();
7413
7414 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7415 if (!CAdd)
7416 return SDValue();
7417
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007418 // If the resulting offset is too large, we can't fold it into the addressing
7419 // mode offset.
7420 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007421 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7422
7423 AddrMode AM;
7424 AM.HasBaseReg = true;
7425 AM.BaseOffs = Offset.getSExtValue();
7426 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007427 return SDValue();
7428
7429 SelectionDAG &DAG = DCI.DAG;
7430 SDLoc SL(N);
7431 EVT VT = N->getValueType(0);
7432
7433 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007434 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007435
Matt Arsenaulte5e0c742017-11-13 05:33:35 +00007436 SDNodeFlags Flags;
7437 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7438 (N0.getOpcode() == ISD::OR ||
7439 N0->getFlags().hasNoUnsignedWrap()));
7440
7441 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00007442}
7443
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007444SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7445 DAGCombinerInfo &DCI) const {
7446 SDValue Ptr = N->getBasePtr();
7447 SelectionDAG &DAG = DCI.DAG;
7448 SDLoc SL(N);
7449
7450 // TODO: We could also do this for multiplies.
Matt Arsenaultfbe95332017-11-13 05:11:54 +00007451 if (Ptr.getOpcode() == ISD::SHL) {
7452 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7453 N->getMemoryVT(), DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00007454 if (NewPtr) {
7455 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7456
7457 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7458 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7459 }
7460 }
7461
7462 return SDValue();
7463}
7464
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007465static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7466 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7467 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7468 (Opc == ISD::XOR && Val == 0);
7469}
7470
7471// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
7472// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7473// integer combine opportunities since most 64-bit operations are decomposed
7474// this way. TODO: We won't want this for SALU especially if it is an inline
7475// immediate.
7476SDValue SITargetLowering::splitBinaryBitConstantOp(
7477 DAGCombinerInfo &DCI,
7478 const SDLoc &SL,
7479 unsigned Opc, SDValue LHS,
7480 const ConstantSDNode *CRHS) const {
7481 uint64_t Val = CRHS->getZExtValue();
7482 uint32_t ValLo = Lo_32(Val);
7483 uint32_t ValHi = Hi_32(Val);
7484 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7485
7486 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7487 bitOpWithConstantIsReducible(Opc, ValHi)) ||
7488 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7489 // If we need to materialize a 64-bit immediate, it will be split up later
7490 // anyway. Avoid creating the harder to understand 64-bit immediate
7491 // materialization.
7492 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7493 }
7494
7495 return SDValue();
7496}
7497
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007498// Returns true if the argument is a boolean value which is not serialized into
7499// memory or an argument and does not require v_cndmask_b32 to be deserialized.
7500static bool isBoolSGPR(SDValue V) {
7501 if (V.getValueType() != MVT::i1)
7502 return false;
7503 switch (V.getOpcode()) {
7504 default: break;
7505 case ISD::SETCC:
7506 case ISD::AND:
7507 case ISD::OR:
7508 case ISD::XOR:
7509 case AMDGPUISD::FP_CLASS:
7510 return true;
7511 }
7512 return false;
7513}
7514
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007515// If a constant has all zeroes or all ones within each byte return it.
7516// Otherwise return 0.
7517static uint32_t getConstantPermuteMask(uint32_t C) {
7518 // 0xff for any zero byte in the mask
7519 uint32_t ZeroByteMask = 0;
7520 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7521 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7522 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7523 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7524 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7525 if ((NonZeroByteMask & C) != NonZeroByteMask)
7526 return 0; // Partial bytes selected.
7527 return C;
7528}
7529
7530// Check if a node selects whole bytes from its operand 0 starting at a byte
7531// boundary while masking the rest. Returns select mask as in the v_perm_b32
7532// or -1 if not succeeded.
7533// Note byte select encoding:
7534// value 0-3 selects corresponding source byte;
7535// value 0xc selects zero;
7536// value 0xff selects 0xff.
7537static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7538 assert(V.getValueSizeInBits() == 32);
7539
7540 if (V.getNumOperands() != 2)
7541 return ~0;
7542
7543 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7544 if (!N1)
7545 return ~0;
7546
7547 uint32_t C = N1->getZExtValue();
7548
7549 switch (V.getOpcode()) {
7550 default:
7551 break;
7552 case ISD::AND:
7553 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7554 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7555 }
7556 break;
7557
7558 case ISD::OR:
7559 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7560 return (0x03020100 & ~ConstMask) | ConstMask;
7561 }
7562 break;
7563
7564 case ISD::SHL:
7565 if (C % 8)
7566 return ~0;
7567
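    // Shift the 64-bit selector pattern (identity selectors in the high dword,
    // 0x0c zero selectors in the low dword) and keep the high half: bytes that
    // are shifted in select zero. The SRL case below is the mirror image.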
7568 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7569
7570 case ISD::SRL:
7571 if (C % 8)
7572 return ~0;
7573
7574 return uint32_t(0x0c0c0c0c03020100ull >> C);
7575 }
7576
7577 return ~0;
7578}
7579
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007580SDValue SITargetLowering::performAndCombine(SDNode *N,
7581 DAGCombinerInfo &DCI) const {
7582 if (DCI.isBeforeLegalize())
7583 return SDValue();
7584
7585 SelectionDAG &DAG = DCI.DAG;
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007586 EVT VT = N->getValueType(0);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007587 SDValue LHS = N->getOperand(0);
7588 SDValue RHS = N->getOperand(1);
7589
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007590
Stanislav Mekhanoshin53a21292017-05-23 19:54:48 +00007591 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7592 if (VT == MVT::i64 && CRHS) {
7593 if (SDValue Split
7594 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7595 return Split;
7596 }
7597
7598 if (CRHS && VT == MVT::i32) {
7599 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7600 // nb = number of trailing zeroes in mask
7601 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7602 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
7603 uint64_t Mask = CRHS->getZExtValue();
7604 unsigned Bits = countPopulation(Mask);
7605 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7606 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7607 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7608 unsigned Shift = CShift->getZExtValue();
7609 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7610 unsigned Offset = NB + Shift;
7611 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7612 SDLoc SL(N);
7613 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7614 LHS->getOperand(0),
7615 DAG.getConstant(Offset, SL, MVT::i32),
7616 DAG.getConstant(Bits, SL, MVT::i32));
7617 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7618 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7619 DAG.getValueType(NarrowVT));
7620 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7621 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7622 return Shl;
7623 }
7624 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007625 }
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007626
7627 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7628 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7629 isa<ConstantSDNode>(LHS.getOperand(2))) {
7630 uint32_t Sel = getConstantPermuteMask(Mask);
7631 if (!Sel)
7632 return SDValue();
7633
7634 // Select 0xc for all zero bytes
7635 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7636 SDLoc DL(N);
7637 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7638 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7639 }
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007640 }
7641
7642 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7643 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7644 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007645 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7646 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7647
7648 SDValue X = LHS.getOperand(0);
7649 SDValue Y = RHS.getOperand(0);
7650 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7651 return SDValue();
7652
7653 if (LCC == ISD::SETO) {
7654 if (X != LHS.getOperand(1))
7655 return SDValue();
7656
7657 if (RCC == ISD::SETUNE) {
7658 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7659 if (!C1 || !C1->isInfinity() || C1->isNegative())
7660 return SDValue();
7661
7662 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7663 SIInstrFlags::N_SUBNORMAL |
7664 SIInstrFlags::N_ZERO |
7665 SIInstrFlags::P_ZERO |
7666 SIInstrFlags::P_SUBNORMAL |
7667 SIInstrFlags::P_NORMAL;
7668
7669 static_assert(((~(SIInstrFlags::S_NAN |
7670 SIInstrFlags::Q_NAN |
7671 SIInstrFlags::N_INFINITY |
7672 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7673 "mask not equal");
7674
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00007675 SDLoc DL(N);
7676 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7677 X, DAG.getConstant(Mask, DL, MVT::i32));
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007678 }
7679 }
7680 }
7681
Matt Arsenault3dcf4ce2018-08-10 18:58:56 +00007682 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7683 std::swap(LHS, RHS);
7684
7685 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7686 RHS.hasOneUse()) {
7687 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7688 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7689 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7690 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7691 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7692 (RHS.getOperand(0) == LHS.getOperand(0) &&
7693 LHS.getOperand(0) == LHS.getOperand(1))) {
7694 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7695 unsigned NewMask = LCC == ISD::SETO ?
7696 Mask->getZExtValue() & ~OrdMask :
7697 Mask->getZExtValue() & OrdMask;
7698
7699 SDLoc DL(N);
7700 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7701 DAG.getConstant(NewMask, DL, MVT::i32));
7702 }
7703 }
7704
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00007705 if (VT == MVT::i32 &&
7706 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7707 // and x, (sext cc from i1) => select cc, x, 0
7708 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7709 std::swap(LHS, RHS);
7710 if (isBoolSGPR(RHS.getOperand(0)))
7711 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7712 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7713 }
7714
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007715 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7716 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7717 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7718 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7719 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7720 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7721 if (LHSMask != ~0u && RHSMask != ~0u) {
7722 // Canonicalize the expression in an attempt to have fewer unique masks
7723 // and therefore fewer registers used to hold the masks.
7724 if (LHSMask > RHSMask) {
7725 std::swap(LHSMask, RHSMask);
7726 std::swap(LHS, RHS);
7727 }
7728
7729 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7730 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7731 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7732 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7733
7734 // Check if we need to combine values from two sources within a byte.
7735 if (!(LHSUsedLanes & RHSUsedLanes) &&
7736 // If we select the high and low words, keep it for SDWA.
7737 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7738 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7739 // Each byte in each mask is either selector mask 0-3, or has higher
7740 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
7741 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
7742 // mask which is not 0xff wins. By anding both masks we have a correct
7743 // result except that 0x0c shall be corrected to give 0x0c only.
7744 uint32_t Mask = LHSMask & RHSMask;
7745 for (unsigned I = 0; I < 32; I += 8) {
7746 uint32_t ByteSel = 0xff << I;
7747 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7748 Mask &= (0x0c << I) & 0xffffffff;
7749 }
7750
7751 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7752 // or 0x0c.
7753 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7754 SDLoc DL(N);
7755
7756 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7757 LHS.getOperand(0), RHS.getOperand(0),
7758 DAG.getConstant(Sel, DL, MVT::i32));
7759 }
7760 }
7761 }
7762
Matt Arsenaultd0101a22015-01-06 23:00:46 +00007763 return SDValue();
7764}
7765
Matt Arsenaultf2290332015-01-06 23:00:39 +00007766SDValue SITargetLowering::performOrCombine(SDNode *N,
7767 DAGCombinerInfo &DCI) const {
7768 SelectionDAG &DAG = DCI.DAG;
7769 SDValue LHS = N->getOperand(0);
7770 SDValue RHS = N->getOperand(1);
7771
Matt Arsenault3b082382016-04-12 18:24:38 +00007772 EVT VT = N->getValueType(0);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007773 if (VT == MVT::i1) {
7774 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7775 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7776 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7777 SDValue Src = LHS.getOperand(0);
7778 if (Src != RHS.getOperand(0))
7779 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007780
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007781 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7782 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7783 if (!CLHS || !CRHS)
7784 return SDValue();
Matt Arsenault3b082382016-04-12 18:24:38 +00007785
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007786 // Only 10 bits are used.
7787 static const uint32_t MaxMask = 0x3ff;
Matt Arsenault3b082382016-04-12 18:24:38 +00007788
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007789 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7790 SDLoc DL(N);
7791 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7792 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7793 }
Matt Arsenault3b082382016-04-12 18:24:38 +00007794
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007795 return SDValue();
7796 }
7797
Stanislav Mekhanoshin8fd3c4e2018-06-12 23:50:37 +00007798 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7799 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7800 LHS.getOpcode() == AMDGPUISD::PERM &&
7801 isa<ConstantSDNode>(LHS.getOperand(2))) {
7802 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7803 if (!Sel)
7804 return SDValue();
7805
7806 Sel |= LHS.getConstantOperandVal(2);
7807 SDLoc DL(N);
7808 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7809 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7810 }
7811
7812 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7813 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7814 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7815 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7816 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7817 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7818 if (LHSMask != ~0u && RHSMask != ~0u) {
7819 // Canonicalize the expression in an attempt to have fewer unique masks
7820 // and therefore fewer registers used to hold the masks.
7821 if (LHSMask > RHSMask) {
7822 std::swap(LHSMask, RHSMask);
7823 std::swap(LHS, RHS);
7824 }
7825
7826 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7827 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7828 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7829 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7830
7831 // Check if we need to combine values from two sources within a byte.
7832 if (!(LHSUsedLanes & RHSUsedLanes) &&
7833 // If we select the high and low words, keep it for SDWA.
7834 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7835 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7836 // Kill zero bytes selected by other mask. Zero value is 0xc.
7837 LHSMask &= ~RHSUsedLanes;
7838 RHSMask &= ~LHSUsedLanes;
7839 // Add 4 to each active LHS lane
7840 LHSMask |= LHSUsedLanes & 0x04040404;
7841 // Combine masks
7842 uint32_t Sel = LHSMask | RHSMask;
7843 SDLoc DL(N);
7844
7845 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7846 LHS.getOperand(0), RHS.getOperand(0),
7847 DAG.getConstant(Sel, DL, MVT::i32));
7848 }
7849 }
7850 }
7851
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007852 if (VT != MVT::i64)
7853 return SDValue();
7854
7855 // TODO: This could be a generic combine with a predicate for extracting the
7856 // high half of an integer being free.
7857
7858 // (or i64:x, (zero_extend i32:y)) ->
7859 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7860 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7861 RHS.getOpcode() != ISD::ZERO_EXTEND)
7862 std::swap(LHS, RHS);
7863
7864 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7865 SDValue ExtSrc = RHS.getOperand(0);
7866 EVT SrcVT = ExtSrc.getValueType();
7867 if (SrcVT == MVT::i32) {
7868 SDLoc SL(N);
7869 SDValue LowLHS, HiBits;
7870 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7871 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7872
7873 DCI.AddToWorklist(LowOr.getNode());
7874 DCI.AddToWorklist(HiBits.getNode());
7875
7876 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7877 LowOr, HiBits);
7878 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
Matt Arsenault3b082382016-04-12 18:24:38 +00007879 }
7880 }
7881
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007882 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7883 if (CRHS) {
7884 if (SDValue Split
7885 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7886 return Split;
7887 }
Matt Arsenaultf2290332015-01-06 23:00:39 +00007888
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007889 return SDValue();
7890}
Matt Arsenaultf2290332015-01-06 23:00:39 +00007891
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007892SDValue SITargetLowering::performXorCombine(SDNode *N,
7893 DAGCombinerInfo &DCI) const {
7894 EVT VT = N->getValueType(0);
7895 if (VT != MVT::i64)
7896 return SDValue();
Matt Arsenaultf2290332015-01-06 23:00:39 +00007897
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00007898 SDValue LHS = N->getOperand(0);
7899 SDValue RHS = N->getOperand(1);
7900
7901 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7902 if (CRHS) {
7903 if (SDValue Split
7904 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7905 return Split;
Matt Arsenaultf2290332015-01-06 23:00:39 +00007906 }
7907
7908 return SDValue();
7909}
7910
Matt Arsenault5cf42712017-04-06 20:58:30 +00007911// Instructions that will be lowered with a final instruction that zeros the
7912// high result bits.
7913// XXX - probably only need to list legal operations.
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007914static bool fp16SrcZerosHighBits(unsigned Opc) {
7915 switch (Opc) {
Matt Arsenault5cf42712017-04-06 20:58:30 +00007916 case ISD::FADD:
7917 case ISD::FSUB:
7918 case ISD::FMUL:
7919 case ISD::FDIV:
7920 case ISD::FREM:
7921 case ISD::FMA:
7922 case ISD::FMAD:
7923 case ISD::FCANONICALIZE:
7924 case ISD::FP_ROUND:
7925 case ISD::UINT_TO_FP:
7926 case ISD::SINT_TO_FP:
7927 case ISD::FABS:
7928 // Fabs is lowered to a bit operation, but it's an and which will clear the
7929 // high bits anyway.
7930 case ISD::FSQRT:
7931 case ISD::FSIN:
7932 case ISD::FCOS:
7933 case ISD::FPOWI:
7934 case ISD::FPOW:
7935 case ISD::FLOG:
7936 case ISD::FLOG2:
7937 case ISD::FLOG10:
7938 case ISD::FEXP:
7939 case ISD::FEXP2:
7940 case ISD::FCEIL:
7941 case ISD::FTRUNC:
7942 case ISD::FRINT:
7943 case ISD::FNEARBYINT:
7944 case ISD::FROUND:
7945 case ISD::FFLOOR:
7946 case ISD::FMINNUM:
7947 case ISD::FMAXNUM:
7948 case AMDGPUISD::FRACT:
7949 case AMDGPUISD::CLAMP:
7950 case AMDGPUISD::COS_HW:
7951 case AMDGPUISD::SIN_HW:
7952 case AMDGPUISD::FMIN3:
7953 case AMDGPUISD::FMAX3:
7954 case AMDGPUISD::FMED3:
7955 case AMDGPUISD::FMAD_FTZ:
7956 case AMDGPUISD::RCP:
7957 case AMDGPUISD::RSQ:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00007958 case AMDGPUISD::RCP_IFLAG:
Matt Arsenault5cf42712017-04-06 20:58:30 +00007959 case AMDGPUISD::LDEXP:
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007960 return true;
Matt Arsenault5cf42712017-04-06 20:58:30 +00007961 default:
7962 // fcopysign, select and others may be lowered to 32-bit bit operations
7963 // which don't zero the high bits.
7964 return false;
Matt Arsenault8edfaee2017-03-31 19:53:03 +00007965 }
7966}
7967
7968SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7969 DAGCombinerInfo &DCI) const {
7970 if (!Subtarget->has16BitInsts() ||
7971 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7972 return SDValue();
7973
7974 EVT VT = N->getValueType(0);
7975 if (VT != MVT::i32)
7976 return SDValue();
7977
7978 SDValue Src = N->getOperand(0);
7979 if (Src.getValueType() != MVT::i16)
7980 return SDValue();
7981
7982 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7983 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7984 if (Src.getOpcode() == ISD::BITCAST) {
7985 SDValue BCSrc = Src.getOperand(0);
7986 if (BCSrc.getValueType() == MVT::f16 &&
7987 fp16SrcZerosHighBits(BCSrc.getOpcode()))
7988 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7989 }
7990
7991 return SDValue();
7992}
7993
Ryan Taylor00e063a2019-03-19 16:07:00 +00007994SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
7995 DAGCombinerInfo &DCI)
7996 const {
7997 SDValue Src = N->getOperand(0);
7998 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
7999
8000 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
8001 VTSign->getVT() == MVT::i8) ||
8002 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
8003 VTSign->getVT() == MVT::i16)) &&
8004 Src.hasOneUse()) {
8005 auto *M = cast<MemSDNode>(Src);
8006 SDValue Ops[] = {
8007 Src.getOperand(0), // Chain
8008 Src.getOperand(1), // rsrc
8009 Src.getOperand(2), // vindex
8010 Src.getOperand(3), // voffset
8011 Src.getOperand(4), // soffset
8012 Src.getOperand(5), // offset
8013 Src.getOperand(6),
8014 Src.getOperand(7)
8015 };
8016 // replace with BUFFER_LOAD_BYTE/SHORT
8017 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
8018 Src.getOperand(0).getValueType());
8019 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
8020 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
8021 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
8022 ResList,
8023 Ops, M->getMemoryVT(),
8024 M->getMemOperand());
8025 return DCI.DAG.getMergeValues({BufferLoadSignExt,
8026 BufferLoadSignExt.getValue(1)}, SDLoc(N));
8027 }
8028 return SDValue();
8029}
8030
Matt Arsenaultf2290332015-01-06 23:00:39 +00008031SDValue SITargetLowering::performClassCombine(SDNode *N,
8032 DAGCombinerInfo &DCI) const {
8033 SelectionDAG &DAG = DCI.DAG;
8034 SDValue Mask = N->getOperand(1);
8035
8036 // fp_class x, 0 -> false
8037 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
8038 if (CMask->isNullValue())
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00008039 return DAG.getConstant(0, SDLoc(N), MVT::i1);
Matt Arsenaultf2290332015-01-06 23:00:39 +00008040 }
8041
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00008042 if (N->getOperand(0).isUndef())
8043 return DAG.getUNDEF(MVT::i1);
8044
Matt Arsenaultf2290332015-01-06 23:00:39 +00008045 return SDValue();
8046}
8047
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00008048SDValue SITargetLowering::performRcpCombine(SDNode *N,
8049 DAGCombinerInfo &DCI) const {
8050 EVT VT = N->getValueType(0);
8051 SDValue N0 = N->getOperand(0);
8052
8053 if (N0.isUndef())
8054 return N0;
8055
8056 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
8057 N0.getOpcode() == ISD::SINT_TO_FP)) {
8058 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
8059 N->getFlags());
8060 }
8061
8062 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
8063}
8064
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008065bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
8066 unsigned MaxDepth) const {
8067 unsigned Opcode = Op.getOpcode();
8068 if (Opcode == ISD::FCANONICALIZE)
8069 return true;
8070
8071 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8072 auto F = CFP->getValueAPF();
8073 if (F.isNaN() && F.isSignaling())
8074 return false;
8075 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
8076 }
8077
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008078 // If source is a result of another standard FP operation it is already in
8079 // canonical form.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008080 if (MaxDepth == 0)
8081 return false;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008082
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008083 switch (Opcode) {
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008084 // These will flush denorms if required.
8085 case ISD::FADD:
8086 case ISD::FSUB:
8087 case ISD::FMUL:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008088 case ISD::FCEIL:
8089 case ISD::FFLOOR:
8090 case ISD::FMA:
8091 case ISD::FMAD:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008092 case ISD::FSQRT:
8093 case ISD::FDIV:
8094 case ISD::FREM:
Matt Arsenaultce6d61f2018-08-06 21:51:52 +00008095 case ISD::FP_ROUND:
8096 case ISD::FP_EXTEND:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008097 case AMDGPUISD::FMUL_LEGACY:
8098 case AMDGPUISD::FMAD_FTZ:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008099 case AMDGPUISD::RCP:
8100 case AMDGPUISD::RSQ:
8101 case AMDGPUISD::RSQ_CLAMP:
8102 case AMDGPUISD::RCP_LEGACY:
8103 case AMDGPUISD::RSQ_LEGACY:
8104 case AMDGPUISD::RCP_IFLAG:
8105 case AMDGPUISD::TRIG_PREOP:
8106 case AMDGPUISD::DIV_SCALE:
8107 case AMDGPUISD::DIV_FMAS:
8108 case AMDGPUISD::DIV_FIXUP:
8109 case AMDGPUISD::FRACT:
8110 case AMDGPUISD::LDEXP:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008111 case AMDGPUISD::CVT_PKRTZ_F16_F32:
Matt Arsenault940e6072018-08-10 19:20:17 +00008112 case AMDGPUISD::CVT_F32_UBYTE0:
8113 case AMDGPUISD::CVT_F32_UBYTE1:
8114 case AMDGPUISD::CVT_F32_UBYTE2:
8115 case AMDGPUISD::CVT_F32_UBYTE3:
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008116 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008117
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008118 // It can/will be lowered or combined as a bit operation.
8119 // Need to check their input recursively to handle.
8120 case ISD::FNEG:
8121 case ISD::FABS:
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008122 case ISD::FCOPYSIGN:
8123 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008124
8125 case ISD::FSIN:
8126 case ISD::FCOS:
8127 case ISD::FSINCOS:
8128 return Op.getValueType().getScalarType() != MVT::f16;
8129
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008130 case ISD::FMINNUM:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008131 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008132 case ISD::FMINNUM_IEEE:
8133 case ISD::FMAXNUM_IEEE:
Matt Arsenaultd49ab0b2018-08-06 21:58:11 +00008134 case AMDGPUISD::CLAMP:
8135 case AMDGPUISD::FMED3:
8136 case AMDGPUISD::FMAX3:
8137 case AMDGPUISD::FMIN3: {
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008138 // FIXME: Shouldn't treat the generic operations differently based on these.
Matt Arsenault687ec752018-10-22 16:27:27 +00008139 // However, we aren't really required to flush the result from
8140 // minnum/maxnum.
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008141
Matt Arsenault687ec752018-10-22 16:27:27 +00008142 // snans will be quieted, so we only need to worry about denormals.
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008143 if (Subtarget->supportsMinMaxDenormModes() ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008144 denormalsEnabledForType(Op.getValueType()))
8145 return true;
8146
8147 // Flushing may be required.
8148 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
8149 // targets need to check their input recursively.
8150
8151 // FIXME: Does this apply with clamp? It's implemented with max.
8152 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
8153 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
8154 return false;
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008155 }
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008156
Matt Arsenault687ec752018-10-22 16:27:27 +00008157 return true;
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008158 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008159 case ISD::SELECT: {
8160 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
8161 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008162 }
Matt Arsenaulte94ee832018-08-06 22:45:51 +00008163 case ISD::BUILD_VECTOR: {
8164 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
8165 SDValue SrcOp = Op.getOperand(i);
8166 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
8167 return false;
8168 }
8169
8170 return true;
8171 }
8172 case ISD::EXTRACT_VECTOR_ELT:
8173 case ISD::EXTRACT_SUBVECTOR: {
8174 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
8175 }
8176 case ISD::INSERT_VECTOR_ELT: {
8177 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
8178 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
8179 }
8180 case ISD::UNDEF:
8181 // Could be anything.
8182 return false;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008183
Matt Arsenault687ec752018-10-22 16:27:27 +00008184 case ISD::BITCAST: {
8185 // Hack around the mess we make when legalizing extract_vector_elt
8186 SDValue Src = Op.getOperand(0);
8187 if (Src.getValueType() == MVT::i16 &&
8188 Src.getOpcode() == ISD::TRUNCATE) {
8189 SDValue TruncSrc = Src.getOperand(0);
8190 if (TruncSrc.getValueType() == MVT::i32 &&
8191 TruncSrc.getOpcode() == ISD::BITCAST &&
8192 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
8193 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
8194 }
8195 }
8196
8197 return false;
8198 }
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008199 case ISD::INTRINSIC_WO_CHAIN: {
8200 unsigned IntrinsicID
8201 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8202 // TODO: Handle more intrinsics
8203 switch (IntrinsicID) {
8204 case Intrinsic::amdgcn_cvt_pkrtz:
Matt Arsenault940e6072018-08-10 19:20:17 +00008205 case Intrinsic::amdgcn_cubeid:
8206 case Intrinsic::amdgcn_frexp_mant:
8207 case Intrinsic::amdgcn_fdot2:
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008208 return true;
8209 default:
8210 break;
8211 }
Matt Arsenault5bb9d792018-08-10 17:57:12 +00008212
8213 LLVM_FALLTHROUGH;
Matt Arsenault08f3fe42018-08-06 23:01:31 +00008214 }
Matt Arsenaultf8768bf2018-08-06 21:38:27 +00008215 default:
8216 return denormalsEnabledForType(Op.getValueType()) &&
8217 DAG.isKnownNeverSNaN(Op);
8218 }
8219
8220 llvm_unreachable("invalid operation");
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008221}
8222
Matt Arsenault9cd90712016-04-14 01:42:16 +00008223// Constant fold canonicalize.
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008224SDValue SITargetLowering::getCanonicalConstantFP(
8225 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
8226 // Flush denormals to 0 if not enabled.
8227 if (C.isDenormal() && !denormalsEnabledForType(VT))
8228 return DAG.getConstantFP(0.0, SL, VT);
8229
8230 if (C.isNaN()) {
8231 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
8232 if (C.isSignaling()) {
8233 // Quiet a signaling NaN.
8234 // FIXME: Is this supposed to preserve payload bits?
8235 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8236 }
8237
8238 // Make sure it is the canonical NaN bitpattern.
8239 //
8240 // TODO: Can we use -1 as the canonical NaN value since it's an inline
8241 // immediate?
8242 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
8243 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8244 }
8245
8246 // Already canonical.
8247 return DAG.getConstantFP(C, SL, VT);
8248}
8249
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008250static bool vectorEltWillFoldAway(SDValue Op) {
8251 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
8252}
8253
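// Combine fcanonicalize: fold constant and undef sources, canonicalize the
// elements of a v2f16 build_vector, push the canonicalize into a min/max with
// a constant operand, and drop it entirely when the source is already
// canonical.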
Matt Arsenault9cd90712016-04-14 01:42:16 +00008254SDValue SITargetLowering::performFCanonicalizeCombine(
8255 SDNode *N,
8256 DAGCombinerInfo &DCI) const {
Matt Arsenault9cd90712016-04-14 01:42:16 +00008257 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008258 SDValue N0 = N->getOperand(0);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008259 EVT VT = N->getValueType(0);
Stanislav Mekhanoshin5680b0c2017-07-12 21:20:28 +00008260
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008261 // fcanonicalize undef -> qnan
8262 if (N0.isUndef()) {
Matt Arsenault4aec86d2018-07-31 13:34:31 +00008263 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
8264 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
8265 }
8266
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008267 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
Matt Arsenault9cd90712016-04-14 01:42:16 +00008268 EVT VT = N->getValueType(0);
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008269 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
Matt Arsenault9cd90712016-04-14 01:42:16 +00008270 }
8271
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008272 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
8273 // (fcanonicalize k)
8274 //
8275 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
8276
8277 // TODO: This could be better with wider vectors that will be split to v2f16,
8278 // and to consider uses since there aren't that many packed operations.
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008279 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
8280 isTypeLegal(MVT::v2f16)) {
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008281 SDLoc SL(N);
8282 SDValue NewElts[2];
8283 SDValue Lo = N0.getOperand(0);
8284 SDValue Hi = N0.getOperand(1);
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008285 EVT EltVT = Lo.getValueType();
8286
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008287 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
8288 for (unsigned I = 0; I != 2; ++I) {
8289 SDValue Op = N0.getOperand(I);
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008290 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8291 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
8292 CFP->getValueAPF());
8293 } else if (Op.isUndef()) {
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008294 // Handled below based on what the other operand is.
8295 NewElts[I] = Op;
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008296 } else {
8297 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
8298 }
8299 }
8300
Matt Arsenaultb5acec12018-08-12 08:42:54 +00008301      // If one half is undef, and one is constant, prefer a splat vector rather
8302 // than the normal qNaN. If it's a register, prefer 0.0 since that's
8303 // cheaper to use and may be free with a packed operation.
8304 if (NewElts[0].isUndef()) {
8305        // Mirror the NewElts[1] handling below.
8306        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
8307          NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
8308 }
8309
8310 if (NewElts[1].isUndef()) {
8311 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
8312 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
8313 }
8314
Matt Arsenaulta29e7622018-08-06 22:30:44 +00008315 return DAG.getBuildVector(VT, SL, NewElts);
8316 }
8317 }
8318
Matt Arsenault687ec752018-10-22 16:27:27 +00008319 unsigned SrcOpc = N0.getOpcode();
8320
8321 // If it's free to do so, push canonicalizes further up the source, which may
8322 // find a canonical source.
8323 //
8324  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
8325 // sNaNs.
8326 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
8327 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
8328 if (CRHS && N0.hasOneUse()) {
8329 SDLoc SL(N);
8330 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
8331 N0.getOperand(0));
8332 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
8333 DCI.AddToWorklist(Canon0.getNode());
8334
8335 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
8336 }
8337 }
8338
Matt Arsenaultf2a167f2018-08-06 22:10:26 +00008339 return isCanonicalized(DAG, N0) ? N0 : SDValue();
Matt Arsenault9cd90712016-04-14 01:42:16 +00008340}
8341
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008342static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8343 switch (Opc) {
8344 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008345 case ISD::FMAXNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008346 return AMDGPUISD::FMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008347 case ISD::SMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008348 return AMDGPUISD::SMAX3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008349 case ISD::UMAX:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008350 return AMDGPUISD::UMAX3;
8351 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008352 case ISD::FMINNUM_IEEE:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008353 return AMDGPUISD::FMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008354 case ISD::SMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008355 return AMDGPUISD::SMIN3;
Matt Arsenault5881f4e2015-06-09 00:52:37 +00008356 case ISD::UMIN:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008357 return AMDGPUISD::UMIN3;
8358 default:
8359 llvm_unreachable("Not a min/max opcode");
8360 }
8361}
8362
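// Fold min(max(x, K0), K1) -> med3(x, K0, K1) for integer operands when
// K0 < K1. Without a native 16-bit med3 the operands are extended to i32 and
// the result truncated back to the original type.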
Matt Arsenault10268f92017-02-27 22:40:39 +00008363SDValue SITargetLowering::performIntMed3ImmCombine(
8364 SelectionDAG &DAG, const SDLoc &SL,
8365 SDValue Op0, SDValue Op1, bool Signed) const {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008366 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8367 if (!K1)
8368 return SDValue();
8369
8370 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8371 if (!K0)
8372 return SDValue();
8373
Matt Arsenaultf639c322016-01-28 20:53:42 +00008374 if (Signed) {
8375 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8376 return SDValue();
8377 } else {
8378 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8379 return SDValue();
8380 }
8381
8382 EVT VT = K0->getValueType(0);
Matt Arsenault10268f92017-02-27 22:40:39 +00008383 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8384 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8385 return DAG.getNode(Med3Opc, SL, VT,
8386 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8387 }
Tom Stellard115a6152016-11-10 16:02:37 +00008388
Matt Arsenault10268f92017-02-27 22:40:39 +00008389 // If there isn't a 16-bit med3 operation, convert to 32-bit.
Tom Stellard115a6152016-11-10 16:02:37 +00008390 MVT NVT = MVT::i32;
8391 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8392
Matt Arsenault10268f92017-02-27 22:40:39 +00008393 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8394 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8395 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
Tom Stellard115a6152016-11-10 16:02:37 +00008396
Matt Arsenault10268f92017-02-27 22:40:39 +00008397 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8398 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008399}
8400
Matt Arsenault6b114d22017-08-30 01:20:17 +00008401static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8402 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8403 return C;
8404
8405 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8406 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8407 return C;
8408 }
8409
8410 return nullptr;
8411}
8412
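// Fold fminnum(fmaxnum(x, K0), K1) -> fmed3(x, K0, K1) for FP operands, or to
// a clamp when K0 is 0.0, K1 is 1.0 and DX10 clamp is enabled.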
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008413SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8414 const SDLoc &SL,
8415 SDValue Op0,
8416 SDValue Op1) const {
Matt Arsenault6b114d22017-08-30 01:20:17 +00008417 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
Matt Arsenaultf639c322016-01-28 20:53:42 +00008418 if (!K1)
8419 return SDValue();
8420
Matt Arsenault6b114d22017-08-30 01:20:17 +00008421 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
Matt Arsenaultf639c322016-01-28 20:53:42 +00008422 if (!K0)
8423 return SDValue();
8424
8425 // Ordered >= (although NaN inputs should have folded away by now).
8426 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8427 if (Cmp == APFloat::cmpGreaterThan)
8428 return SDValue();
8429
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008430 const MachineFunction &MF = DAG.getMachineFunction();
8431 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8432
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008433 // TODO: Check IEEE bit enabled?
Matt Arsenault6b114d22017-08-30 01:20:17 +00008434 EVT VT = Op0.getValueType();
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008435 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008436 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8437 // hardware fmed3 behavior converting to a min.
8438 // FIXME: Should this be allowing -0.0?
8439 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8440 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8441 }
8442
Matt Arsenault6b114d22017-08-30 01:20:17 +00008443 // med3 for f16 is only available on gfx9+, and not available for v2f16.
8444 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8445 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8446 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8447 // then give the other result, which is different from med3 with a NaN
8448 // input.
8449 SDValue Var = Op0.getOperand(0);
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +00008450 if (!DAG.isKnownNeverSNaN(Var))
Matt Arsenault6b114d22017-08-30 01:20:17 +00008451 return SDValue();
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008452
Matt Arsenaultebf46142018-09-18 02:34:54 +00008453 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8454
8455 if ((!K0->hasOneUse() ||
8456 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8457 (!K1->hasOneUse() ||
8458 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8459 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8460 Var, SDValue(K0, 0), SDValue(K1, 0));
8461 }
Matt Arsenault6b114d22017-08-30 01:20:17 +00008462 }
Matt Arsenaultf639c322016-01-28 20:53:42 +00008463
Matt Arsenault6b114d22017-08-30 01:20:17 +00008464 return SDValue();
Matt Arsenaultf639c322016-01-28 20:53:42 +00008465}
8466
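// Combine min/max: form min3/max3 from a nested min/max pair, and med3/fmed3
// from min(max(x, K0), K1) patterns with constant bounds.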
8467SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8468 DAGCombinerInfo &DCI) const {
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008469 SelectionDAG &DAG = DCI.DAG;
8470
Matt Arsenault79a45db2017-02-22 23:53:37 +00008471 EVT VT = N->getValueType(0);
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008472 unsigned Opc = N->getOpcode();
8473 SDValue Op0 = N->getOperand(0);
8474 SDValue Op1 = N->getOperand(1);
8475
8476  // Only do this if the inner op has one use since this will just increase
8477 // register pressure for no benefit.
8478
Matt Arsenault79a45db2017-02-22 23:53:37 +00008479 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
Neil Henninge85f6bd2019-03-19 15:50:24 +00008480 !VT.isVector() &&
8481 (VT == MVT::i32 || VT == MVT::f32 ||
8482 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
Matt Arsenault5b39b342016-01-28 20:53:48 +00008483 // max(max(a, b), c) -> max3(a, b, c)
8484 // min(min(a, b), c) -> min3(a, b, c)
8485 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8486 SDLoc DL(N);
8487 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8488 DL,
8489 N->getValueType(0),
8490 Op0.getOperand(0),
8491 Op0.getOperand(1),
8492 Op1);
8493 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008494
Matt Arsenault5b39b342016-01-28 20:53:48 +00008495 // Try commuted.
8496 // max(a, max(b, c)) -> max3(a, b, c)
8497 // min(a, min(b, c)) -> min3(a, b, c)
8498 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8499 SDLoc DL(N);
8500 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8501 DL,
8502 N->getValueType(0),
8503 Op0,
8504 Op1.getOperand(0),
8505 Op1.getOperand(1));
8506 }
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008507 }
8508
Matt Arsenaultf639c322016-01-28 20:53:42 +00008509 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
8510 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8511 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8512 return Med3;
8513 }
8514
8515 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8516 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8517 return Med3;
8518 }
8519
8520 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
Matt Arsenault5b39b342016-01-28 20:53:48 +00008521 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
Matt Arsenault687ec752018-10-22 16:27:27 +00008522 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
Matt Arsenault5b39b342016-01-28 20:53:48 +00008523 (Opc == AMDGPUISD::FMIN_LEGACY &&
8524 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
Matt Arsenault79a45db2017-02-22 23:53:37 +00008525 (VT == MVT::f32 || VT == MVT::f64 ||
Matt Arsenault6b114d22017-08-30 01:20:17 +00008526 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8527 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008528 Op0.hasOneUse()) {
Matt Arsenaultf639c322016-01-28 20:53:42 +00008529 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8530 return Res;
8531 }
8532
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00008533 return SDValue();
8534}
8535
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008536static bool isClampZeroToOne(SDValue A, SDValue B) {
8537 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8538 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8539 // FIXME: Should this be allowing -0.0?
8540 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8541 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8542 }
8543 }
8544
8545 return false;
8546}
8547
8548// FIXME: Should only worry about snans for version with chain.
8549SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8550 DAGCombinerInfo &DCI) const {
8551 EVT VT = N->getValueType(0);
8552 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8553 // NaNs. With a NaN input, the order of the operands may change the result.
8554
8555 SelectionDAG &DAG = DCI.DAG;
8556 SDLoc SL(N);
8557
8558 SDValue Src0 = N->getOperand(0);
8559 SDValue Src1 = N->getOperand(1);
8560 SDValue Src2 = N->getOperand(2);
8561
8562 if (isClampZeroToOne(Src0, Src1)) {
8563 // const_a, const_b, x -> clamp is safe in all cases including signaling
8564 // nans.
8565 // FIXME: Should this be allowing -0.0?
8566 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8567 }
8568
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008569 const MachineFunction &MF = DAG.getMachineFunction();
8570 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8571
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008572 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8573 // handling no dx10-clamp?
Matt Arsenault055e4dc2019-03-29 19:14:54 +00008574 if (Info->getMode().DX10Clamp) {
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00008575    // If NaNs are clamped to 0, we are free to reorder the inputs.
8576
8577 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8578 std::swap(Src0, Src1);
8579
8580 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8581 std::swap(Src1, Src2);
8582
8583 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8584 std::swap(Src0, Src1);
8585
8586 if (isClampZeroToOne(Src1, Src2))
8587 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8588 }
8589
8590 return SDValue();
8591}
8592
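// cvt_pkrtz with both source operands undef folds to undef.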
Matt Arsenault1f17c662017-02-22 00:27:34 +00008593SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8594 DAGCombinerInfo &DCI) const {
8595 SDValue Src0 = N->getOperand(0);
8596 SDValue Src1 = N->getOperand(1);
8597 if (Src0.isUndef() && Src1.isUndef())
8598 return DCI.DAG.getUNDEF(N->getValueType(0));
8599 return SDValue();
8600}
8601
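// Combine EXTRACT_VECTOR_ELT: push the extract through fneg/fabs and through
// vector binary ops, expand a variable index into a chain of compare/selects,
// and narrow sub-dword extracts of loaded vectors to 32-bit extracts.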
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008602SDValue SITargetLowering::performExtractVectorEltCombine(
8603 SDNode *N, DAGCombinerInfo &DCI) const {
8604 SDValue Vec = N->getOperand(0);
Matt Arsenault8cbb4882017-09-20 21:01:24 +00008605 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008606
8607 EVT VecVT = Vec.getValueType();
8608 EVT EltVT = VecVT.getVectorElementType();
8609
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008610 if ((Vec.getOpcode() == ISD::FNEG ||
8611 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008612 SDLoc SL(N);
8613 EVT EltVT = N->getValueType(0);
8614 SDValue Idx = N->getOperand(1);
8615 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8616 Vec.getOperand(0), Idx);
Matt Arsenaultfcc5ba42018-04-26 19:21:32 +00008617 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008618 }
8619
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008620 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8621 // =>
8622 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8623 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8624 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008625 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008626 SDLoc SL(N);
8627 EVT EltVT = N->getValueType(0);
8628 SDValue Idx = N->getOperand(1);
8629 unsigned Opc = Vec.getOpcode();
8630
8631 switch(Opc) {
8632 default:
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008633 break;
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008634 // TODO: Support other binary operations.
8635 case ISD::FADD:
Matt Arsenaulta8160732018-08-15 21:34:06 +00008636 case ISD::FSUB:
8637 case ISD::FMUL:
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008638 case ISD::ADD:
Farhana Aleene24f3ff2018-05-09 21:18:34 +00008639 case ISD::UMIN:
8640 case ISD::UMAX:
8641 case ISD::SMIN:
8642 case ISD::SMAX:
8643 case ISD::FMAXNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00008644 case ISD::FMINNUM:
8645 case ISD::FMAXNUM_IEEE:
8646 case ISD::FMINNUM_IEEE: {
Matt Arsenaulta8160732018-08-15 21:34:06 +00008647 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8648 Vec.getOperand(0), Idx);
8649 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8650 Vec.getOperand(1), Idx);
8651
8652 DCI.AddToWorklist(Elt0.getNode());
8653 DCI.AddToWorklist(Elt1.getNode());
8654 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8655 }
Farhana Aleene2dfe8a2018-05-01 21:41:12 +00008656 }
8657 }
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008658
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008659 unsigned VecSize = VecVT.getSizeInBits();
8660 unsigned EltSize = EltVT.getSizeInBits();
8661
Stanislav Mekhanoshinbcb34ac2018-11-13 21:18:21 +00008662 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8663  // This eliminates a non-constant index and subsequent movrel or scratch access.
8664  // Sub-dword vectors of two dwords or less have a better implementation.
8665  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8666  // instructions.
8667 if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8668 !isa<ConstantSDNode>(N->getOperand(1))) {
8669 SDLoc SL(N);
8670 SDValue Idx = N->getOperand(1);
8671 EVT IdxVT = Idx.getValueType();
8672 SDValue V;
8673 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8674 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8675 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8676 if (I == 0)
8677 V = Elt;
8678 else
8679 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8680 }
8681 return V;
8682 }
8683
8684 if (!DCI.isBeforeLegalize())
8685 return SDValue();
8686
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008687 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8688 // elements. This exposes more load reduction opportunities by replacing
8689 // multiple small extract_vector_elements with a single 32-bit extract.
8690 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
Matt Arsenaultbf07a502018-08-31 15:39:52 +00008691 if (isa<MemSDNode>(Vec) &&
8692 EltSize <= 16 &&
Matt Arsenault63bc0e32018-06-15 15:31:36 +00008693 EltVT.isByteSized() &&
8694 VecSize > 32 &&
8695 VecSize % 32 == 0 &&
8696 Idx) {
8697 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8698
8699 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8700 unsigned EltIdx = BitIndex / 32;
8701 unsigned LeftoverBitIdx = BitIndex % 32;
8702 SDLoc SL(N);
8703
8704 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8705 DCI.AddToWorklist(Cast.getNode());
8706
8707 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8708 DAG.getConstant(EltIdx, SL, MVT::i32));
8709 DCI.AddToWorklist(Elt.getNode());
8710 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8711 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8712 DCI.AddToWorklist(Srl.getNode());
8713
8714 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8715 DCI.AddToWorklist(Trunc.getNode());
8716 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8717 }
8718
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00008719 return SDValue();
8720}
8721
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008722SDValue
8723SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8724 DAGCombinerInfo &DCI) const {
8725 SDValue Vec = N->getOperand(0);
8726 SDValue Idx = N->getOperand(2);
8727 EVT VecVT = Vec.getValueType();
8728 EVT EltVT = VecVT.getVectorElementType();
8729 unsigned VecSize = VecVT.getSizeInBits();
8730 unsigned EltSize = EltVT.getSizeInBits();
8731
8732 // INSERT_VECTOR_ELT (<n x e>, var-idx)
8733 // => BUILD_VECTOR n x select (e, const-idx)
8734  // This eliminates a non-constant index and subsequent movrel or scratch access.
8735  // Sub-dword vectors of two dwords or less have a better implementation.
8736  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
8737  // instructions.
8738 if (isa<ConstantSDNode>(Idx) ||
8739 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8740 return SDValue();
8741
8742 SelectionDAG &DAG = DCI.DAG;
8743 SDLoc SL(N);
8744 SDValue Ins = N->getOperand(1);
8745 EVT IdxVT = Idx.getValueType();
8746
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00008747 SmallVector<SDValue, 16> Ops;
8748 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8749 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8750 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8751 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8752 Ops.push_back(V);
8753 }
8754
8755 return DAG.getBuildVector(VecVT, SL, Ops);
8756}
8757
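// Return the opcode to use for fusing a multiply-add: FMAD when denormals do
// not need to be respected and it is legal, FMA when contraction is allowed
// and profitable, or 0 if no fused form should be used.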
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008758unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8759 const SDNode *N0,
8760 const SDNode *N1) const {
8761 EVT VT = N0->getValueType(0);
8762
Matt Arsenault770ec862016-12-22 03:55:35 +00008763 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8764 // support denormals ever.
Stanislav Mekhanoshin28a19362019-05-04 04:20:37 +00008765 if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8766 (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
8767 getSubtarget()->hasMadF16())) &&
8768 isOperationLegal(ISD::FMAD, VT))
Matt Arsenault770ec862016-12-22 03:55:35 +00008769 return ISD::FMAD;
8770
8771 const TargetOptions &Options = DAG.getTarget().Options;
Amara Emersond28f0cd42017-05-01 15:17:51 +00008772 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
Michael Berg7acc81b2018-05-04 18:48:20 +00008773 (N0->getFlags().hasAllowContract() &&
8774 N1->getFlags().hasAllowContract())) &&
Matt Arsenault770ec862016-12-22 03:55:35 +00008775 isFMAFasterThanFMulAndFAdd(VT)) {
8776 return ISD::FMA;
8777 }
8778
8779 return 0;
8780}
8781
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008782// For a reassociatable opcode perform:
8783// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
8784SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8785 SelectionDAG &DAG) const {
8786 EVT VT = N->getValueType(0);
8787 if (VT != MVT::i32 && VT != MVT::i64)
8788 return SDValue();
8789
8790 unsigned Opc = N->getOpcode();
8791 SDValue Op0 = N->getOperand(0);
8792 SDValue Op1 = N->getOperand(1);
8793
8794 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8795 return SDValue();
8796
8797 if (Op0->isDivergent())
8798 std::swap(Op0, Op1);
8799
8800 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8801 return SDValue();
8802
8803 SDValue Op2 = Op1.getOperand(1);
8804 Op1 = Op1.getOperand(0);
8805 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
8806 return SDValue();
8807
8808 if (Op1->isDivergent())
8809 std::swap(Op1, Op2);
8810
8811 // If either operand is constant this will conflict with
8812 // DAGCombiner::ReassociateOps().
Stanislav Mekhanoshinda1628e2019-02-26 20:56:25 +00008813 if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
8814 DAG.isConstantIntBuildVectorOrConstantInt(Op1))
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008815 return SDValue();
8816
8817 SDLoc SL(N);
8818 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
8819 return DAG.getNode(Opc, SL, VT, Add1, Op2);
8820}
8821
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008822static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8823 EVT VT,
8824 SDValue N0, SDValue N1, SDValue N2,
8825 bool Signed) {
8826 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8827 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8828 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8829 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8830}
8831
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008832SDValue SITargetLowering::performAddCombine(SDNode *N,
8833 DAGCombinerInfo &DCI) const {
8834 SelectionDAG &DAG = DCI.DAG;
8835 EVT VT = N->getValueType(0);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008836 SDLoc SL(N);
8837 SDValue LHS = N->getOperand(0);
8838 SDValue RHS = N->getOperand(1);
8839
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008840 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8841 && Subtarget->hasMad64_32() &&
8842 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8843 VT.getScalarSizeInBits() <= 64) {
8844 if (LHS.getOpcode() != ISD::MUL)
8845 std::swap(LHS, RHS);
8846
8847 SDValue MulLHS = LHS.getOperand(0);
8848 SDValue MulRHS = LHS.getOperand(1);
8849 SDValue AddRHS = RHS;
8850
8851 // TODO: Maybe restrict if SGPR inputs.
8852 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8853 numBitsUnsigned(MulRHS, DAG) <= 32) {
8854 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8855 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8856 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8857 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8858 }
8859
8860 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8861 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8862 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8863 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8864 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8865 }
8866
8867 return SDValue();
8868 }
8869
Stanislav Mekhanoshin871821f2019-02-14 22:11:25 +00008870 if (SDValue V = reassociateScalarOps(N, DAG)) {
8871 return V;
8872 }
8873
Farhana Aleen07e61232018-05-02 18:16:39 +00008874 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
Matt Arsenault4f6318f2017-11-06 17:04:37 +00008875 return SDValue();
8876
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008877 // add x, zext (setcc) => addcarry x, 0, setcc
8878 // add x, sext (setcc) => subcarry x, 0, setcc
8879 unsigned Opc = LHS.getOpcode();
8880 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008881 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008882 std::swap(RHS, LHS);
8883
8884 Opc = RHS.getOpcode();
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008885 switch (Opc) {
8886 default: break;
8887 case ISD::ZERO_EXTEND:
8888 case ISD::SIGN_EXTEND:
8889 case ISD::ANY_EXTEND: {
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008890 auto Cond = RHS.getOperand(0);
Stanislav Mekhanoshin6851ddf2017-06-27 18:25:26 +00008891 if (!isBoolSGPR(Cond))
Stanislav Mekhanoshin3ed38c62017-06-21 23:46:22 +00008892 break;
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008893 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8894 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8895 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8896 return DAG.getNode(Opc, SL, VTList, Args);
8897 }
8898 case ISD::ADDCARRY: {
8899 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8900 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8901 if (!C || C->getZExtValue() != 0) break;
8902 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8903 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8904 }
8905 }
8906 return SDValue();
8907}
8908
8909SDValue SITargetLowering::performSubCombine(SDNode *N,
8910 DAGCombinerInfo &DCI) const {
8911 SelectionDAG &DAG = DCI.DAG;
8912 EVT VT = N->getValueType(0);
8913
8914 if (VT != MVT::i32)
8915 return SDValue();
8916
8917 SDLoc SL(N);
8918 SDValue LHS = N->getOperand(0);
8919 SDValue RHS = N->getOperand(1);
8920
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008921 if (LHS.getOpcode() == ISD::SUBCARRY) {
8922 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8923 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
Stanislav Mekhanoshin42e229e2019-02-21 02:58:00 +00008924 if (!C || !C->isNullValue())
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00008925 return SDValue();
8926 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8927 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8928 }
8929 return SDValue();
8930}
8931
8932SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8933 DAGCombinerInfo &DCI) const {
8934
8935 if (N->getValueType(0) != MVT::i32)
8936 return SDValue();
8937
8938 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8939 if (!C || C->getZExtValue() != 0)
8940 return SDValue();
8941
8942 SelectionDAG &DAG = DCI.DAG;
8943 SDValue LHS = N->getOperand(0);
8944
8945 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8946 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8947 unsigned LHSOpc = LHS.getOpcode();
8948 unsigned Opc = N->getOpcode();
8949 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8950 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8951 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8952 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00008953 }
8954 return SDValue();
8955}
8956
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008957SDValue SITargetLowering::performFAddCombine(SDNode *N,
8958 DAGCombinerInfo &DCI) const {
8959 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8960 return SDValue();
8961
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008962 SelectionDAG &DAG = DCI.DAG;
Matt Arsenault770ec862016-12-22 03:55:35 +00008963 EVT VT = N->getValueType(0);
Matt Arsenault770ec862016-12-22 03:55:35 +00008964
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008965 SDLoc SL(N);
8966 SDValue LHS = N->getOperand(0);
8967 SDValue RHS = N->getOperand(1);
8968
8969 // These should really be instruction patterns, but writing patterns with
8970  // source modifiers is a pain.
8971
8972 // fadd (fadd (a, a), b) -> mad 2.0, a, b
8973 if (LHS.getOpcode() == ISD::FADD) {
8974 SDValue A = LHS.getOperand(0);
8975 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008976 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008977 if (FusedOp != 0) {
8978 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008979 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008980 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008981 }
8982 }
8983
8984 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8985 if (RHS.getOpcode() == ISD::FADD) {
8986 SDValue A = RHS.getOperand(0);
8987 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00008988 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00008989 if (FusedOp != 0) {
8990 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00008991 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
Matt Arsenault770ec862016-12-22 03:55:35 +00008992 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00008993 }
8994 }
8995
8996 return SDValue();
8997}
8998
8999SDValue SITargetLowering::performFSubCombine(SDNode *N,
9000 DAGCombinerInfo &DCI) const {
9001 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9002 return SDValue();
9003
9004 SelectionDAG &DAG = DCI.DAG;
9005 SDLoc SL(N);
9006 EVT VT = N->getValueType(0);
9007 assert(!VT.isVector());
9008
9009 // Try to get the fneg to fold into the source modifier. This undoes generic
9010 // DAG combines and folds them into the mad.
9011 //
9012 // Only do this if we are not trying to support denormals. v_mad_f32 does
9013 // not support denormals ever.
Matt Arsenault770ec862016-12-22 03:55:35 +00009014 SDValue LHS = N->getOperand(0);
9015 SDValue RHS = N->getOperand(1);
9016 if (LHS.getOpcode() == ISD::FADD) {
9017 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
9018 SDValue A = LHS.getOperand(0);
9019 if (A == LHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009020 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009021 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009022 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
9023 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
9024
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009025 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009026 }
9027 }
Matt Arsenault770ec862016-12-22 03:55:35 +00009028 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009029
Matt Arsenault770ec862016-12-22 03:55:35 +00009030 if (RHS.getOpcode() == ISD::FADD) {
9031 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009032
Matt Arsenault770ec862016-12-22 03:55:35 +00009033 SDValue A = RHS.getOperand(0);
9034 if (A == RHS.getOperand(1)) {
Matt Arsenault46e6b7a2016-12-22 04:03:35 +00009035 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
Matt Arsenault770ec862016-12-22 03:55:35 +00009036 if (FusedOp != 0){
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009037 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
Matt Arsenaulte7d8ed32016-12-22 04:03:40 +00009038 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009039 }
9040 }
9041 }
9042
9043 return SDValue();
9044}
9045
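// Combine a chain of two fmas on f32 values extended from f16 into an FDOT2
// node when both multiplies draw from the two halves of the same pair of
// v2f16 vectors.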
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009046SDValue SITargetLowering::performFMACombine(SDNode *N,
9047 DAGCombinerInfo &DCI) const {
9048 SelectionDAG &DAG = DCI.DAG;
9049 EVT VT = N->getValueType(0);
9050 SDLoc SL(N);
9051
Stanislav Mekhanoshin0e858b02019-02-09 00:34:21 +00009052 if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009053 return SDValue();
9054
9055  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
9056 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
9057 SDValue Op1 = N->getOperand(0);
9058 SDValue Op2 = N->getOperand(1);
9059 SDValue FMA = N->getOperand(2);
9060
9061 if (FMA.getOpcode() != ISD::FMA ||
9062 Op1.getOpcode() != ISD::FP_EXTEND ||
9063 Op2.getOpcode() != ISD::FP_EXTEND)
9064 return SDValue();
9065
9066  // fdot2_f32_f16 always flushes fp32 denormal operands and the output to zero,
9067  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
9068  // is sufficient to allow generating fdot2.
9069 const TargetOptions &Options = DAG.getTarget().Options;
9070 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
9071 (N->getFlags().hasAllowContract() &&
9072 FMA->getFlags().hasAllowContract())) {
9073 Op1 = Op1.getOperand(0);
9074 Op2 = Op2.getOperand(0);
9075 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9076 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9077 return SDValue();
9078
9079 SDValue Vec1 = Op1.getOperand(0);
9080 SDValue Idx1 = Op1.getOperand(1);
9081 SDValue Vec2 = Op2.getOperand(0);
9082
9083 SDValue FMAOp1 = FMA.getOperand(0);
9084 SDValue FMAOp2 = FMA.getOperand(1);
9085 SDValue FMAAcc = FMA.getOperand(2);
9086
9087 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
9088 FMAOp2.getOpcode() != ISD::FP_EXTEND)
9089 return SDValue();
9090
9091 FMAOp1 = FMAOp1.getOperand(0);
9092 FMAOp2 = FMAOp2.getOperand(0);
9093 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9094 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9095 return SDValue();
9096
9097 SDValue Vec3 = FMAOp1.getOperand(0);
9098 SDValue Vec4 = FMAOp2.getOperand(0);
9099 SDValue Idx2 = FMAOp1.getOperand(1);
9100
9101 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
9102 // Idx1 and Idx2 cannot be the same.
9103 Idx1 == Idx2)
9104 return SDValue();
9105
9106 if (Vec1 == Vec2 || Vec3 == Vec4)
9107 return SDValue();
9108
9109 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
9110 return SDValue();
9111
9112 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
Konstantin Zhuravlyovbb30ef72018-08-01 01:31:30 +00009113 (Vec1 == Vec4 && Vec2 == Vec3)) {
9114 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
9115 DAG.getTargetConstant(0, SL, MVT::i1));
9116 }
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009117 }
9118 return SDValue();
9119}
9120
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009121SDValue SITargetLowering::performSetCCCombine(SDNode *N,
9122 DAGCombinerInfo &DCI) const {
9123 SelectionDAG &DAG = DCI.DAG;
9124 SDLoc SL(N);
9125
9126 SDValue LHS = N->getOperand(0);
9127 SDValue RHS = N->getOperand(1);
9128 EVT VT = LHS.getValueType();
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009129 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
9130
9131 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
9132 if (!CRHS) {
9133 CRHS = dyn_cast<ConstantSDNode>(LHS);
9134 if (CRHS) {
9135 std::swap(LHS, RHS);
9136 CC = getSetCCSwappedOperands(CC);
9137 }
9138 }
9139
Stanislav Mekhanoshin3b117942018-06-16 03:46:59 +00009140 if (CRHS) {
9141 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
9142 isBoolSGPR(LHS.getOperand(0))) {
9143 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
9144 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
9145 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
9146 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
9147 if ((CRHS->isAllOnesValue() &&
9148 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
9149 (CRHS->isNullValue() &&
9150 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
9151 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9152 DAG.getConstant(-1, SL, MVT::i1));
9153 if ((CRHS->isAllOnesValue() &&
9154 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
9155 (CRHS->isNullValue() &&
9156 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
9157 return LHS.getOperand(0);
9158 }
9159
9160 uint64_t CRHSVal = CRHS->getZExtValue();
9161 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
9162 LHS.getOpcode() == ISD::SELECT &&
9163 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9164 isa<ConstantSDNode>(LHS.getOperand(2)) &&
9165 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
9166 isBoolSGPR(LHS.getOperand(0))) {
9167 // Given CT != FT:
9168 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
9169 // setcc (select cc, CT, CF), CF, ne => cc
9170 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
9171 // setcc (select cc, CT, CF), CT, eq => cc
9172 uint64_t CT = LHS.getConstantOperandVal(1);
9173 uint64_t CF = LHS.getConstantOperandVal(2);
9174
9175 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
9176 (CT == CRHSVal && CC == ISD::SETNE))
9177 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9178 DAG.getConstant(-1, SL, MVT::i1));
9179 if ((CF == CRHSVal && CC == ISD::SETNE) ||
9180 (CT == CRHSVal && CC == ISD::SETEQ))
9181 return LHS.getOperand(0);
9182 }
Stanislav Mekhanoshinc9bd53a2017-06-27 18:53:03 +00009183 }
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009184
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00009185 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
9186 VT != MVT::f16))
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009187 return SDValue();
9188
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009189 // Match isinf/isfinite pattern
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009190 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009191 // (fcmp one (fabs x), inf) -> (fp_class x,
9192 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
9193 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009194 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
9195 if (!CRHS)
9196 return SDValue();
9197
9198 const APFloat &APF = CRHS->getValueAPF();
9199 if (APF.isInfinity() && !APF.isNegative()) {
Matt Arsenault8ad00d32018-08-10 18:58:41 +00009200 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
9201 SIInstrFlags::N_INFINITY;
9202 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
9203 SIInstrFlags::P_ZERO |
9204 SIInstrFlags::N_NORMAL |
9205 SIInstrFlags::P_NORMAL |
9206 SIInstrFlags::N_SUBNORMAL |
9207 SIInstrFlags::P_SUBNORMAL;
9208 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009209 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
9210 DAG.getConstant(Mask, SL, MVT::i32));
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009211 }
9212 }
9213
9214 return SDValue();
9215}
9216
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009217SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
9218 DAGCombinerInfo &DCI) const {
9219 SelectionDAG &DAG = DCI.DAG;
9220 SDLoc SL(N);
9221 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
9222
9223 SDValue Src = N->getOperand(0);
9224 SDValue Srl = N->getOperand(0);
9225 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
9226 Srl = Srl.getOperand(0);
9227
9228 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
9229 if (Srl.getOpcode() == ISD::SRL) {
9230 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
9231 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
9232 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
9233
9234 if (const ConstantSDNode *C =
9235 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
9236 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
9237 EVT(MVT::i32));
9238
9239 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
9240 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
9241 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
9242 MVT::f32, Srl);
9243 }
9244 }
9245 }
9246
9247 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
9248
Craig Topperd0af7e82017-04-28 05:31:46 +00009249 KnownBits Known;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009250 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9251 !DCI.isBeforeLegalizeOps());
9252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Stanislav Mekhanoshined0d6c62019-01-09 02:24:22 +00009253 if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009254 DCI.CommitTargetLoweringOpt(TLO);
9255 }
9256
9257 return SDValue();
9258}
9259
Tom Stellard1b95fed2018-05-24 05:28:34 +00009260SDValue SITargetLowering::performClampCombine(SDNode *N,
9261 DAGCombinerInfo &DCI) const {
9262 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
9263 if (!CSrc)
9264 return SDValue();
9265
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009266 const MachineFunction &MF = DCI.DAG.getMachineFunction();
Tom Stellard1b95fed2018-05-24 05:28:34 +00009267 const APFloat &F = CSrc->getValueAPF();
9268 APFloat Zero = APFloat::getZero(F.getSemantics());
9269 APFloat::cmpResult Cmp0 = F.compare(Zero);
9270 if (Cmp0 == APFloat::cmpLessThan ||
Matt Arsenault055e4dc2019-03-29 19:14:54 +00009271 (Cmp0 == APFloat::cmpUnordered &&
9272 MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
Tom Stellard1b95fed2018-05-24 05:28:34 +00009273 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
9274 }
9275
9276 APFloat One(F.getSemantics(), "1.0");
9277 APFloat::cmpResult Cmp1 = F.compare(One);
9278 if (Cmp1 == APFloat::cmpGreaterThan)
9279 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
9280
9281 return SDValue(CSrc, 0);
9282}
9283
9284
Tom Stellard75aadc22012-12-11 21:25:42 +00009285SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
9286 DAGCombinerInfo &DCI) const {
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009287 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
9288 return SDValue();
Tom Stellard75aadc22012-12-11 21:25:42 +00009289 switch (N->getOpcode()) {
Matt Arsenault22b4c252014-12-21 16:48:42 +00009290 default:
9291 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Stanislav Mekhanoshine3eb42c2017-06-21 22:05:06 +00009292 case ISD::ADD:
9293 return performAddCombine(N, DCI);
Stanislav Mekhanoshina8b26932017-06-21 22:30:01 +00009294 case ISD::SUB:
9295 return performSubCombine(N, DCI);
9296 case ISD::ADDCARRY:
9297 case ISD::SUBCARRY:
9298 return performAddCarrySubCarryCombine(N, DCI);
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009299 case ISD::FADD:
9300 return performFAddCombine(N, DCI);
9301 case ISD::FSUB:
9302 return performFSubCombine(N, DCI);
Matt Arsenault6f6233d2015-01-06 23:00:41 +00009303 case ISD::SETCC:
9304 return performSetCCCombine(N, DCI);
Matt Arsenault5b39b342016-01-28 20:53:48 +00009305 case ISD::FMAXNUM:
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00009306 case ISD::FMINNUM:
Matt Arsenault687ec752018-10-22 16:27:27 +00009307 case ISD::FMAXNUM_IEEE:
9308 case ISD::FMINNUM_IEEE:
Matt Arsenault5881f4e2015-06-09 00:52:37 +00009309 case ISD::SMAX:
9310 case ISD::SMIN:
9311 case ISD::UMAX:
Matt Arsenault5b39b342016-01-28 20:53:48 +00009312 case ISD::UMIN:
9313 case AMDGPUISD::FMIN_LEGACY:
Stanislav Mekhanoshin443a7f92018-11-27 15:13:37 +00009314 case AMDGPUISD::FMAX_LEGACY:
9315 return performMinMaxCombine(N, DCI);
Farhana Aleenc370d7b2018-07-16 18:19:59 +00009316 case ISD::FMA:
9317 return performFMACombine(N, DCI);
Matt Arsenault90083d32018-06-07 09:54:49 +00009318 case ISD::LOAD: {
9319    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
9320      return Widened;
9321 LLVM_FALLTHROUGH;
9322 }
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009323 case ISD::STORE:
9324 case ISD::ATOMIC_LOAD:
9325 case ISD::ATOMIC_STORE:
9326 case ISD::ATOMIC_CMP_SWAP:
9327 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
9328 case ISD::ATOMIC_SWAP:
9329 case ISD::ATOMIC_LOAD_ADD:
9330 case ISD::ATOMIC_LOAD_SUB:
9331 case ISD::ATOMIC_LOAD_AND:
9332 case ISD::ATOMIC_LOAD_OR:
9333 case ISD::ATOMIC_LOAD_XOR:
9334 case ISD::ATOMIC_LOAD_NAND:
9335 case ISD::ATOMIC_LOAD_MIN:
9336 case ISD::ATOMIC_LOAD_MAX:
9337 case ISD::ATOMIC_LOAD_UMIN:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009338 case ISD::ATOMIC_LOAD_UMAX:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009339 case ISD::ATOMIC_LOAD_FADD:
Matt Arsenaulta9dbdca2016-04-12 14:05:04 +00009340 case AMDGPUISD::ATOMIC_INC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009341 case AMDGPUISD::ATOMIC_DEC:
Daniil Fukalovd5fca552018-01-17 14:05:05 +00009342 case AMDGPUISD::ATOMIC_LOAD_FMIN:
Matt Arsenaulta5840c32019-01-22 18:36:06 +00009343 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009344 if (DCI.isBeforeLegalize())
9345 break;
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009346 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Matt Arsenaultd0101a22015-01-06 23:00:46 +00009347 case ISD::AND:
9348 return performAndCombine(N, DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009349 case ISD::OR:
9350 return performOrCombine(N, DCI);
Matt Arsenaultfa5f7672016-09-14 15:19:03 +00009351 case ISD::XOR:
9352 return performXorCombine(N, DCI);
Matt Arsenault8edfaee2017-03-31 19:53:03 +00009353 case ISD::ZERO_EXTEND:
9354 return performZeroExtendCombine(N, DCI);
Ryan Taylor00e063a2019-03-19 16:07:00 +00009355 case ISD::SIGN_EXTEND_INREG:
9356 return performSignExtendInRegCombine(N , DCI);
Matt Arsenaultf2290332015-01-06 23:00:39 +00009357 case AMDGPUISD::FP_CLASS:
9358 return performClassCombine(N, DCI);
Matt Arsenault9cd90712016-04-14 01:42:16 +00009359 case ISD::FCANONICALIZE:
9360 return performFCanonicalizeCombine(N, DCI);
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009361 case AMDGPUISD::RCP:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009362 return performRcpCombine(N, DCI);
9363 case AMDGPUISD::FRACT:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009364 case AMDGPUISD::RSQ:
Matt Arsenault32fc5272016-07-26 16:45:45 +00009365 case AMDGPUISD::RCP_LEGACY:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009366 case AMDGPUISD::RSQ_LEGACY:
Stanislav Mekhanoshin1a1687f2018-06-27 15:33:33 +00009367 case AMDGPUISD::RCP_IFLAG:
Matt Arsenaultb6d8c372016-06-20 18:33:56 +00009368 case AMDGPUISD::RSQ_CLAMP:
9369 case AMDGPUISD::LDEXP: {
9370 SDValue Src = N->getOperand(0);
9371 if (Src.isUndef())
9372 return Src;
9373 break;
9374 }
Matt Arsenaultd8b73d52016-12-22 03:44:42 +00009375 case ISD::SINT_TO_FP:
9376 case ISD::UINT_TO_FP:
9377 return performUCharToFloatCombine(N, DCI);
9378 case AMDGPUISD::CVT_F32_UBYTE0:
9379 case AMDGPUISD::CVT_F32_UBYTE1:
9380 case AMDGPUISD::CVT_F32_UBYTE2:
9381 case AMDGPUISD::CVT_F32_UBYTE3:
9382 return performCvtF32UByteNCombine(N, DCI);
Matt Arsenault2fdf2a12017-02-21 23:35:48 +00009383 case AMDGPUISD::FMED3:
9384 return performFMed3Combine(N, DCI);
Matt Arsenault1f17c662017-02-22 00:27:34 +00009385 case AMDGPUISD::CVT_PKRTZ_F16_F32:
9386 return performCvtPkRTZCombine(N, DCI);
Tom Stellard1b95fed2018-05-24 05:28:34 +00009387 case AMDGPUISD::CLAMP:
9388 return performClampCombine(N, DCI);
Matt Arsenaulteb522e62017-02-27 22:15:25 +00009389 case ISD::SCALAR_TO_VECTOR: {
9390 SelectionDAG &DAG = DCI.DAG;
9391 EVT VT = N->getValueType(0);
9392
9393 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9394 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9395 SDLoc SL(N);
9396 SDValue Src = N->getOperand(0);
9397 EVT EltVT = Src.getValueType();
9398 if (EltVT == MVT::f16)
9399 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9400
9401 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9402 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9403 }
9404
9405 break;
9406 }
Matt Arsenaultbf5482e2017-05-11 17:26:25 +00009407 case ISD::EXTRACT_VECTOR_ELT:
9408 return performExtractVectorEltCombine(N, DCI);
Stanislav Mekhanoshin054f8102018-11-19 17:39:20 +00009409 case ISD::INSERT_VECTOR_ELT:
9410 return performInsertVectorEltCombine(N, DCI);
Matt Arsenaultb2baffa2014-08-15 17:49:05 +00009411 }
Matt Arsenault5565f65e2014-05-22 18:09:07 +00009412 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
Tom Stellard75aadc22012-12-11 21:25:42 +00009413}
Christian Konigd910b7d2013-02-26 17:52:16 +00009414
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009415/// Helper function for adjustWritemask
Benjamin Kramer635e3682013-05-23 15:43:05 +00009416static unsigned SubIdx2Lane(unsigned Idx) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009417 switch (Idx) {
9418 default: return 0;
9419 case AMDGPU::sub0: return 0;
9420 case AMDGPU::sub1: return 1;
9421 case AMDGPU::sub2: return 2;
9422 case AMDGPU::sub3: return 3;
David Stuttardf77079f2019-01-14 11:55:24 +00009423 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
Christian Konig8e06e2a2013-04-10 08:39:08 +00009424 }
9425}
9426
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009427/// Adjust the writemask of MIMG instructions
Matt Arsenault68f05052017-12-04 22:18:27 +00009428SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9429 SelectionDAG &DAG) const {
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009430 unsigned Opcode = Node->getMachineOpcode();
9431
9432 // Subtract 1 because the vdata output is not a MachineSDNode operand.
9433 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9434 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9435 return Node; // not implemented for D16
9436
David Stuttardf77079f2019-01-14 11:55:24 +00009437 SDNode *Users[5] = { nullptr };
Tom Stellard54774e52013-10-23 02:53:47 +00009438 unsigned Lane = 0;
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009439 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009440 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
Tom Stellard54774e52013-10-23 02:53:47 +00009441 unsigned NewDmask = 0;
David Stuttardf77079f2019-01-14 11:55:24 +00009442 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9443 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9444 bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9445 Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
9446 unsigned TFCLane = 0;
Matt Arsenault856777d2017-12-08 20:00:57 +00009447 bool HasChain = Node->getNumValues() > 1;
9448
9449 if (OldDmask == 0) {
9450 // These are folded out, but on the chance it happens don't assert.
9451 return Node;
9452 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009453
David Stuttardf77079f2019-01-14 11:55:24 +00009454 unsigned OldBitsSet = countPopulation(OldDmask);
9455 // Work out which is the TFE/LWE lane if that is enabled.
9456 if (UsesTFC) {
9457 TFCLane = OldBitsSet;
9458 }
9459
Christian Konig8e06e2a2013-04-10 08:39:08 +00009460 // Try to figure out the used register components
9461 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9462 I != E; ++I) {
9463
Matt Arsenault93e65ea2017-02-22 21:16:41 +00009464 // Don't look at users of the chain.
9465 if (I.getUse().getResNo() != 0)
9466 continue;
9467
Christian Konig8e06e2a2013-04-10 08:39:08 +00009468 // Abort if we can't understand the usage
9469 if (!I->isMachineOpcode() ||
9470 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
Matt Arsenault68f05052017-12-04 22:18:27 +00009471 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009472
Francis Visoiu Mistrih9d7bb0c2017-11-28 17:15:09 +00009473 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
Tom Stellard54774e52013-10-23 02:53:47 +00009474 // Note that subregs are packed, i.e. Lane==0 is the first bit set
9475 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9476 // set, etc.
Christian Konig8b1ed282013-04-10 08:39:16 +00009477 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
Christian Konig8e06e2a2013-04-10 08:39:08 +00009478
David Stuttardf77079f2019-01-14 11:55:24 +00009479 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9480 if (UsesTFC && Lane == TFCLane) {
9481 Users[Lane] = *I;
9482 } else {
9483 // Set which texture component corresponds to the lane.
9484 unsigned Comp;
9485 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9486 Comp = countTrailingZeros(Dmask);
9487 Dmask &= ~(1 << Comp);
9488 }
9489
9490 // Abort if we have more than one user per component.
9491 if (Users[Lane])
9492 return Node;
9493
9494 Users[Lane] = *I;
9495 NewDmask |= 1 << Comp;
Tom Stellard54774e52013-10-23 02:53:47 +00009496 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009497 }
9498
David Stuttardf77079f2019-01-14 11:55:24 +00009499 // Don't allow 0 dmask, as hardware assumes one channel enabled.
9500 bool NoChannels = !NewDmask;
9501 if (NoChannels) {
David Stuttardfc2a7472019-03-20 09:29:55 +00009502 if (!UsesTFC) {
9503 // No uses of the result and not using TFC. Then do nothing.
9504 return Node;
9505 }
David Stuttardf77079f2019-01-14 11:55:24 +00009506 // If the original dmask has only one channel, there is nothing to do.
9507 if (OldBitsSet == 1)
9508 return Node;
9509 // Use an arbitrary dmask - required for the instruction to work
9510 NewDmask = 1;
9511 }
Tom Stellard54774e52013-10-23 02:53:47 +00009512 // Abort if there's no change
9513 if (NewDmask == OldDmask)
Matt Arsenault68f05052017-12-04 22:18:27 +00009514 return Node;
9515
9516 unsigned BitsSet = countPopulation(NewDmask);
9517
David Stuttardf77079f2019-01-14 11:55:24 +00009518 // If TFE or LWE is set, increase the number of channels by one to account
9519 // for the extra return value.
9520 // This will need adjustment for D16 if D16 is ever handled by
9521 // adjustWritemask (this function), but at present D16 is excluded.
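// For example, a two-channel load with TFE set returns three values: the two
// data channels plus the TFE/LWE status.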
9522 unsigned NewChannels = BitsSet + UsesTFC;
9523
9524 int NewOpcode =
9525 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
Matt Arsenault68f05052017-12-04 22:18:27 +00009526 assert(NewOpcode != -1 &&
9527 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9528 "failed to find equivalent MIMG op");
Christian Konig8e06e2a2013-04-10 08:39:08 +00009529
9530 // Adjust the writemask in the node
Matt Arsenault68f05052017-12-04 22:18:27 +00009531 SmallVector<SDValue, 12> Ops;
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009532 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009533 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Nikolay Haustov2f684f12016-02-26 09:51:05 +00009534 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Christian Konig8e06e2a2013-04-10 08:39:08 +00009535
Matt Arsenault68f05052017-12-04 22:18:27 +00009536 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9537
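// There is no 3- or 5-element result type, so round those channel counts up
// to the next available vector width (4 and 8 respectively).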
David Stuttardf77079f2019-01-14 11:55:24 +00009538 MVT ResultVT = NewChannels == 1 ?
9539 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9540 NewChannels == 5 ? 8 : NewChannels);
Matt Arsenault856777d2017-12-08 20:00:57 +00009541 SDVTList NewVTList = HasChain ?
9542 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9543
Matt Arsenault68f05052017-12-04 22:18:27 +00009544
9545 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9546 NewVTList, Ops);
Matt Arsenaultecad0d532017-12-08 20:00:45 +00009547
Matt Arsenault856777d2017-12-08 20:00:57 +00009548 if (HasChain) {
9549 // Update chain.
Chandler Carruth66654b72018-08-14 23:30:32 +00009550 DAG.setNodeMemRefs(NewNode, Node->memoperands());
Matt Arsenault856777d2017-12-08 20:00:57 +00009551 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9552 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009553
David Stuttardf77079f2019-01-14 11:55:24 +00009554 if (NewChannels == 1) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009555 assert(Node->hasNUsesOfValue(1, 0));
9556 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9557 SDLoc(Node), Users[Lane]->getValueType(0),
9558 SDValue(NewNode, 0));
Christian Konig8b1ed282013-04-10 08:39:16 +00009559 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
Matt Arsenault68f05052017-12-04 22:18:27 +00009560 return nullptr;
Christian Konig8b1ed282013-04-10 08:39:16 +00009561 }
9562
Christian Konig8e06e2a2013-04-10 08:39:08 +00009563 // Update the users of the node with the new indices
David Stuttardf77079f2019-01-14 11:55:24 +00009564 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
Christian Konig8e06e2a2013-04-10 08:39:08 +00009565 SDNode *User = Users[i];
David Stuttardf77079f2019-01-14 11:55:24 +00009566 if (!User) {
9567 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9568 // Users[0] is still nullptr because channel 0 doesn't really have a use.
9569 if (i || !NoChannels)
9570 continue;
9571 } else {
9572 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9573 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9574 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009575
9576 switch (Idx) {
9577 default: break;
9578 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9579 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9580 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
David Stuttardf77079f2019-01-14 11:55:24 +00009581 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009582 }
9583 }
Matt Arsenault68f05052017-12-04 22:18:27 +00009584
9585 DAG.RemoveDeadNode(Node);
9586 return nullptr;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009587}
9588
Tom Stellardc98ee202015-07-16 19:40:07 +00009589static bool isFrameIndexOp(SDValue Op) {
9590 if (Op.getOpcode() == ISD::AssertZext)
9591 Op = Op.getOperand(0);
9592
9593 return isa<FrameIndexSDNode>(Op);
9594}
9595
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009596/// Legalize target independent instructions (e.g. INSERT_SUBREG)
Tom Stellard3457a842014-10-09 19:06:00 +00009597/// with frame index operands.
9598/// LLVM assumes that inputs to these instructions are registers.
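/// Frame index operands are therefore materialized into registers with
/// S_MOV_B32 before being used.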
Matt Arsenault0d0d6c22017-04-12 21:58:23 +00009599SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9600 SelectionDAG &DAG) const {
9601 if (Node->getOpcode() == ISD::CopyToReg) {
9602 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9603 SDValue SrcVal = Node->getOperand(2);
9604
9605 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9606 // to try understanding copies to physical registers.
9607 if (SrcVal.getValueType() == MVT::i1 &&
9608 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9609 SDLoc SL(Node);
9610 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9611 SDValue VReg = DAG.getRegister(
9612 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9613
9614 SDNode *Glued = Node->getGluedNode();
9615 SDValue ToVReg
9616 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9617 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9618 SDValue ToResultReg
9619 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9620 VReg, ToVReg.getValue(1));
9621 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9622 DAG.RemoveDeadNode(Node);
9623 return ToResultReg.getNode();
9624 }
9625 }
Tom Stellard8dd392e2014-10-09 18:09:15 +00009626
9627 SmallVector<SDValue, 8> Ops;
Tom Stellard3457a842014-10-09 19:06:00 +00009628 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
Tom Stellardc98ee202015-07-16 19:40:07 +00009629 if (!isFrameIndexOp(Node->getOperand(i))) {
Tom Stellard3457a842014-10-09 19:06:00 +00009630 Ops.push_back(Node->getOperand(i));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009631 continue;
9632 }
9633
Tom Stellard3457a842014-10-09 19:06:00 +00009634 SDLoc DL(Node);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009635 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Tom Stellard3457a842014-10-09 19:06:00 +00009636 Node->getOperand(i).getValueType(),
9637 Node->getOperand(i)), 0));
Tom Stellard8dd392e2014-10-09 18:09:15 +00009638 }
9639
Mark Searles4e3d6162017-10-16 23:38:53 +00009640 return DAG.UpdateNodeOperands(Node, Ops);
Tom Stellard8dd392e2014-10-09 18:09:15 +00009641}
9642
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009643/// Fold the instructions after selecting them.
Matt Arsenault68f05052017-12-04 22:18:27 +00009644/// Returns null if users were already updated.
Christian Konig8e06e2a2013-04-10 08:39:08 +00009645SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9646 SelectionDAG &DAG) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009647 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009648 unsigned Opcode = Node->getMachineOpcode();
Christian Konig8e06e2a2013-04-10 08:39:08 +00009649
Nicolai Haehnlec06bfa12016-07-11 21:59:43 +00009650 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
Nicolai Haehnlef2674312018-06-21 13:36:01 +00009651 !TII->isGather4(Opcode)) {
Matt Arsenault68f05052017-12-04 22:18:27 +00009652 return adjustWritemask(Node, DAG);
9653 }
Christian Konig8e06e2a2013-04-10 08:39:08 +00009654
Nicolai Haehnlef2c64db2016-02-18 16:44:18 +00009655 if (Opcode == AMDGPU::INSERT_SUBREG ||
9656 Opcode == AMDGPU::REG_SEQUENCE) {
Tom Stellard8dd392e2014-10-09 18:09:15 +00009657 legalizeTargetIndependentNode(Node, DAG);
9658 return Node;
9659 }
Matt Arsenault206f8262017-08-01 20:49:41 +00009660
9661 switch (Opcode) {
9662 case AMDGPU::V_DIV_SCALE_F32:
9663 case AMDGPU::V_DIV_SCALE_F64: {
9664 // Satisfy the operand register constraint when one of the inputs is
9665 // undefined. Ordinarily each undef value will have its own implicit_def of
9666 // a vreg, so force these to use a single register.
9667 SDValue Src0 = Node->getOperand(0);
9668 SDValue Src1 = Node->getOperand(1);
9669 SDValue Src2 = Node->getOperand(2);
9670
9671 if ((Src0.isMachineOpcode() &&
9672 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9673 (Src0 == Src1 || Src0 == Src2))
9674 break;
9675
9676 MVT VT = Src0.getValueType().getSimpleVT();
Alexander Timofeevba447ba2019-05-26 20:33:26 +00009677 const TargetRegisterClass *RC =
9678 getRegClassFor(VT, Src0.getNode()->isDivergent());
Matt Arsenault206f8262017-08-01 20:49:41 +00009679
9680 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9681 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9682
9683 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9684 UndefReg, Src0, SDValue());
9685
9686 // src0 must be the same register as src1 or src2, even if the value is
9687 // undefined, so make sure we don't violate this constraint.
9688 if (Src0.isMachineOpcode() &&
9689 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9690 if (Src1.isMachineOpcode() &&
9691 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9692 Src0 = Src1;
9693 else if (Src2.isMachineOpcode() &&
9694 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9695 Src0 = Src2;
9696 else {
9697 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9698 Src0 = UndefReg;
9699 Src1 = UndefReg;
9700 }
9701 } else
9702 break;
9703
9704 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9705 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9706 Ops.push_back(Node->getOperand(I));
9707
9708 Ops.push_back(ImpDef.getValue(1));
9709 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9710 }
9711 default:
9712 break;
9713 }
9714
Tom Stellard654d6692015-01-08 15:08:17 +00009715 return Node;
Christian Konig8e06e2a2013-04-10 08:39:08 +00009716}
Christian Konig8b1ed282013-04-10 08:39:16 +00009717
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009718/// Fix up target-dependent details after instruction selection, e.g. legalize
Christian Konig8b1ed282013-04-10 08:39:16 +00009719/// VOP3 operands and replace unused atomics with their no-return variants.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009720void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Christian Konig8b1ed282013-04-10 08:39:16 +00009721 SDNode *Node) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009722 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009723
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009724 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009725
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009726 if (TII->isVOP3(MI.getOpcode())) {
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009727 // Make sure constant bus requirements are respected.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009728 TII->legalizeOperandsVOP3(MRI, MI);
Matt Arsenault6005fcb2015-10-21 21:51:02 +00009729 return;
9730 }
Matt Arsenaultcb0ac3d2014-09-26 17:54:59 +00009731
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009732 // Replace unused atomics with the no return version.
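// For example, an atomic add whose loaded result is never used can be
// switched to the non-returning variant, which does not define a result
// register.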
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009733 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009734 if (NoRetAtomicOp != -1) {
9735 if (!Node->hasAnyUseOfValue(0)) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009736 MI.setDesc(TII->get(NoRetAtomicOp));
9737 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009738 return;
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009739 }
9740
Tom Stellard354a43c2016-04-01 18:27:37 +00009741 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9742 // instruction, because the return type of these instructions is a vec2 of
9743 // the memory type, so it can be tied to the input operand.
9744 // This means these instructions always have a use, so we need to add a
9745 // special case to check if the atomic has only one extract_subreg use,
9746 // which itself has no uses.
9747 if ((Node->hasNUsesOfValue(1, 0) &&
Nicolai Haehnle750082d2016-04-15 14:42:36 +00009748 Node->use_begin()->isMachineOpcode() &&
Tom Stellard354a43c2016-04-01 18:27:37 +00009749 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9750 !Node->use_begin()->hasAnyUseOfValue(0))) {
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009751 unsigned Def = MI.getOperand(0).getReg();
Tom Stellard354a43c2016-04-01 18:27:37 +00009752
9753 // Change this into a noret atomic.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009754 MI.setDesc(TII->get(NoRetAtomicOp));
9755 MI.RemoveOperand(0);
Tom Stellard354a43c2016-04-01 18:27:37 +00009756
9757 // If we only remove the def operand from the atomic instruction, the
9758 // extract_subreg will be left with a use of a vreg without a def.
9759 // So we need to insert an implicit_def to avoid machine verifier
9760 // errors.
Duncan P. N. Exon Smithe4f5e4f2016-06-30 22:52:52 +00009761 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
Tom Stellard354a43c2016-04-01 18:27:37 +00009762 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9763 }
Matt Arsenault7ac9c4a2014-09-08 15:07:31 +00009764 return;
9765 }
Christian Konig8b1ed282013-04-10 08:39:16 +00009766}
Tom Stellard0518ff82013-06-03 17:39:58 +00009767
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009768static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9769 uint64_t Val) {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009770 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
Matt Arsenault485defe2014-11-05 19:01:17 +00009771 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9772}
9773
9774MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009775 const SDLoc &DL,
Matt Arsenault485defe2014-11-05 19:01:17 +00009776 SDValue Ptr) const {
Matt Arsenault43e92fe2016-06-24 06:30:11 +00009777 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
Matt Arsenault485defe2014-11-05 19:01:17 +00009778
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009779 // Build the constant half of the register before building the
9780 // full 128-bit register. If we are building multiple resource descriptors,
9781 // this will allow CSEing of the 2-component register.
9782 const SDValue Ops0[] = {
9783 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9784 buildSMovImm32(DAG, DL, 0),
9785 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9786 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9787 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9788 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009789
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009790 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9791 MVT::v2i32, Ops0), 0);
Matt Arsenault485defe2014-11-05 19:01:17 +00009792
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009793 // Combine the constants and the pointer.
9794 const SDValue Ops1[] = {
9795 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9796 Ptr,
9797 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9798 SubRegHi,
9799 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9800 };
Matt Arsenault485defe2014-11-05 19:01:17 +00009801
Matt Arsenault2d6fdb82015-09-25 17:08:42 +00009802 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
Matt Arsenault485defe2014-11-05 19:01:17 +00009803}
9804
Adrian Prantl5f8f34e42018-05-01 15:54:18 +00009805/// Return a resource descriptor with the 'Add TID' bit enabled
Benjamin Kramerdf005cb2015-08-08 18:27:36 +00009806/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9807/// of the resource descriptor) to create an offset, which is added to
9808/// the resource pointer.
Benjamin Kramerbdc49562016-06-12 15:39:02 +00009809MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9810 SDValue Ptr, uint32_t RsrcDword1,
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009811 uint64_t RsrcDword2And3) const {
9812 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9813 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9814 if (RsrcDword1) {
9815 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009816 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9817 0);
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009818 }
9819
9820 SDValue DataLo = buildSMovImm32(DAG, DL,
9821 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9822 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9823
9824 const SDValue Ops[] = {
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009825 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009826 PtrLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009827 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009828 PtrHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009829 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009830 DataLo,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009831 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009832 DataHi,
Sergey Dmitrouk842a51b2015-04-28 14:05:47 +00009833 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
Matt Arsenaultf3cd4512014-11-05 19:01:19 +00009834 };
9835
9836 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9837}
9838
Tom Stellardd7e6f132015-04-08 01:09:26 +00009839//===----------------------------------------------------------------------===//
9840// SI Inline Assembly Support
9841//===----------------------------------------------------------------------===//
9842
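// Map single-letter constraints to register classes by value size: 's'/'r'
// select scalar (SGPR) classes and 'v' selects vector (VGPR) classes.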
9843std::pair<unsigned, const TargetRegisterClass *>
9844SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Benjamin Kramer9bfb6272015-07-05 19:29:18 +00009845 StringRef Constraint,
Tom Stellardd7e6f132015-04-08 01:09:26 +00009846 MVT VT) const {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009847 const TargetRegisterClass *RC = nullptr;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009848 if (Constraint.size() == 1) {
9849 switch (Constraint[0]) {
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009850 default:
9851 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009852 case 's':
9853 case 'r':
9854 switch (VT.getSizeInBits()) {
9855 default:
9856 return std::make_pair(0U, nullptr);
9857 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009858 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009859 RC = &AMDGPU::SReg_32_XM0RegClass;
9860 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009861 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009862 RC = &AMDGPU::SGPR_64RegClass;
9863 break;
Tim Renouf361b5b22019-03-21 12:01:21 +00009864 case 96:
9865 RC = &AMDGPU::SReg_96RegClass;
9866 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009867 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009868 RC = &AMDGPU::SReg_128RegClass;
9869 break;
Tim Renouf033f99a2019-03-22 10:11:21 +00009870 case 160:
9871 RC = &AMDGPU::SReg_160RegClass;
9872 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009873 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009874 RC = &AMDGPU::SReg_256RegClass;
9875 break;
Matt Arsenaulte0bf7d02017-02-21 19:12:08 +00009876 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009877 RC = &AMDGPU::SReg_512RegClass;
9878 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009879 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009880 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009881 case 'v':
9882 switch (VT.getSizeInBits()) {
9883 default:
9884 return std::make_pair(0U, nullptr);
9885 case 32:
Matt Arsenault9e910142016-12-20 19:06:12 +00009886 case 16:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009887 RC = &AMDGPU::VGPR_32RegClass;
9888 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009889 case 64:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009890 RC = &AMDGPU::VReg_64RegClass;
9891 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009892 case 96:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009893 RC = &AMDGPU::VReg_96RegClass;
9894 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009895 case 128:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009896 RC = &AMDGPU::VReg_128RegClass;
9897 break;
Tim Renouf033f99a2019-03-22 10:11:21 +00009898 case 160:
9899 RC = &AMDGPU::VReg_160RegClass;
9900 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009901 case 256:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009902 RC = &AMDGPU::VReg_256RegClass;
9903 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009904 case 512:
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009905 RC = &AMDGPU::VReg_512RegClass;
9906 break;
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009907 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009908 break;
Tom Stellardd7e6f132015-04-08 01:09:26 +00009909 }
Daniil Fukalovc9a098b2018-06-08 16:29:04 +00009910 // We actually support i128, i16 and f16 as inline parameters
9911 // even if they are not reported as legal
9912 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9913 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9914 return std::make_pair(0U, RC);
Tom Stellardd7e6f132015-04-08 01:09:26 +00009915 }
9916
9917 if (Constraint.size() > 1) {
Tom Stellardd7e6f132015-04-08 01:09:26 +00009918 if (Constraint[1] == 'v') {
9919 RC = &AMDGPU::VGPR_32RegClass;
9920 } else if (Constraint[1] == 's') {
9921 RC = &AMDGPU::SGPR_32RegClass;
9922 }
9923
9924 if (RC) {
Matt Arsenault0b554ed2015-06-23 02:05:55 +00009925 uint32_t Idx;
9926 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9927 if (!Failed && Idx < RC->getNumRegs())
Tom Stellardd7e6f132015-04-08 01:09:26 +00009928 return std::make_pair(RC->getRegister(Idx), RC);
9929 }
9930 }
9931 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9932}
Tom Stellardb3c3bda2015-12-10 02:12:53 +00009933
9934SITargetLowering::ConstraintType
9935SITargetLowering::getConstraintType(StringRef Constraint) const {
9936 if (Constraint.size() == 1) {
9937 switch (Constraint[0]) {
9938 default: break;
9939 case 's':
9940 case 'v':
9941 return C_RegisterClass;
9942 }
9943 }
9944 return TargetLowering::getConstraintType(Constraint);
9945}
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009946
9947// Figure out which registers should be reserved for stack access. Only after
9948// the function is legalized do we know all of the non-spill stack objects or if
9949// calls are present.
9950void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9951 MachineRegisterInfo &MRI = MF.getRegInfo();
9952 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Tom Stellardc5a154d2018-06-28 23:47:12 +00009953 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009954
9955 if (Info->isEntryFunction()) {
9956 // Callable functions have fixed registers used for stack access.
9957 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9958 }
9959
Matt Arsenaultb812b7a2019-06-05 22:20:47 +00009960 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9961 Info->getStackPtrOffsetReg()));
9962 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
9963 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009964
Matt Arsenaultbc6d07c2019-03-14 22:54:43 +00009965 // We need to worry about replacing the default register with itself in case
9966 // of MIR testcases missing the MFI.
9967 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
9968 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9969
9970 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
9971 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9972
9973 if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
9974 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9975 Info->getScratchWaveOffsetReg());
9976 }
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009977
Stanislav Mekhanoshind4b500c2018-05-31 05:36:04 +00009978 Info->limitOccupancy(MF);
9979
Matt Arsenault1cc47f82017-07-18 16:44:56 +00009980 TargetLoweringBase::finalizeLowering(MF);
9981}
Matt Arsenault45b98182017-11-15 00:45:43 +00009982
9983void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9984 KnownBits &Known,
9985 const APInt &DemandedElts,
9986 const SelectionDAG &DAG,
9987 unsigned Depth) const {
9988 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9989 DAG, Depth);
9990
Matt Arsenault5c714cb2019-05-23 19:38:14 +00009991 // Set the high bits to zero based on the maximum allowed scratch size per
9992 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
Matt Arsenault45b98182017-11-15 00:45:43 +00009993 // calculation won't overflow, so assume the sign bit is never set.
Matt Arsenault5c714cb2019-05-23 19:38:14 +00009994 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
Matt Arsenault45b98182017-11-15 00:45:43 +00009995}
Tom Stellard264c1712018-06-13 15:06:37 +00009996
Stanislav Mekhanoshin93f15c92019-05-03 21:17:29 +00009997unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
9998 const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
9999 const unsigned CacheLineAlign = 6; // log2(64)
10000
10001 // Pre-GFX10 targets do not benefit from loop alignment.
10002 if (!ML || DisableLoopAlignment ||
10003 (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
10004 getSubtarget()->hasInstFwdPrefetchBug())
10005 return PrefAlign;
10006
10007 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
10008 // By default the prefetcher keeps one cache line behind and reads two ahead.
10009 // We can modify this with S_INST_PREFETCH so that larger loops keep two lines
10010 // behind and one ahead.
10011 // Therefore we can benefit from aligning loop headers if the loop fits in 192
10012 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
10013 // lines and does not need alignment.
10014 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
10015 // prefetch; if it is at most 192 bytes we need two lines behind.
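// In short: loops of at most 64 bytes are left alone, loops of at most 128
// bytes get cache-line alignment, loops of at most 192 bytes additionally get
// S_INST_PREFETCH bracketing where possible, and larger loops are left alone.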
10016
10017 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10018 const MachineBasicBlock *Header = ML->getHeader();
10019 if (Header->getAlignment() != PrefAlign)
10020 return Header->getAlignment(); // Already processed.
10021
10022 unsigned LoopSize = 0;
10023 for (const MachineBasicBlock *MBB : ML->blocks()) {
10024 // If an inner loop block is aligned, assume on average half of the
10025 // alignment size is added as nops.
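// (E.g. a block aligned to 8 bytes is assumed to add 4 bytes of nops.)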
10026 if (MBB != Header)
10027 LoopSize += (1 << MBB->getAlignment()) / 2;
10028
10029 for (const MachineInstr &MI : *MBB) {
10030 LoopSize += TII->getInstSizeInBytes(MI);
10031 if (LoopSize > 192)
10032 return PrefAlign;
10033 }
10034 }
10035
10036 if (LoopSize <= 64)
10037 return PrefAlign;
10038
10039 if (LoopSize <= 128)
10040 return CacheLineAlign;
10041
10042 // If any of the parent loops is surrounded by prefetch instructions, do not
10043 // insert new ones for the inner loop, as that would reset the parent's settings.
10044 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
10045 if (MachineBasicBlock *Exit = P->getExitBlock()) {
10046 auto I = Exit->getFirstNonDebugInstr();
10047 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
10048 return CacheLineAlign;
10049 }
10050 }
10051
10052 MachineBasicBlock *Pre = ML->getLoopPreheader();
10053 MachineBasicBlock *Exit = ML->getExitBlock();
10054
10055 if (Pre && Exit) {
10056 BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
10057 TII->get(AMDGPU::S_INST_PREFETCH))
10058 .addImm(1); // prefetch 2 lines behind PC
10059
10060 BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
10061 TII->get(AMDGPU::S_INST_PREFETCH))
10062 .addImm(2); // prefetch 1 line behind PC
10063 }
10064
10065 return CacheLineAlign;
10066}
10067
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010068LLVM_ATTRIBUTE_UNUSED
10069static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
10070 assert(N->getOpcode() == ISD::CopyFromReg);
10071 do {
10072 // Follow the chain until we find an INLINEASM node.
10073 N = N->getOperand(0).getNode();
Craig Topper784929d2019-02-08 20:48:56 +000010074 if (N->getOpcode() == ISD::INLINEASM ||
10075 N->getOpcode() == ISD::INLINEASM_BR)
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010076 return true;
10077 } while (N->getOpcode() == ISD::CopyFromReg);
10078 return false;
10079}
10080
Tom Stellard264c1712018-06-13 15:06:37 +000010081bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
Nicolai Haehnle35617ed2018-08-30 14:21:36 +000010082 FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
Tom Stellard264c1712018-06-13 15:06:37 +000010083{
10084 switch (N->getOpcode()) {
Tom Stellard264c1712018-06-13 15:06:37 +000010085 case ISD::CopyFromReg:
10086 {
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010087 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
10088 const MachineFunction * MF = FLI->MF;
10089 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
10090 const MachineRegisterInfo &MRI = MF->getRegInfo();
10091 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
10092 unsigned Reg = R->getReg();
10093 if (TRI.isPhysicalRegister(Reg))
10094 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010095
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010096 if (MRI.isLiveIn(Reg)) {
10097 // workitem.id.x workitem.id.y workitem.id.z
10098 // Any VGPR formal argument is also considered divergent
10099 if (!TRI.isSGPRReg(MRI, Reg))
10100 return true;
10101 // Formal arguments of non-entry functions
10102 // are conservatively considered divergent
10103 else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
10104 return true;
10105 return false;
Tom Stellard264c1712018-06-13 15:06:37 +000010106 }
Nicolai Haehnlea9cc92c2018-11-30 22:55:29 +000010107 const Value *V = FLI->getValueFromVirtualReg(Reg);
10108 if (V)
10109 return KDA->isDivergent(V);
10110 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
10111 return !TRI.isSGPRReg(MRI, Reg);
Tom Stellard264c1712018-06-13 15:06:37 +000010112 }
10113 break;
10114 case ISD::LOAD: {
Matt Arsenault813613c2018-09-04 18:58:19 +000010115 const LoadSDNode *L = cast<LoadSDNode>(N);
10116 unsigned AS = L->getAddressSpace();
10117 // A flat load may access private memory.
10118 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
Tom Stellard264c1712018-06-13 15:06:37 +000010119 } break;
10120 case ISD::CALLSEQ_END:
10121 return true;
10122 break;
10123 case ISD::INTRINSIC_WO_CHAIN:
10127 return AMDGPU::isIntrinsicSourceOfDivergence(
10128 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
10129 case ISD::INTRINSIC_W_CHAIN:
10130 return AMDGPU::isIntrinsicSourceOfDivergence(
10131 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
10132 // In some cases intrinsics that are a source of divergence have been
10133 // lowered to AMDGPUISD so we also need to check those too.
10134 case AMDGPUISD::INTERP_MOV:
10135 case AMDGPUISD::INTERP_P1:
10136 case AMDGPUISD::INTERP_P2:
10137 return true;
10138 }
10139 return false;
10140}
Matt Arsenaultf8768bf2018-08-06 21:38:27 +000010141
10142bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
10143 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
10144 case MVT::f32:
10145 return Subtarget->hasFP32Denormals();
10146 case MVT::f64:
10147 return Subtarget->hasFP64Denormals();
10148 case MVT::f16:
10149 return Subtarget->hasFP16Denormals();
10150 default:
10151 return false;
10152 }
10153}
Matt Arsenault687ec752018-10-22 16:27:27 +000010154
10155bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
10156 const SelectionDAG &DAG,
10157 bool SNaN,
10158 unsigned Depth) const {
10159 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
Matt Arsenault055e4dc2019-03-29 19:14:54 +000010160 const MachineFunction &MF = DAG.getMachineFunction();
10161 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10162
10163 if (Info->getMode().DX10Clamp)
Matt Arsenault687ec752018-10-22 16:27:27 +000010164 return true; // Clamped to 0.
10165 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
10166 }
10167
10168 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
10169 SNaN, Depth);
10170}
Matt Arsenaulta5840c32019-01-22 18:36:06 +000010171
10172TargetLowering::AtomicExpansionKind
10173SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
10174 switch (RMW->getOperation()) {
10175 case AtomicRMWInst::FAdd: {
10176 Type *Ty = RMW->getType();
10177
10178 // We don't have a way to support 16-bit atomics now, so just leave them
10179 // as-is.
10180 if (Ty->isHalfTy())
10181 return AtomicExpansionKind::None;
10182
10183 if (!Ty->isFloatTy())
10184 return AtomicExpansionKind::CmpXChg;
10185
10186 // TODO: Do have these for flat. Older targets also had them for buffers.
10187 unsigned AS = RMW->getPointerAddressSpace();
10188 return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
10189 AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
10190 }
10191 default:
10192 break;
10193 }
10194
10195 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
10196}