//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

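// Matches scalars and vectors whose scalar (element) size is a multiple of 32
// bits and whose total size does not exceed MaxSize. For example, s32, s64,
// v2s32 and v4s32 all match with the default MaxSize of 1024, while s16, s48
// and v2s16 do not.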
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

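// Matches vectors with an odd element count, small elements, and a total size
// that is not a round number of 32-bit words, e.g. v3s16 (48 bits) or v5s8
// (40 bits). v4s16 (even count) and v3s32 (32-bit elements) do not match.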
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

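// Matches 16-bit element vectors wider than the packed v2s16, e.g. v3s16 or
// v4s16.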
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

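// Split a wide vector into pieces of at most 64 bits each. For example, v4s32
// (128 bits) becomes v2s32 (2 pieces), and v3s32 (96 bits) also rounds up to
// 2 pieces and becomes v2s32; the generic fewer-elements legalization then
// handles the leftover element.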
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32
// bits.
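// For example, v3s8 (24 bits) is padded to v4s8 (32 bits), and v5s8 (40 bits)
// to v8s8 (64 bits).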
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32- or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
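// For example, s32, s96, v2s64, v2s16 and v4s16 are register types, while s48
// and v3s16 are not (v3s16 has an odd number of 16-bit elements).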
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

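  // The rules in each builder are tried in order, and the first rule that
  // matches decides the action; the legalFor lists below are therefore
  // checked before the clamping and widening mutations that follow them.
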
  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

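  // Helpers comparing the sizes of two type indices; used below to decide
  // between widening and narrowing the integer operand of G_INTTOPTR and
  // G_PTRTOINT.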
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

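  // Whether a load or store of this type must be split, either because it is
  // a vector extload, exceeds the per-address-space size limit above, is an
  // unsupported dwordx3 access, or is insufficiently aligned for the
  // subtarget.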
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird-sized loads that don't evenly divide into the access sizes.
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128;
        // whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

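// Dispatch hook invoked by the LegalizerHelper for operations marked Custom
// above. Returns true if the instruction was legalized, false on failure.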
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

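// Compute the high 32 bits (the aperture) of a flat address corresponding to
// the given LDS or private segment, used when casting local/private pointers
// to flat pointers.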
Matt Arsenault1178dc32019-06-28 01:16:46 +00001136Register AMDGPULegalizerInfo::getSegmentAperture(
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001137 unsigned AS,
1138 MachineRegisterInfo &MRI,
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001139 MachineIRBuilder &B) const {
1140 MachineFunction &MF = B.getMF();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001141 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1142 const LLT S32 = LLT::scalar(32);
1143
Matt Arsenaultd7cad4f2019-10-04 08:35:38 +00001144 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1145
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001146 if (ST.hasApertureRegs()) {
1147 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1148 // getreg.
1149 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1150 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1151 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1152 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1153 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1154 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1155 unsigned Encoding =
1156 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1157 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1158 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1159
Matt Arsenault1178dc32019-06-28 01:16:46 +00001160 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1161 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001162
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001163 B.buildInstr(AMDGPU::S_GETREG_B32)
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001164 .addDef(GetReg)
1165 .addImm(Encoding);
1166 MRI.setType(GetReg, S32);
1167
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001168 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1169 B.buildInstr(TargetOpcode::G_SHL)
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001170 .addDef(ApertureReg)
1171 .addUse(GetReg)
Amara Emerson946b1242019-04-15 05:04:20 +00001172 .addUse(ShiftAmt.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001173
1174 return ApertureReg;
1175 }
1176
Matt Arsenault1178dc32019-06-28 01:16:46 +00001177 Register QueuePtr = MRI.createGenericVirtualRegister(
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001178 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1179
Matt Arsenault25156ae2019-09-05 02:20:29 +00001180 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001181 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
Matt Arsenault25156ae2019-09-05 02:20:29 +00001182 return Register();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001183
1184 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1185 // private_segment_aperture_base_hi.
1186 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1187
1188 // FIXME: Don't use undef
1189 Value *V = UndefValue::get(PointerType::get(
1190 Type::getInt8Ty(MF.getFunction().getContext()),
1191 AMDGPUAS::CONSTANT_ADDRESS));
1192
1193 MachinePointerInfo PtrInfo(V, StructOffset);
1194 MachineMemOperand *MMO = MF.getMachineMemOperand(
1195 PtrInfo,
1196 MachineMemOperand::MOLoad |
1197 MachineMemOperand::MODereferenceable |
1198 MachineMemOperand::MOInvariant,
1199 4,
1200 MinAlign(64, StructOffset));
1201
Matt Arsenaulte3a676e2019-06-24 15:50:29 +00001202 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1203 Register LoadAddr;
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001204
Daniel Sanderse74c5b92019-11-01 13:18:00 -07001205 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001206 B.buildLoad(LoadResult, LoadAddr, *MMO);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001207 return LoadResult;
1208}
1209
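// Lower G_ADDRSPACE_CAST. No-op casts become bitcasts; casts to and from
// 32-bit constant pointers truncate or merge in the high address bits; casts
// between flat and local/private compare the source against its null value
// and build the result from the low 32 bits and the segment aperture.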
1210bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1211 MachineInstr &MI, MachineRegisterInfo &MRI,
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001212 MachineIRBuilder &B) const {
1213 MachineFunction &MF = B.getMF();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001214
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001215 B.setInstr(MI);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001216
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001217 const LLT S32 = LLT::scalar(32);
Matt Arsenaulte3a676e2019-06-24 15:50:29 +00001218 Register Dst = MI.getOperand(0).getReg();
1219 Register Src = MI.getOperand(1).getReg();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001220
1221 LLT DstTy = MRI.getType(Dst);
1222 LLT SrcTy = MRI.getType(Src);
1223 unsigned DestAS = DstTy.getAddressSpace();
1224 unsigned SrcAS = SrcTy.getAddressSpace();
1225
1226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1227 // vector element.
1228 assert(!DstTy.isVector());
1229
1230 const AMDGPUTargetMachine &TM
1231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1232
1233 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1234 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001235 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001236 return true;
1237 }
1238
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001239 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1240 // Truncate.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001241 B.buildExtract(Dst, Src, 0);
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001242 MI.eraseFromParent();
1243 return true;
1244 }
1245
1246 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1247 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1248 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1249
1250    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1251    // another pointer. Merge operands are required to be the same type, but
1252    // creating an extra ptrtoint would be kind of pointless.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001253 auto HighAddr = B.buildConstant(
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001254 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001255 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001256 MI.eraseFromParent();
1257 return true;
1258 }
1259
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001260 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1261 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1262 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1263 unsigned NullVal = TM.getNullPointerValue(DestAS);
1264
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001265 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1266 auto FlatNull = B.buildConstant(SrcTy, 0);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001267
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001268 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001269
1270 // Extract low 32-bits of the pointer.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001271 B.buildExtract(PtrLo32, Src, 0);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001272
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001273 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001274 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1275 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001276
1277 MI.eraseFromParent();
1278 return true;
1279 }
1280
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001281 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1282 return false;
1283
1284 if (!ST.hasFlatAddressSpace())
1285 return false;
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001286
Amara Emerson946b1242019-04-15 05:04:20 +00001287 auto SegmentNull =
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001288 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
Amara Emerson946b1242019-04-15 05:04:20 +00001289 auto FlatNull =
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001290 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001291
Matt Arsenaultd7cad4f2019-10-04 08:35:38 +00001292 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
Matt Arsenault25156ae2019-09-05 02:20:29 +00001293 if (!ApertureReg.isValid())
1294 return false;
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001295
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001296 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001297 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001298
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001299 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001300
1301 // Coerce the type of the low half of the result so we can use merge_values.
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001302 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001303 B.buildInstr(TargetOpcode::G_PTRTOINT)
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001304 .addDef(SrcAsInt)
1305 .addUse(Src);
1306
1307 // TODO: Should we allow mismatched types but matching sizes in merges to
1308 // avoid the ptrtoint?
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001309 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1310 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001311
1312 MI.eraseFromParent();
1313 return true;
1314}
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001315
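// Lower G_FRINT for f64 with the 2^52 trick: adding and then subtracting
// copysign(2^52, x) rounds x to the nearest integer in the current rounding
// mode; values whose magnitude is already at least ~2^52 have no fractional
// bits and are selected through unchanged.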
1316bool AMDGPULegalizerInfo::legalizeFrint(
1317 MachineInstr &MI, MachineRegisterInfo &MRI,
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001318 MachineIRBuilder &B) const {
1319 B.setInstr(MI);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001320
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001321 Register Src = MI.getOperand(1).getReg();
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001322 LLT Ty = MRI.getType(Src);
1323 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1324
1325 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1326 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1327
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001328 auto C1 = B.buildFConstant(Ty, C1Val);
1329 auto CopySign = B.buildFCopysign(Ty, C1, Src);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001330
1331 // TODO: Should this propagate fast-math-flags?
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001332 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1333 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001334
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001335 auto C2 = B.buildFConstant(Ty, C2Val);
1336 auto Fabs = B.buildFAbs(Ty, Src);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001337
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001338 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1339 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001340 return true;
1341}
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001342
Matt Arsenaulta510b572019-05-17 12:20:05 +00001343bool AMDGPULegalizerInfo::legalizeFceil(
1344 MachineInstr &MI, MachineRegisterInfo &MRI,
1345 MachineIRBuilder &B) const {
1346 B.setInstr(MI);
1347
Matt Arsenault1a02d302019-05-17 12:59:27 +00001348 const LLT S1 = LLT::scalar(1);
1349 const LLT S64 = LLT::scalar(64);
1350
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001351 Register Src = MI.getOperand(1).getReg();
Matt Arsenault1a02d302019-05-17 12:59:27 +00001352 assert(MRI.getType(Src) == S64);
Matt Arsenaulta510b572019-05-17 12:20:05 +00001353
1354 // result = trunc(src)
1355 // if (src > 0.0 && src != result)
1356 // result += 1.0
1357
Matt Arsenaulta510b572019-05-17 12:20:05 +00001358 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1359
Matt Arsenaulta510b572019-05-17 12:20:05 +00001360 const auto Zero = B.buildFConstant(S64, 0.0);
1361 const auto One = B.buildFConstant(S64, 1.0);
1362  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1363 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1364  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1365 auto Add = B.buildSelect(S64, And, One, Zero);
1366
1367 // TODO: Should this propagate fast-math-flags?
1368 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1369 return true;
1370}
1371
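// Extract the 11-bit biased exponent field from the high 32 bits of an f64
// value and subtract the exponent bias (1023) to get the unbiased exponent.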
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001372static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1373 MachineIRBuilder &B) {
1374 const unsigned FractBits = 52;
1375 const unsigned ExpBits = 11;
1376 LLT S32 = LLT::scalar(32);
1377
1378 auto Const0 = B.buildConstant(S32, FractBits - 32);
1379 auto Const1 = B.buildConstant(S32, ExpBits);
1380
1381 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1382 .addUse(Const0.getReg(0))
1383 .addUse(Const1.getReg(0));
1384
1385 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1386}
1387
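// Lower G_INTRINSIC_TRUNC for f64 by masking off the fraction bits below the
// unbiased exponent. A negative exponent yields +/-0.0 (just the sign bit);
// an exponent greater than 51 means the value has no fractional bits and is
// returned unchanged.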
1388bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1389 MachineInstr &MI, MachineRegisterInfo &MRI,
1390 MachineIRBuilder &B) const {
1391 B.setInstr(MI);
1392
Matt Arsenault1a02d302019-05-17 12:59:27 +00001393 const LLT S1 = LLT::scalar(1);
1394 const LLT S32 = LLT::scalar(32);
1395 const LLT S64 = LLT::scalar(64);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001396
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001397 Register Src = MI.getOperand(1).getReg();
Matt Arsenault1a02d302019-05-17 12:59:27 +00001398 assert(MRI.getType(Src) == S64);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001399
1400 // TODO: Should this use extract since the low half is unused?
1401 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001402 Register Hi = Unmerge.getReg(1);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001403
1404 // Extract the upper half, since this is where we will find the sign and
1405 // exponent.
1406 auto Exp = extractF64Exponent(Hi, B);
1407
1408 const unsigned FractBits = 52;
1409
1410 // Extract the sign bit.
1411 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1412 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1413
1414 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1415
1416 const auto Zero32 = B.buildConstant(S32, 0);
1417
1418  // Extend back to 64 bits.
1419 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1420
1421 auto Shr = B.buildAShr(S64, FractMask, Exp);
1422 auto Not = B.buildNot(S64, Shr);
1423 auto Tmp0 = B.buildAnd(S64, Src, Not);
1424 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1425
1426 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1427 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1428
1429 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1430 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1431 return true;
1432}
Matt Arsenault2f292202019-05-17 23:05:18 +00001433
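// Lower 64-bit G_SITOFP/G_UITOFP to f64 by converting the two 32-bit halves
// separately: the high half (signed or unsigned) is converted and scaled by
// 2^32 with ldexp, then the unsigned low half is added in.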
1434bool AMDGPULegalizerInfo::legalizeITOFP(
1435 MachineInstr &MI, MachineRegisterInfo &MRI,
1436 MachineIRBuilder &B, bool Signed) const {
1437 B.setInstr(MI);
1438
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001439 Register Dst = MI.getOperand(0).getReg();
1440 Register Src = MI.getOperand(1).getReg();
Matt Arsenault2f292202019-05-17 23:05:18 +00001441
1442 const LLT S64 = LLT::scalar(64);
1443 const LLT S32 = LLT::scalar(32);
1444
1445 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1446
1447 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1448
1449 auto CvtHi = Signed ?
1450 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1451 B.buildUITOFP(S64, Unmerge.getReg(1));
1452
1453 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1454
1455 auto ThirtyTwo = B.buildConstant(S32, 32);
1456 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1457 .addUse(CvtHi.getReg(0))
1458 .addUse(ThirtyTwo.getReg(0));
1459
1460 // TODO: Should this propagate fast-math-flags?
1461 B.buildFAdd(Dst, LdExp, CvtLo);
1462 MI.eraseFromParent();
1463 return true;
1464}
Matt Arsenaulte15770a2019-07-01 18:40:23 +00001465
Matt Arsenault6ce1b4f2019-07-10 16:31:19 +00001466bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1467 MachineInstr &MI, MachineRegisterInfo &MRI,
1468 MachineIRBuilder &B) const {
1469 MachineFunction &MF = B.getMF();
1470 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1471
1472 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1473 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1474
1475  // With ieee_mode disabled, the instructions already have the correct
1476  // behavior for G_FMINNUM/G_FMAXNUM.
1477 if (!MFI->getMode().IEEE)
1478 return !IsIEEEOp;
1479
1480 if (IsIEEEOp)
1481 return true;
1482
1483 MachineIRBuilder HelperBuilder(MI);
1484 GISelObserverWrapper DummyObserver;
1485 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
Matt Arsenaulta91f0172019-09-09 23:30:11 +00001486 HelperBuilder.setInstr(MI);
Matt Arsenault6ce1b4f2019-07-10 16:31:19 +00001487 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1488}
1489
Matt Arsenaultb0e04c02019-07-15 19:40:59 +00001490bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1491 MachineInstr &MI, MachineRegisterInfo &MRI,
1492 MachineIRBuilder &B) const {
1493 // TODO: Should move some of this into LegalizerHelper.
1494
1495 // TODO: Promote dynamic indexing of s16 to s32
1496 // TODO: Dynamic s64 indexing is only legal for SGPR.
1497 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1498 if (!IdxVal) // Dynamic case will be selected to register indexing.
1499 return true;
1500
1501 Register Dst = MI.getOperand(0).getReg();
1502 Register Vec = MI.getOperand(1).getReg();
1503
1504 LLT VecTy = MRI.getType(Vec);
1505 LLT EltTy = VecTy.getElementType();
1506 assert(EltTy == MRI.getType(Dst));
1507
1508 B.setInstr(MI);
1509
1510 if (IdxVal.getValue() < VecTy.getNumElements())
1511 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1512 else
1513 B.buildUndef(Dst);
1514
1515 MI.eraseFromParent();
1516 return true;
1517}
1518
Matt Arsenault6ed315f2019-07-15 19:43:04 +00001519bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1520 MachineInstr &MI, MachineRegisterInfo &MRI,
1521 MachineIRBuilder &B) const {
1522 // TODO: Should move some of this into LegalizerHelper.
1523
1524 // TODO: Promote dynamic indexing of s16 to s32
1525 // TODO: Dynamic s64 indexing is only legal for SGPR.
1526 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1527 if (!IdxVal) // Dynamic case will be selected to register indexing.
1528 return true;
1529
1530 Register Dst = MI.getOperand(0).getReg();
1531 Register Vec = MI.getOperand(1).getReg();
1532 Register Ins = MI.getOperand(2).getReg();
1533
1534 LLT VecTy = MRI.getType(Vec);
1535 LLT EltTy = VecTy.getElementType();
1536 assert(EltTy == MRI.getType(Ins));
1537
1538 B.setInstr(MI);
1539
1540 if (IdxVal.getValue() < VecTy.getNumElements())
1541 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1542 else
1543 B.buildUndef(Dst);
1544
1545 MI.eraseFromParent();
1546 return true;
1547}
1548
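// Lower G_FSIN/G_FCOS to the amdgcn sin/cos intrinsics, which take an
// argument pre-scaled by 1/(2*pi). Subtargets with a reduced trig range also
// need the scaled value wrapped into [0, 1) with the fract intrinsic.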
Matt Arsenaultcbd17822019-08-29 20:06:48 +00001549bool AMDGPULegalizerInfo::legalizeSinCos(
1550 MachineInstr &MI, MachineRegisterInfo &MRI,
1551 MachineIRBuilder &B) const {
1552 B.setInstr(MI);
1553
1554 Register DstReg = MI.getOperand(0).getReg();
1555 Register SrcReg = MI.getOperand(1).getReg();
1556 LLT Ty = MRI.getType(DstReg);
1557 unsigned Flags = MI.getFlags();
1558
1559 Register TrigVal;
1560 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1561 if (ST.hasTrigReducedRange()) {
1562 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1563 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1564 .addUse(MulVal.getReg(0))
1565 .setMIFlags(Flags).getReg(0);
1566 } else
1567 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1568
1569 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1570 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1571 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1572 .addUse(TrigVal)
1573 .setMIFlags(Flags);
1574 MI.eraseFromParent();
1575 return true;
1576}
1577
Matt Arsenault77ac4002019-10-01 01:06:43 +00001578bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1579 Register DstReg, LLT PtrTy,
1580 MachineIRBuilder &B, const GlobalValue *GV,
1581 unsigned Offset, unsigned GAFlags) const {
1582 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1583 // to the following code sequence:
1584 //
1585 // For constant address space:
1586 // s_getpc_b64 s[0:1]
1587 // s_add_u32 s0, s0, $symbol
1588 // s_addc_u32 s1, s1, 0
1589 //
1590 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591 // a fixup or relocation is emitted to replace $symbol with a literal
1592 // constant, which is a pc-relative offset from the encoding of the $symbol
1593 // operand to the global variable.
1594 //
1595 // For global address space:
1596 // s_getpc_b64 s[0:1]
1597 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1598 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1599 //
1600 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1601 // fixups or relocations are emitted to replace $symbol@*@lo and
1602 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1603 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1604 // operand to the global variable.
1605 //
1606 // What we want here is an offset from the value returned by s_getpc
1607 // (which is the address of the s_add_u32 instruction) to the global
1608 // variable, but since the encoding of $symbol starts 4 bytes after the start
1609 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1610 // small. This requires us to add 4 to the global variable offset in order to
1611 // compute the correct address.
1612
1613 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1614
1615 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1616 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1617
1618 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1619 .addDef(PCReg);
1620
1621 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1622 if (GAFlags == SIInstrInfo::MO_NONE)
1623 MIB.addImm(0);
1624 else
1625 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1626
1627 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1628
1629 if (PtrTy.getSizeInBits() == 32)
1630 B.buildExtract(DstReg, PCReg, 0);
1631 return true;
1632}
1633
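// Lower G_GLOBAL_VALUE. LDS globals resolve to compiler-allocated constant
// offsets; other globals are addressed with a pc-relative fixup, a
// pc-relative relocation, or a load from the GOT, depending on how the
// symbol must be reached.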
Matt Arsenault64ecca92019-09-09 17:13:44 +00001634bool AMDGPULegalizerInfo::legalizeGlobalValue(
1635 MachineInstr &MI, MachineRegisterInfo &MRI,
1636 MachineIRBuilder &B) const {
1637 Register DstReg = MI.getOperand(0).getReg();
1638 LLT Ty = MRI.getType(DstReg);
1639 unsigned AS = Ty.getAddressSpace();
1640
1641 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1642 MachineFunction &MF = B.getMF();
1643 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault77ac4002019-10-01 01:06:43 +00001644 B.setInstr(MI);
Matt Arsenault64ecca92019-09-09 17:13:44 +00001645
1646 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault64ecca92019-09-09 17:13:44 +00001647 if (!MFI->isEntryFunction()) {
1648 const Function &Fn = MF.getFunction();
1649 DiagnosticInfoUnsupported BadLDSDecl(
1650 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1651 Fn.getContext().diagnose(BadLDSDecl);
1652 }
1653
1654 // TODO: We could emit code to handle the initialization somewhere.
1655 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1656 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1657 MI.eraseFromParent();
1658 return true;
1659 }
Matt Arsenault64ecca92019-09-09 17:13:44 +00001660
Matt Arsenault77ac4002019-10-01 01:06:43 +00001661 const Function &Fn = MF.getFunction();
1662 DiagnosticInfoUnsupported BadInit(
1663 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1664 Fn.getContext().diagnose(BadInit);
1665 return true;
1666 }
1667
1668 const SITargetLowering *TLI = ST.getTargetLowering();
1669
1670 if (TLI->shouldEmitFixup(GV)) {
1671 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1672 MI.eraseFromParent();
1673 return true;
1674 }
1675
1676 if (TLI->shouldEmitPCReloc(GV)) {
1677 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1678 MI.eraseFromParent();
1679 return true;
1680 }
1681
1682 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1683 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1684
1685 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1686 MachinePointerInfo::getGOT(MF),
1687 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1688 MachineMemOperand::MOInvariant,
1689 8 /*Size*/, 8 /*Align*/);
1690
1691 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1692
1693 if (Ty.getSizeInBits() == 32) {
1694    // Truncate if this is a 32-bit constant address.
1695 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1696 B.buildExtract(DstReg, Load, 0);
1697 } else
1698 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1699
1700 MI.eraseFromParent();
Matt Arsenault64ecca92019-09-09 17:13:44 +00001701 return true;
1702}
1703
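// Legalize a load whose pointer type needs widening by casting the pointer
// operand to a 64-bit constant address space pointer and rewriting the load
// in place.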
Matt Arsenaultad6a8b832019-09-10 16:42:31 +00001704bool AMDGPULegalizerInfo::legalizeLoad(
1705 MachineInstr &MI, MachineRegisterInfo &MRI,
1706 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1707 B.setInstr(MI);
1708 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1709 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1710 Observer.changingInstr(MI);
1711 MI.getOperand(1).setReg(Cast.getReg(0));
1712 Observer.changedInstr(MI);
1713 return true;
1714}
1715
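// G_FMAD is only kept when denormals are flushed for the type; otherwise it
// is expanded into separate multiply and add operations by the helper.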
Matt Arsenault4d339182019-09-13 00:44:35 +00001716bool AMDGPULegalizerInfo::legalizeFMad(
1717 MachineInstr &MI, MachineRegisterInfo &MRI,
1718 MachineIRBuilder &B) const {
1719 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1720 assert(Ty.isScalar());
1721
1722 // TODO: Always legal with future ftz flag.
1723 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1724 return true;
1725 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1726 return true;
1727
1728 MachineFunction &MF = B.getMF();
1729
1730 MachineIRBuilder HelperBuilder(MI);
1731 GISelObserverWrapper DummyObserver;
1732 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1733 HelperBuilder.setMBB(*MI.getParent());
1734 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1735}
1736
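// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target pseudo, which
// takes the new value and the compare value packed together in one vector
// register.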
Matt Arsenault171cf532019-10-08 10:04:41 -07001737bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1738 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1739 Register DstReg = MI.getOperand(0).getReg();
1740 Register PtrReg = MI.getOperand(1).getReg();
1741 Register CmpVal = MI.getOperand(2).getReg();
1742 Register NewVal = MI.getOperand(3).getReg();
1743
1744 assert(SITargetLowering::isFlatGlobalAddrSpace(
1745 MRI.getType(PtrReg).getAddressSpace()) &&
1746 "this should not have been custom lowered");
1747
1748 LLT ValTy = MRI.getType(CmpVal);
1749 LLT VecTy = LLT::vector(2, ValTy);
1750
1751 B.setInstr(MI);
1752 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1753
1754 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1755 .addDef(DstReg)
1756 .addUse(PtrReg)
1757 .addUse(PackedVal)
1758 .setMemRefs(MI.memoperands());
1759
1760 MI.eraseFromParent();
1761 return true;
1762}
1763
Matt Arsenaulte15770a2019-07-01 18:40:23 +00001764// Return the use branch instruction, or null if the usage is invalid.
1765static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1766 MachineRegisterInfo &MRI) {
1767 Register CondDef = MI.getOperand(0).getReg();
1768 if (!MRI.hasOneNonDBGUse(CondDef))
1769 return nullptr;
1770
1771 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1772 return UseMI.getParent() == MI.getParent() &&
1773 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1774}
1775
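// Return the virtual register that holds the given live-in physical register,
// creating a new one of type Ty if none has been assigned yet.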
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001776Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1777 Register Reg, LLT Ty) const {
1778 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1779 if (LiveIn)
1780 return LiveIn;
1781
1782 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1783 MRI.addLiveIn(Reg, NewReg);
1784 return NewReg;
1785}
1786
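// Copy a preloaded argument register into DstReg. Arguments packed into a
// shared register are unpacked with a shift and mask first.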
1787bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1788 const ArgDescriptor *Arg) const {
Matt Arsenault25156ae2019-09-05 02:20:29 +00001789 if (!Arg->isRegister() || !Arg->getRegister().isValid())
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001790 return false; // TODO: Handle these
1791
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001792 assert(Arg->getRegister().isPhysical());
1793
1794 MachineRegisterInfo &MRI = *B.getMRI();
1795
1796 LLT Ty = MRI.getType(DstReg);
1797 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1798
1799 if (Arg->isMasked()) {
1800 // TODO: Should we try to emit this once in the entry block?
1801 const LLT S32 = LLT::scalar(32);
1802 const unsigned Mask = Arg->getMask();
1803 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1804
Matt Arsenault8f6bdb72019-10-01 01:44:46 +00001805 Register AndMaskSrc = LiveIn;
1806
1807 if (Shift != 0) {
1808 auto ShiftAmt = B.buildConstant(S32, Shift);
1809 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1810 }
1811
1812 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001813 } else
1814 B.buildCopy(DstReg, LiveIn);
1815
1816  // Insert the argument copy if it doesn't already exist.
1817 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1818 if (!MRI.getVRegDef(LiveIn)) {
Matt Arsenault69b1a2a2019-09-05 02:20:32 +00001819 // FIXME: Should have scoped insert pt
1820 MachineBasicBlock &OrigInsBB = B.getMBB();
1821 auto OrigInsPt = B.getInsertPt();
1822
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001823 MachineBasicBlock &EntryMBB = B.getMF().front();
1824 EntryMBB.addLiveIn(Arg->getRegister());
1825 B.setInsertPt(EntryMBB, EntryMBB.begin());
1826 B.buildCopy(LiveIn, Arg->getRegister());
Matt Arsenault69b1a2a2019-09-05 02:20:32 +00001827
1828 B.setInsertPt(OrigInsBB, OrigInsPt);
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001829 }
1830
1831 return true;
1832}
1833
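// Legalize an intrinsic that reads a preloaded argument by replacing it with
// a copy from the corresponding live-in argument register.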
1834bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1835 MachineInstr &MI,
1836 MachineRegisterInfo &MRI,
1837 MachineIRBuilder &B,
1838 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1839 B.setInstr(MI);
1840
1841 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1842
1843 const ArgDescriptor *Arg;
1844 const TargetRegisterClass *RC;
1845 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1846 if (!Arg) {
1847 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1848 return false;
1849 }
1850
1851 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1852 MI.eraseFromParent();
1853 return true;
1854 }
1855
1856 return false;
1857}
1858
Austin Kerbow97263fa2019-10-21 22:18:26 +00001859bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1860 MachineRegisterInfo &MRI,
1861 MachineIRBuilder &B) const {
1862 B.setInstr(MI);
Austin Kerbowc35b3582019-10-22 17:39:26 -07001863 Register Dst = MI.getOperand(0).getReg();
1864 LLT DstTy = MRI.getType(Dst);
1865 LLT S16 = LLT::scalar(16);
Austin Kerbow2b88b342019-10-29 09:55:49 -07001866 LLT S32 = LLT::scalar(32);
Austin Kerbow97263fa2019-10-21 22:18:26 +00001867
1868 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1869 return true;
1870
Austin Kerbowc35b3582019-10-22 17:39:26 -07001871 if (DstTy == S16)
1872 return legalizeFDIV16(MI, MRI, B);
Austin Kerbow2b88b342019-10-29 09:55:49 -07001873 if (DstTy == S32)
1874 return legalizeFDIV32(MI, MRI, B);
Austin Kerbowc35b3582019-10-22 17:39:26 -07001875
Austin Kerbow97263fa2019-10-21 22:18:26 +00001876 return false;
1877}
1878
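// Try to expand fdiv with the rcp intrinsic when unsafe FP math or the arcp
// flag permits the reduced precision: 1/x and -1/x become a single rcp, and
// x/y becomes x * rcp(y). Returns false if the fast expansion is not allowed.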
1879bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1880 MachineRegisterInfo &MRI,
1881 MachineIRBuilder &B) const {
1882 Register Res = MI.getOperand(0).getReg();
1883 Register LHS = MI.getOperand(1).getReg();
1884 Register RHS = MI.getOperand(2).getReg();
1885
1886 uint16_t Flags = MI.getFlags();
1887
1888 LLT ResTy = MRI.getType(Res);
1889 LLT S32 = LLT::scalar(32);
1890 LLT S64 = LLT::scalar(64);
1891
1892 const MachineFunction &MF = B.getMF();
1893 bool Unsafe =
1894 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1895
1896 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1897 return false;
1898
1899 if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1900 return false;
1901
1902 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1903 // 1 / x -> RCP(x)
1904 if (CLHS->isExactlyValue(1.0)) {
1905 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1906 .addUse(RHS)
1907 .setMIFlags(Flags);
1908
1909 MI.eraseFromParent();
1910 return true;
1911 }
1912
1913 // -1 / x -> RCP( FNEG(x) )
1914 if (CLHS->isExactlyValue(-1.0)) {
1915 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1916 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1917 .addUse(FNeg.getReg(0))
1918 .setMIFlags(Flags);
1919
1920 MI.eraseFromParent();
1921 return true;
1922 }
1923 }
1924
1925 // x / y -> x * (1.0 / y)
1926 if (Unsafe) {
1927 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1928 .addUse(RHS)
1929 .setMIFlags(Flags);
1930 B.buildFMul(Res, LHS, RCP, Flags);
1931
1932 MI.eraseFromParent();
1933 return true;
1934 }
1935
1936 return false;
1937}
1938
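// Lower f16 fdiv by promoting both operands to f32, multiplying by the f32
// reciprocal, truncating back to f16, and fixing up the quotient's special
// cases with div_fixup.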
Austin Kerbowc35b3582019-10-22 17:39:26 -07001939bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1940 MachineRegisterInfo &MRI,
1941 MachineIRBuilder &B) const {
1942 B.setInstr(MI);
1943 Register Res = MI.getOperand(0).getReg();
1944 Register LHS = MI.getOperand(1).getReg();
1945 Register RHS = MI.getOperand(2).getReg();
1946
1947 uint16_t Flags = MI.getFlags();
1948
1949 LLT S16 = LLT::scalar(16);
1950 LLT S32 = LLT::scalar(32);
1951
1952 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1953 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1954
1955 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1956 .addUse(RHSExt.getReg(0))
1957 .setMIFlags(Flags);
1958
1959 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1960 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1961
1962 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1963 .addUse(RDst.getReg(0))
1964 .addUse(RHS)
1965 .addUse(LHS)
1966 .setMIFlags(Flags);
1967
1968 MI.eraseFromParent();
1969 return true;
1970}
1971
Austin Kerbow2b88b342019-10-29 09:55:49 -07001972// Enable or disable FP32 denorm mode. When 'Enable' is true, emit
1973// instructions to enable denorm mode; when it is false, disable denorm mode.
1974static void toggleSPDenormMode(bool Enable,
1975 const GCNSubtarget &ST,
1976 MachineIRBuilder &B) {
1977 // Set SP denorm mode to this value.
1978 unsigned SPDenormMode =
1979 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1980
1981 if (ST.hasDenormModeInst()) {
1982 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
1983 unsigned DPDenormModeDefault = ST.hasFP64Denormals()
1984 ? FP_DENORM_FLUSH_NONE
1985 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1986
1987 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
1988 B.buildInstr(AMDGPU::S_DENORM_MODE)
1989 .addImm(NewDenormModeValue);
1990
1991 } else {
1992 // Select FP32 bit field in mode register.
1993 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
1994 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
1995 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
1996
1997 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
1998 .addImm(SPDenormMode)
1999 .addImm(SPDenormModeBitField);
2000 }
2001}
2002
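// Lower f32 fdiv with the full-precision sequence: div_scale both operands,
// refine an initial reciprocal estimate with Newton-Raphson FMA steps, then
// combine the pieces with div_fmas and div_fixup to undo the scaling and
// handle the special-case inputs.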
2003bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2004 MachineRegisterInfo &MRI,
2005 MachineIRBuilder &B) const {
2006 B.setInstr(MI);
2007 Register Res = MI.getOperand(0).getReg();
2008 Register LHS = MI.getOperand(1).getReg();
2009 Register RHS = MI.getOperand(2).getReg();
2010
2011 uint16_t Flags = MI.getFlags();
2012
2013 LLT S32 = LLT::scalar(32);
2014 LLT S1 = LLT::scalar(1);
2015
2016 auto One = B.buildFConstant(S32, 1.0f);
2017
2018 auto DenominatorScaled =
2019 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2020 .addUse(RHS)
2021 .addUse(RHS)
2022 .addUse(LHS)
2023 .setMIFlags(Flags);
2024 auto NumeratorScaled =
2025 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2026 .addUse(LHS)
2027 .addUse(RHS)
2028 .addUse(LHS)
2029 .setMIFlags(Flags);
2030
2031 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2032 .addUse(DenominatorScaled.getReg(0))
2033 .setMIFlags(Flags);
2034 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2035
2036 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2037 // aren't modeled as reading it.
2038 if (!ST.hasFP32Denormals())
2039 toggleSPDenormMode(true, ST, B);
2040
2041 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2042 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2043 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2044 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2045 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2046 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2047
2048 if (!ST.hasFP32Denormals())
2049 toggleSPDenormMode(false, ST, B);
2050
2051 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2052 .addUse(Fma4.getReg(0))
2053 .addUse(Fma1.getReg(0))
2054 .addUse(Fma3.getReg(0))
2055 .addUse(NumeratorScaled.getReg(1))
2056 .setMIFlags(Flags);
2057
2058 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2059 .addUse(Fmas.getReg(0))
2060 .addUse(RHS)
2061 .addUse(LHS)
2062 .setMIFlags(Flags);
2063
2064 MI.eraseFromParent();
2065 return true;
2066}
2067
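// Legalize amdgcn.fdiv.fast: when |rhs| > 2^96 the denominator is pre-scaled
// by 2^-32 so its reciprocal cannot underflow, and the quotient is multiplied
// by the same scale factor afterwards to compensate.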
Austin Kerbow97263fa2019-10-21 22:18:26 +00002068bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2069 MachineRegisterInfo &MRI,
2070 MachineIRBuilder &B) const {
Austin Kerbowc99f62e2019-07-30 18:49:16 +00002071 B.setInstr(MI);
2072 Register Res = MI.getOperand(0).getReg();
2073 Register LHS = MI.getOperand(2).getReg();
2074 Register RHS = MI.getOperand(3).getReg();
2075 uint16_t Flags = MI.getFlags();
2076
2077 LLT S32 = LLT::scalar(32);
2078 LLT S1 = LLT::scalar(1);
2079
2080 auto Abs = B.buildFAbs(S32, RHS, Flags);
2081 const APFloat C0Val(1.0f);
2082
2083 auto C0 = B.buildConstant(S32, 0x6f800000);
2084 auto C1 = B.buildConstant(S32, 0x2f800000);
2085 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2086
2087 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2088 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2089
2090 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2091
2092 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093 .addUse(Mul0.getReg(0))
2094 .setMIFlags(Flags);
2095
2096 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2097
2098 B.buildFMul(Res, Sel, Mul1, Flags);
2099
2100 MI.eraseFromParent();
2101 return true;
2102}
2103
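// Lower the implicitarg.ptr intrinsic. In a kernel it is the kernarg segment
// pointer plus the implicit parameter offset; in a callable function it is a
// separately preloaded argument register.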
Matt Arsenault9e8e8c62019-07-01 18:49:01 +00002104bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2105 MachineRegisterInfo &MRI,
2106 MachineIRBuilder &B) const {
2107 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2108 if (!MFI->isEntryFunction()) {
2109 return legalizePreloadedArgIntrin(MI, MRI, B,
2110 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2111 }
2112
2113 B.setInstr(MI);
2114
2115 uint64_t Offset =
2116 ST.getTargetLowering()->getImplicitParameterOffset(
2117 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2118 Register DstReg = MI.getOperand(0).getReg();
2119 LLT DstTy = MRI.getType(DstReg);
2120 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2121
2122 const ArgDescriptor *Arg;
2123 const TargetRegisterClass *RC;
2124 std::tie(Arg, RC)
2125 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2126 if (!Arg)
2127 return false;
2128
2129 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2130 if (!loadInputValue(KernargPtrReg, B, Arg))
2131 return false;
2132
Daniel Sanderse74c5b92019-11-01 13:18:00 -07002133 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
Matt Arsenault9e8e8c62019-07-01 18:49:01 +00002134 MI.eraseFromParent();
2135 return true;
2136}
2137
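// Lower amdgcn.is.shared/is.private by comparing the high 32 bits of the
// flat pointer against the aperture of the corresponding address space.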
Matt Arsenaultf581d572019-09-05 02:20:39 +00002138bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2139 MachineRegisterInfo &MRI,
2140 MachineIRBuilder &B,
2141 unsigned AddrSpace) const {
2142 B.setInstr(MI);
2143 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2144 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2145 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2146 MI.eraseFromParent();
2147 return true;
2148}
2149
Matt Arsenault3ecab8e2019-09-19 16:26:14 +00002150/// Handle register layout difference for f16 images for some subtargets.
2151Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2152 MachineRegisterInfo &MRI,
2153 Register Reg) const {
2154 if (!ST.hasUnpackedD16VMem())
2155 return Reg;
2156
2157 const LLT S16 = LLT::scalar(16);
2158 const LLT S32 = LLT::scalar(32);
2159 LLT StoreVT = MRI.getType(Reg);
2160 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2161
2162 auto Unmerge = B.buildUnmerge(S16, Reg);
2163
2164 SmallVector<Register, 4> WideRegs;
2165 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2166 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2167
2168 int NumElts = StoreVT.getNumElements();
2169
2170 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2171}
2172
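// Legalize raw buffer store intrinsics: i8 and i16 data is widened to 32
// bits, and on subtargets with unpacked D16 memory instructions, f16 vector
// data for format stores is split into 32-bit elements first.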
2173bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2174 MachineRegisterInfo &MRI,
2175 MachineIRBuilder &B,
2176 bool IsFormat) const {
2177  // TODO: Reject f16 format on targets where it is unsupported.
2178 Register VData = MI.getOperand(1).getReg();
2179 LLT Ty = MRI.getType(VData);
2180
2181 B.setInstr(MI);
2182
2183 const LLT S32 = LLT::scalar(32);
2184 const LLT S16 = LLT::scalar(16);
2185
2186 // Fixup illegal register types for i8 stores.
2187 if (Ty == LLT::scalar(8) || Ty == S16) {
2188 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2189 MI.getOperand(1).setReg(AnyExt);
2190 return true;
2191 }
2192
2193 if (Ty.isVector()) {
2194 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2195 if (IsFormat)
2196 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2197 return true;
2198 }
2199
2200 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2201 }
2202
2203 return Ty == S32;
2204}
2205
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002206bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2207 MachineRegisterInfo &MRI,
2208 MachineIRBuilder &B) const {
2209 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
Matt Arsenault86f864da2019-10-02 01:02:27 +00002210 switch (MI.getIntrinsicID()) {
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002211 case Intrinsic::amdgcn_if: {
2212 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2213 const SIRegisterInfo *TRI
2214 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2215
2216 B.setInstr(*BrCond);
2217 Register Def = MI.getOperand(1).getReg();
2218 Register Use = MI.getOperand(3).getReg();
2219 B.buildInstr(AMDGPU::SI_IF)
2220 .addDef(Def)
2221 .addUse(Use)
2222 .addMBB(BrCond->getOperand(1).getMBB());
2223
2224 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2225 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2226 MI.eraseFromParent();
2227 BrCond->eraseFromParent();
2228 return true;
2229 }
2230
2231 return false;
2232 }
2233 case Intrinsic::amdgcn_loop: {
2234 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2235 const SIRegisterInfo *TRI
2236 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2237
2238 B.setInstr(*BrCond);
2239 Register Reg = MI.getOperand(2).getReg();
2240 B.buildInstr(AMDGPU::SI_LOOP)
2241 .addUse(Reg)
2242 .addMBB(BrCond->getOperand(1).getMBB());
2243 MI.eraseFromParent();
2244 BrCond->eraseFromParent();
2245 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2246 return true;
2247 }
2248
2249 return false;
2250 }
Matt Arsenault9e8e8c62019-07-01 18:49:01 +00002251 case Intrinsic::amdgcn_kernarg_segment_ptr:
2252 return legalizePreloadedArgIntrin(
2253 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2254 case Intrinsic::amdgcn_implicitarg_ptr:
2255 return legalizeImplicitArgPtr(MI, MRI, B);
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00002256 case Intrinsic::amdgcn_workitem_id_x:
2257 return legalizePreloadedArgIntrin(MI, MRI, B,
2258 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2259 case Intrinsic::amdgcn_workitem_id_y:
2260 return legalizePreloadedArgIntrin(MI, MRI, B,
2261 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2262 case Intrinsic::amdgcn_workitem_id_z:
2263 return legalizePreloadedArgIntrin(MI, MRI, B,
2264 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
Matt Arsenault756d8192019-07-01 18:47:22 +00002265 case Intrinsic::amdgcn_workgroup_id_x:
2266 return legalizePreloadedArgIntrin(MI, MRI, B,
2267 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2268 case Intrinsic::amdgcn_workgroup_id_y:
2269 return legalizePreloadedArgIntrin(MI, MRI, B,
2270 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2271 case Intrinsic::amdgcn_workgroup_id_z:
2272 return legalizePreloadedArgIntrin(MI, MRI, B,
2273 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Matt Arsenaultbae36362019-07-01 18:50:50 +00002274 case Intrinsic::amdgcn_dispatch_ptr:
2275 return legalizePreloadedArgIntrin(MI, MRI, B,
2276 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2277 case Intrinsic::amdgcn_queue_ptr:
2278 return legalizePreloadedArgIntrin(MI, MRI, B,
2279 AMDGPUFunctionArgInfo::QUEUE_PTR);
2280 case Intrinsic::amdgcn_implicit_buffer_ptr:
2281 return legalizePreloadedArgIntrin(
2282 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2283 case Intrinsic::amdgcn_dispatch_id:
2284 return legalizePreloadedArgIntrin(MI, MRI, B,
2285 AMDGPUFunctionArgInfo::DISPATCH_ID);
Austin Kerbowc99f62e2019-07-30 18:49:16 +00002286 case Intrinsic::amdgcn_fdiv_fast:
Austin Kerbow97263fa2019-10-21 22:18:26 +00002287 return legalizeFDIVFastIntrin(MI, MRI, B);
Matt Arsenaultf581d572019-09-05 02:20:39 +00002288 case Intrinsic::amdgcn_is_shared:
2289 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2290 case Intrinsic::amdgcn_is_private:
2291 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
Matt Arsenault8e3bc9b2019-09-09 15:20:49 +00002292 case Intrinsic::amdgcn_wavefrontsize: {
2293 B.setInstr(MI);
2294 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2295 MI.eraseFromParent();
2296 return true;
2297 }
Matt Arsenault3ecab8e2019-09-19 16:26:14 +00002298 case Intrinsic::amdgcn_raw_buffer_store:
2299 return legalizeRawBufferStore(MI, MRI, B, false);
2300 case Intrinsic::amdgcn_raw_buffer_store_format:
2301 return legalizeRawBufferStore(MI, MRI, B, true);
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002302 default:
2303 return true;
2304 }
2305
2306 return true;
2307}