//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

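// Return true if the type at TypeIdx is at most MaxSize bits and its scalar
// element size is a multiple of 32 bits.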
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

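// Return true if the type at TypeIdx is exactly Size bits.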
static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

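// Return true for odd-element-count vectors of sub-32-bit elements whose total
// size is not a multiple of 32 bits (e.g. v3s16).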
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

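// Return true for 16-bit element vectors with more than two elements.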
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

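// Mutation that pads the vector at TypeIdx with one additional element of the
// same element type.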
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

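// Mutation that splits a wide vector into pieces of at most 64 bits each.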
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements so the total size is the next
// multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32- or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

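// Return true for scalars wider than 32 bits that are truncated to a narrower
// memory size (a wide truncating store).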
static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
    G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
    .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

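  // Size-comparison predicates between two type indices, used below to widen
  // or narrow the integer operand of G_INTTOPTR / G_PTRTOINT to the pointer
  // size.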
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

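  // Return true if a load or store must be split into smaller pieces: vector
  // extloads, accesses wider than the address space allows, 96-bit accesses
  // without dwordx3 support, and under-aligned accesses the target cannot
  // perform.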
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

669 // TODO: Refine based on subtargets which support unaligned access or 128-bit
670 // LDS
671 // TODO: Unsupported flat for SI.
672
673 for (unsigned Op : {G_LOAD, G_STORE}) {
674 const bool IsStore = Op == G_STORE;
675
676 auto &Actions = getActionDefinitionsBuilder(Op);
677 // Whitelist the common cases.
678 // TODO: Pointer loads
679 // TODO: Wide constant loads
680 // TODO: Only CI+ has 3x loads
681 // TODO: Loads to s16 on gfx9
682 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683 {V2S32, GlobalPtr, 64, GlobalAlign32},
684 {V3S32, GlobalPtr, 96, GlobalAlign32},
685 {S96, GlobalPtr, 96, GlobalAlign32},
686 {V4S32, GlobalPtr, 128, GlobalAlign32},
687 {S128, GlobalPtr, 128, GlobalAlign32},
688 {S64, GlobalPtr, 64, GlobalAlign32},
689 {V2S64, GlobalPtr, 128, GlobalAlign32},
690 {V2S16, GlobalPtr, 32, GlobalAlign32},
691 {S32, GlobalPtr, 8, GlobalAlign8},
692 {S32, GlobalPtr, 16, GlobalAlign16},
693
694 {S32, LocalPtr, 32, 32},
695 {S64, LocalPtr, 64, 32},
696 {V2S32, LocalPtr, 64, 32},
697 {S32, LocalPtr, 8, 8},
698 {S32, LocalPtr, 16, 16},
699 {V2S16, LocalPtr, 32, 32},
700
701 {S32, PrivatePtr, 32, 32},
702 {S32, PrivatePtr, 8, 8},
703 {S32, PrivatePtr, 16, 16},
704 {V2S16, PrivatePtr, 32, 32},
705
706 {S32, FlatPtr, 32, GlobalAlign32},
707 {S32, FlatPtr, 16, GlobalAlign16},
708 {S32, FlatPtr, 8, GlobalAlign8},
709 {V2S16, FlatPtr, 32, GlobalAlign32},
710
711 {S32, ConstantPtr, 32, GlobalAlign32},
712 {V2S32, ConstantPtr, 64, GlobalAlign32},
713 {V3S32, ConstantPtr, 96, GlobalAlign32},
714 {V4S32, ConstantPtr, 128, GlobalAlign32},
715 {S64, ConstantPtr, 64, GlobalAlign32},
716 {S128, ConstantPtr, 128, GlobalAlign32},
717 {V2S32, ConstantPtr, 32, GlobalAlign32}});
718 Actions
Matt Arsenaultad6a8b832019-09-10 16:42:31 +0000719 .customIf(typeIs(1, Constant32Ptr))
Matt Arsenaultc0ceca52019-09-10 16:20:14 +0000720 .narrowScalarIf(
721 [=](const LegalityQuery &Query) -> bool {
722 return !Query.Types[0].isVector() && needToSplitLoad(Query);
723 },
724 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725 const LLT DstTy = Query.Types[0];
726 const LLT PtrTy = Query.Types[1];
727
728 const unsigned DstSize = DstTy.getSizeInBits();
729 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
730
731 // Split extloads.
732 if (DstSize > MemSize)
733 return std::make_pair(0, LLT::scalar(MemSize));
734
735 if (DstSize > 32 && (DstSize % 32 != 0)) {
736 // FIXME: Need a way to specify non-extload of larger size if
737 // suitably aligned.
738 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
739 }
740
741 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742 if (MemSize > MaxSize)
743 return std::make_pair(0, LLT::scalar(MaxSize));
744
745 unsigned Align = Query.MMODescrs[0].AlignInBits;
746 return std::make_pair(0, LLT::scalar(Align));
747 })
748 .fewerElementsIf(
749 [=](const LegalityQuery &Query) -> bool {
750 return Query.Types[0].isVector() && needToSplitLoad(Query);
751 },
752 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753 const LLT DstTy = Query.Types[0];
754 const LLT PtrTy = Query.Types[1];
755
756 LLT EltTy = DstTy.getElementType();
757 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
758
759 // Split if it's too large for the address space.
760 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761 unsigned NumElts = DstTy.getNumElements();
762 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
763
764 // FIXME: Refine when odd breakdowns handled
765 // The scalars will need to be re-legalized.
766 if (NumPieces == 1 || NumPieces >= NumElts ||
767 NumElts % NumPieces != 0)
768 return std::make_pair(0, EltTy);
769
770 return std::make_pair(0,
771 LLT::vector(NumElts / NumPieces, EltTy));
772 }
773
774 // Need to split because of alignment.
775 unsigned Align = Query.MMODescrs[0].AlignInBits;
776 unsigned EltSize = EltTy.getSizeInBits();
777 if (EltSize > Align &&
778 (EltSize / Align < DstTy.getNumElements())) {
779 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
780 }
781
782 // May need relegalization for the scalars.
783 return std::make_pair(0, EltTy);
784 })
785 .minScalar(0, S32);
786
787 if (IsStore)
788 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
789
790 // TODO: Need a bitcast lower option?
791 Actions
792 .legalIf([=](const LegalityQuery &Query) {
793 const LLT Ty0 = Query.Types[0];
794 unsigned Size = Ty0.getSizeInBits();
795 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796 unsigned Align = Query.MMODescrs[0].AlignInBits;
797
798 // No extending vector loads.
799 if (Size > MemSize && Ty0.isVector())
800 return false;
801
802 // FIXME: Widening store from alignment not valid.
803 if (MemSize < Size)
804 MemSize = std::max(MemSize, Align);
805
806 switch (MemSize) {
807 case 8:
808 case 16:
809 return Size == 32;
810 case 32:
811 case 64:
812 case 128:
813 return true;
814 case 96:
815 return ST.hasDwordx3LoadStores();
816 case 256:
817 case 512:
818 return true;
819 default:
820 return false;
821 }
822 })
823 .widenScalarToNextPow2(0)
824 // TODO: v3s32->v4s32 with alignment
825 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
826 }
827
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

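// Return a register holding the 32-bit aperture base for the local or private
// address space, read from the aperture registers when available and otherwise
// loaded from the queue pointer.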
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

1200bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1201 MachineInstr &MI, MachineRegisterInfo &MRI,
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001202 MachineIRBuilder &B) const {
1203 MachineFunction &MF = B.getMF();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001204
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001205 B.setInstr(MI);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001206
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001207 const LLT S32 = LLT::scalar(32);
Matt Arsenaulte3a676e2019-06-24 15:50:29 +00001208 Register Dst = MI.getOperand(0).getReg();
1209 Register Src = MI.getOperand(1).getReg();
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001210
1211 LLT DstTy = MRI.getType(Dst);
1212 LLT SrcTy = MRI.getType(Src);
1213 unsigned DestAS = DstTy.getAddressSpace();
1214 unsigned SrcAS = SrcTy.getAddressSpace();
1215
1216 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1217 // vector element.
1218 assert(!DstTy.isVector());
1219
1220 const AMDGPUTargetMachine &TM
1221 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1222
1223 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1224 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001225 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001226 return true;
1227 }
1228
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001229 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1230 // Truncate.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001231 B.buildExtract(Dst, Src, 0);
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001232 MI.eraseFromParent();
1233 return true;
1234 }
1235
1236 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1237 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1238 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1239
1240    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1241    // another pointer. Merge operands are required to be the same type, but
1242    // creating an extra ptrtoint would be kind of pointless.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001243 auto HighAddr = B.buildConstant(
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001244 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001245 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001246 MI.eraseFromParent();
1247 return true;
1248 }
1249
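  // Flat -> local/private: the segment offset is the low 32 bits of the flat
  // pointer, and a flat null must map to the segment's null value, hence the
  // compare and select below.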
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001250 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1251 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1252 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1253 unsigned NullVal = TM.getNullPointerValue(DestAS);
1254
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001255 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1256 auto FlatNull = B.buildConstant(SrcTy, 0);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001257
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001258 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001259
1260 // Extract low 32-bits of the pointer.
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001261 B.buildExtract(PtrLo32, Src, 0);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001262
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001263 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001264 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1265 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001266
1267 MI.eraseFromParent();
1268 return true;
1269 }
1270
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001271 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1272 return false;
1273
1274 if (!ST.hasFlatAddressSpace())
1275 return false;
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001276
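  // Local/private -> flat: merge the 32-bit segment offset (low half) with
  // the segment aperture base (high half), again selecting the flat null
  // value when the source is the segment null.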
Amara Emerson946b1242019-04-15 05:04:20 +00001277 auto SegmentNull =
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001278 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
Amara Emerson946b1242019-04-15 05:04:20 +00001279 auto FlatNull =
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001280 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001281
Matt Arsenaultd7cad4f2019-10-04 08:35:38 +00001282 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
Matt Arsenault25156ae2019-09-05 02:20:29 +00001283 if (!ApertureReg.isValid())
1284 return false;
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001285
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001286 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001287 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001288
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001289 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001290
1291 // Coerce the type of the low half of the result so we can use merge_values.
Matt Arsenault5c7e96dc2019-08-28 00:58:24 +00001292 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001293 B.buildInstr(TargetOpcode::G_PTRTOINT)
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001294 .addDef(SrcAsInt)
1295 .addUse(Src);
1296
1297 // TODO: Should we allow mismatched types but matching sizes in merges to
1298 // avoid the ptrtoint?
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001299 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1300 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
Matt Arsenaulta8b43392019-02-08 02:40:47 +00001301
1302 MI.eraseFromParent();
1303 return true;
1304}
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001305
1306bool AMDGPULegalizerInfo::legalizeFrint(
1307 MachineInstr &MI, MachineRegisterInfo &MRI,
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001308 MachineIRBuilder &B) const {
1309 B.setInstr(MI);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001310
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001311 Register Src = MI.getOperand(1).getReg();
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001312 LLT Ty = MRI.getType(Src);
1313 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1314
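  // Round via the 2^52 trick: for |x| < 2^52, (x + copysign(2^52, x)) -
  // copysign(2^52, x) rounds away the fraction because a double's mantissa
  // holds exactly 52 bits. C2 is the largest double below 2^52; any input of
  // greater magnitude is already integral and is passed through by the final
  // select.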
1315 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1316 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1317
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001318 auto C1 = B.buildFConstant(Ty, C1Val);
1319 auto CopySign = B.buildFCopysign(Ty, C1, Src);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001320
1321 // TODO: Should this propagate fast-math-flags?
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001322 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1323 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001324
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001325 auto C2 = B.buildFConstant(Ty, C2Val);
1326 auto Fabs = B.buildFAbs(Ty, Src);
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001327
Austin Kerbow06c8cb02019-09-09 23:06:13 +00001328 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1329 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
Matt Arsenault6aafc5e2019-05-17 12:19:57 +00001330  return true;
1331}
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001332
Matt Arsenaulta510b572019-05-17 12:20:05 +00001333bool AMDGPULegalizerInfo::legalizeFceil(
1334 MachineInstr &MI, MachineRegisterInfo &MRI,
1335 MachineIRBuilder &B) const {
1336 B.setInstr(MI);
1337
Matt Arsenault1a02d302019-05-17 12:59:27 +00001338 const LLT S1 = LLT::scalar(1);
1339 const LLT S64 = LLT::scalar(64);
1340
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001341 Register Src = MI.getOperand(1).getReg();
Matt Arsenault1a02d302019-05-17 12:59:27 +00001342 assert(MRI.getType(Src) == S64);
Matt Arsenaulta510b572019-05-17 12:20:05 +00001343
1344 // result = trunc(src)
1345 // if (src > 0.0 && src != result)
1346 // result += 1.0
1347
Matt Arsenaulta510b572019-05-17 12:20:05 +00001348 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1349
Matt Arsenaulta510b572019-05-17 12:20:05 +00001350 const auto Zero = B.buildFConstant(S64, 0.0);
1351 const auto One = B.buildFConstant(S64, 1.0);
1352  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1353  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1354  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1355 auto Add = B.buildSelect(S64, And, One, Zero);
1356
1357 // TODO: Should this propagate fast-math-flags?
1358 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1359  return true;
1360}
1361
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001362static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1363 MachineIRBuilder &B) {
1364 const unsigned FractBits = 52;
1365 const unsigned ExpBits = 11;
1366 LLT S32 = LLT::scalar(32);
1367
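  // The 11-bit biased exponent occupies bits [62:52] of the double, which is
  // offset 20 (= 52 - 32) within the high 32-bit word; extract it with ubfe
  // and subtract the exponent bias of 1023.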
1368 auto Const0 = B.buildConstant(S32, FractBits - 32);
1369 auto Const1 = B.buildConstant(S32, ExpBits);
1370
1371  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
1372    .addUse(Const0.getReg(0))
1373    .addUse(Const1.getReg(0));
1374
1375 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1376}
1377
1378bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1379 MachineInstr &MI, MachineRegisterInfo &MRI,
1380 MachineIRBuilder &B) const {
1381 B.setInstr(MI);
1382
Matt Arsenault1a02d302019-05-17 12:59:27 +00001383 const LLT S1 = LLT::scalar(1);
1384 const LLT S32 = LLT::scalar(32);
1385 const LLT S64 = LLT::scalar(64);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001386
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001387 Register Src = MI.getOperand(1).getReg();
Matt Arsenault1a02d302019-05-17 12:59:27 +00001388 assert(MRI.getType(Src) == S64);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001389
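  // Truncate by masking: compute the unbiased exponent, build a mask that
  // clears the fraction bits below the binary point, and AND it with the
  // source. An exponent < 0 yields a signed zero (via SignBit64), and an
  // exponent > 51 means the value is already integral, so it is returned
  // unchanged.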
1390 // TODO: Should this use extract since the low half is unused?
1391 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001392 Register Hi = Unmerge.getReg(1);
Matt Arsenault6aebcd52019-05-17 12:20:01 +00001393
1394 // Extract the upper half, since this is where we will find the sign and
1395 // exponent.
1396 auto Exp = extractF64Exponent(Hi, B);
1397
1398 const unsigned FractBits = 52;
1399
1400 // Extract the sign bit.
1401 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1402 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1403
1404 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1405
1406 const auto Zero32 = B.buildConstant(S32, 0);
1407
1408 // Extend back to 64-bits.
1409 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1410
1411 auto Shr = B.buildAShr(S64, FractMask, Exp);
1412 auto Not = B.buildNot(S64, Shr);
1413 auto Tmp0 = B.buildAnd(S64, Src, Not);
1414 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1415
1416 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1417 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1418
1419 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1420 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
1421  return true;
1422}
Matt Arsenault2f292202019-05-17 23:05:18 +00001423
1424bool AMDGPULegalizerInfo::legalizeITOFP(
1425 MachineInstr &MI, MachineRegisterInfo &MRI,
1426 MachineIRBuilder &B, bool Signed) const {
1427 B.setInstr(MI);
1428
Matt Arsenaultfaeaedf2019-06-24 16:16:12 +00001429 Register Dst = MI.getOperand(0).getReg();
1430 Register Src = MI.getOperand(1).getReg();
Matt Arsenault2f292202019-05-17 23:05:18 +00001431
1432 const LLT S64 = LLT::scalar(64);
1433 const LLT S32 = LLT::scalar(32);
1434
1435 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1436
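  // Convert the 64-bit integer in 32-bit halves: convert each half to f64,
  // scale the high half by 2^32 with ldexp, and add in the low half. Only
  // the high half carries the sign, so the low half always converts
  // unsigned.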
1437 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1438
1439 auto CvtHi = Signed ?
1440 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1441 B.buildUITOFP(S64, Unmerge.getReg(1));
1442
1443 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1444
1445 auto ThirtyTwo = B.buildConstant(S32, 32);
1446 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1447 .addUse(CvtHi.getReg(0))
1448 .addUse(ThirtyTwo.getReg(0));
1449
1450 // TODO: Should this propagate fast-math-flags?
1451 B.buildFAdd(Dst, LdExp, CvtLo);
1452 MI.eraseFromParent();
1453 return true;
1454}
Matt Arsenaulte15770a2019-07-01 18:40:23 +00001455
Matt Arsenault6ce1b4f2019-07-10 16:31:19 +00001456bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1457 MachineInstr &MI, MachineRegisterInfo &MRI,
1458 MachineIRBuilder &B) const {
1459 MachineFunction &MF = B.getMF();
1460 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1461
1462 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1463 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1464
1465  // With ieee_mode disabled, the instructions already have the correct
1466  // behavior for G_FMINNUM/G_FMAXNUM.
1467 if (!MFI->getMode().IEEE)
1468 return !IsIEEEOp;
1469
1470 if (IsIEEEOp)
1471 return true;
1472
1473 MachineIRBuilder HelperBuilder(MI);
1474 GISelObserverWrapper DummyObserver;
1475 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
Matt Arsenaulta91f0172019-09-09 23:30:11 +00001476 HelperBuilder.setInstr(MI);
Matt Arsenault6ce1b4f2019-07-10 16:31:19 +00001477 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1478}
1479
Matt Arsenaultb0e04c02019-07-15 19:40:59 +00001480bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1481 MachineInstr &MI, MachineRegisterInfo &MRI,
1482 MachineIRBuilder &B) const {
1483 // TODO: Should move some of this into LegalizerHelper.
1484
1485 // TODO: Promote dynamic indexing of s16 to s32
1486 // TODO: Dynamic s64 indexing is only legal for SGPR.
1487 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1488 if (!IdxVal) // Dynamic case will be selected to register indexing.
1489 return true;
1490
1491 Register Dst = MI.getOperand(0).getReg();
1492 Register Vec = MI.getOperand(1).getReg();
1493
1494 LLT VecTy = MRI.getType(Vec);
1495 LLT EltTy = VecTy.getElementType();
1496 assert(EltTy == MRI.getType(Dst));
1497
1498 B.setInstr(MI);
1499
1500 if (IdxVal.getValue() < VecTy.getNumElements())
1501 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1502 else
1503 B.buildUndef(Dst);
1504
1505 MI.eraseFromParent();
1506 return true;
1507}
1508
Matt Arsenault6ed315f2019-07-15 19:43:04 +00001509bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1510 MachineInstr &MI, MachineRegisterInfo &MRI,
1511 MachineIRBuilder &B) const {
1512 // TODO: Should move some of this into LegalizerHelper.
1513
1514 // TODO: Promote dynamic indexing of s16 to s32
1515 // TODO: Dynamic s64 indexing is only legal for SGPR.
1516 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1517 if (!IdxVal) // Dynamic case will be selected to register indexing.
1518 return true;
1519
1520 Register Dst = MI.getOperand(0).getReg();
1521 Register Vec = MI.getOperand(1).getReg();
1522 Register Ins = MI.getOperand(2).getReg();
1523
1524 LLT VecTy = MRI.getType(Vec);
1525 LLT EltTy = VecTy.getElementType();
1526 assert(EltTy == MRI.getType(Ins));
1527
1528 B.setInstr(MI);
1529
1530 if (IdxVal.getValue() < VecTy.getNumElements())
1531 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1532 else
1533 B.buildUndef(Dst);
1534
1535 MI.eraseFromParent();
1536 return true;
1537}
1538
Matt Arsenaultcbd17822019-08-29 20:06:48 +00001539bool AMDGPULegalizerInfo::legalizeSinCos(
1540 MachineInstr &MI, MachineRegisterInfo &MRI,
1541 MachineIRBuilder &B) const {
1542 B.setInstr(MI);
1543
1544 Register DstReg = MI.getOperand(0).getReg();
1545 Register SrcReg = MI.getOperand(1).getReg();
1546 LLT Ty = MRI.getType(DstReg);
1547 unsigned Flags = MI.getFlags();
1548
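  // The hardware sin/cos intrinsics take their input in units of full
  // rotations rather than radians, so scale the argument by 1/(2*pi).
  // Subtargets with a reduced trig input range additionally need the
  // argument wrapped into [0, 1) with the fract intrinsic first.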
1549 Register TrigVal;
1550 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1551 if (ST.hasTrigReducedRange()) {
1552 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1553 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1554 .addUse(MulVal.getReg(0))
1555 .setMIFlags(Flags).getReg(0);
1556 } else
1557 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1558
1559 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1560 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1561 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1562 .addUse(TrigVal)
1563 .setMIFlags(Flags);
1564 MI.eraseFromParent();
1565 return true;
1566}
1567
Matt Arsenault77ac4002019-10-01 01:06:43 +00001568bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1569 Register DstReg, LLT PtrTy,
1570 MachineIRBuilder &B, const GlobalValue *GV,
1571 unsigned Offset, unsigned GAFlags) const {
1572 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1573 // to the following code sequence:
1574 //
1575 // For constant address space:
1576 // s_getpc_b64 s[0:1]
1577 // s_add_u32 s0, s0, $symbol
1578 // s_addc_u32 s1, s1, 0
1579 //
1580 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1581 // a fixup or relocation is emitted to replace $symbol with a literal
1582 // constant, which is a pc-relative offset from the encoding of the $symbol
1583 // operand to the global variable.
1584 //
1585 // For global address space:
1586 // s_getpc_b64 s[0:1]
1587 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1588 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1589 //
1590 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591 // fixups or relocations are emitted to replace $symbol@*@lo and
1592 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1593 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1594 // operand to the global variable.
1595 //
1596 // What we want here is an offset from the value returned by s_getpc
1597 // (which is the address of the s_add_u32 instruction) to the global
1598 // variable, but since the encoding of $symbol starts 4 bytes after the start
1599 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1600 // small. This requires us to add 4 to the global variable offset in order to
1601 // compute the correct address.
1602
1603 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1604
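  // The PC-relative address is always formed as a 64-bit constant pointer;
  // for a 32-bit result the low half is extracted at the end.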
1605 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1606 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1607
1608 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1609 .addDef(PCReg);
1610
1611 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1612 if (GAFlags == SIInstrInfo::MO_NONE)
1613 MIB.addImm(0);
1614 else
1615 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1616
1617 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1618
1619 if (PtrTy.getSizeInBits() == 32)
1620 B.buildExtract(DstReg, PCReg, 0);
1621 return true;
1622}
1623
Matt Arsenault64ecca92019-09-09 17:13:44 +00001624bool AMDGPULegalizerInfo::legalizeGlobalValue(
1625 MachineInstr &MI, MachineRegisterInfo &MRI,
1626 MachineIRBuilder &B) const {
1627 Register DstReg = MI.getOperand(0).getReg();
1628 LLT Ty = MRI.getType(DstReg);
1629 unsigned AS = Ty.getAddressSpace();
1630
1631 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1632 MachineFunction &MF = B.getMF();
1633 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Matt Arsenault77ac4002019-10-01 01:06:43 +00001634 B.setInstr(MI);
Matt Arsenault64ecca92019-09-09 17:13:44 +00001635
1636 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
Matt Arsenault64ecca92019-09-09 17:13:44 +00001637 if (!MFI->isEntryFunction()) {
1638 const Function &Fn = MF.getFunction();
1639 DiagnosticInfoUnsupported BadLDSDecl(
1640 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1641 Fn.getContext().diagnose(BadLDSDecl);
1642 }
1643
1644 // TODO: We could emit code to handle the initialization somewhere.
1645 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1646 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1647 MI.eraseFromParent();
1648 return true;
1649 }
Matt Arsenault64ecca92019-09-09 17:13:44 +00001650
Matt Arsenault77ac4002019-10-01 01:06:43 +00001651 const Function &Fn = MF.getFunction();
1652 DiagnosticInfoUnsupported BadInit(
1653 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1654 Fn.getContext().diagnose(BadInit);
1655 return true;
1656 }
1657
1658 const SITargetLowering *TLI = ST.getTargetLowering();
1659
1660 if (TLI->shouldEmitFixup(GV)) {
1661 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1662 MI.eraseFromParent();
1663 return true;
1664 }
1665
1666 if (TLI->shouldEmitPCReloc(GV)) {
1667 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1668 MI.eraseFromParent();
1669 return true;
1670 }
1671
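  // Otherwise go through the GOT: form the pc-relative address of the GOT
  // entry and load the actual 64-bit address of the global from it.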
1672 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1673 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1674
1675 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1676 MachinePointerInfo::getGOT(MF),
1677 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1678 MachineMemOperand::MOInvariant,
1679 8 /*Size*/, 8 /*Align*/);
1680
1681 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1682
1683 if (Ty.getSizeInBits() == 32) {
1684    // Truncate if this is a 32-bit constant address.
1685 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1686 B.buildExtract(DstReg, Load, 0);
1687 } else
1688 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1689
1690 MI.eraseFromParent();
Matt Arsenault64ecca92019-09-09 17:13:44 +00001691 return true;
1692}
1693
Matt Arsenaultad6a8b832019-09-10 16:42:31 +00001694bool AMDGPULegalizerInfo::legalizeLoad(
1695 MachineInstr &MI, MachineRegisterInfo &MRI,
1696 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1697 B.setInstr(MI);
1698 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1699 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1700 Observer.changingInstr(MI);
1701 MI.getOperand(1).setReg(Cast.getReg(0));
1702 Observer.changedInstr(MI);
1703 return true;
1704}
1705
Matt Arsenault4d339182019-09-13 00:44:35 +00001706bool AMDGPULegalizerInfo::legalizeFMad(
1707 MachineInstr &MI, MachineRegisterInfo &MRI,
1708 MachineIRBuilder &B) const {
1709 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1710 assert(Ty.isScalar());
1711
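  // The hardware mad instructions flush denormals, so G_FMAD is only kept
  // legal as-is when the matching denormal mode is disabled; otherwise it
  // is lowered to fmul + fadd.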
1712 // TODO: Always legal with future ftz flag.
1713 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1714 return true;
1715 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1716 return true;
1717
1718 MachineFunction &MF = B.getMF();
1719
1720 MachineIRBuilder HelperBuilder(MI);
1721 GISelObserverWrapper DummyObserver;
1722 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1723 HelperBuilder.setMBB(*MI.getParent());
1724 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1725}
1726
Matt Arsenaulte15770a2019-07-01 18:40:23 +00001727// Return the use branch instruction, or null if the usage is invalid.
1728static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1729 MachineRegisterInfo &MRI) {
1730 Register CondDef = MI.getOperand(0).getReg();
1731 if (!MRI.hasOneNonDBGUse(CondDef))
1732 return nullptr;
1733
1734 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1735 return UseMI.getParent() == MI.getParent() &&
1736 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1737}
1738
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001739Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1740 Register Reg, LLT Ty) const {
1741 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1742 if (LiveIn)
1743 return LiveIn;
1744
1745 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1746 MRI.addLiveIn(Reg, NewReg);
1747 return NewReg;
1748}
1749
1750bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1751 const ArgDescriptor *Arg) const {
Matt Arsenault25156ae2019-09-05 02:20:29 +00001752 if (!Arg->isRegister() || !Arg->getRegister().isValid())
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001753 return false; // TODO: Handle these
1754
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001755 assert(Arg->getRegister().isPhysical());
1756
1757 MachineRegisterInfo &MRI = *B.getMRI();
1758
1759 LLT Ty = MRI.getType(DstReg);
1760 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1761
1762 if (Arg->isMasked()) {
1763 // TODO: Should we try to emit this once in the entry block?
1764 const LLT S32 = LLT::scalar(32);
1765 const unsigned Mask = Arg->getMask();
1766 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1767
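    // A masked argument is packed into a bit range of the physical register;
    // shift it down and mask off the surrounding bits to recover the value.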
Matt Arsenault8f6bdb72019-10-01 01:44:46 +00001768 Register AndMaskSrc = LiveIn;
1769
1770 if (Shift != 0) {
1771 auto ShiftAmt = B.buildConstant(S32, Shift);
1772 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1773 }
1774
1775 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001776 } else
1777 B.buildCopy(DstReg, LiveIn);
1778
1779  // Insert the argument copy if it doesn't already exist.
1780 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1781 if (!MRI.getVRegDef(LiveIn)) {
Matt Arsenault69b1a2a2019-09-05 02:20:32 +00001782 // FIXME: Should have scoped insert pt
1783 MachineBasicBlock &OrigInsBB = B.getMBB();
1784 auto OrigInsPt = B.getInsertPt();
1785
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001786 MachineBasicBlock &EntryMBB = B.getMF().front();
1787 EntryMBB.addLiveIn(Arg->getRegister());
1788 B.setInsertPt(EntryMBB, EntryMBB.begin());
1789 B.buildCopy(LiveIn, Arg->getRegister());
Matt Arsenault69b1a2a2019-09-05 02:20:32 +00001790
1791 B.setInsertPt(OrigInsBB, OrigInsPt);
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00001792 }
1793
1794 return true;
1795}
1796
1797bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1798 MachineInstr &MI,
1799 MachineRegisterInfo &MRI,
1800 MachineIRBuilder &B,
1801 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1802 B.setInstr(MI);
1803
1804 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1805
1806 const ArgDescriptor *Arg;
1807 const TargetRegisterClass *RC;
1808 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1809 if (!Arg) {
1810 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1811 return false;
1812 }
1813
1814 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1815 MI.eraseFromParent();
1816 return true;
1817 }
1818
1819 return false;
1820}
1821
Austin Kerbow97263fa2019-10-21 22:18:26 +00001822bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1823 MachineRegisterInfo &MRI,
1824 MachineIRBuilder &B) const {
1825 B.setInstr(MI);
Austin Kerbowc35b3582019-10-22 17:39:26 -07001826 Register Dst = MI.getOperand(0).getReg();
1827 LLT DstTy = MRI.getType(Dst);
1828 LLT S16 = LLT::scalar(16);
Austin Kerbow97263fa2019-10-21 22:18:26 +00001829
1830 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1831 return true;
1832
Austin Kerbowc35b3582019-10-22 17:39:26 -07001833 if (DstTy == S16)
1834 return legalizeFDIV16(MI, MRI, B);
1835
Austin Kerbow97263fa2019-10-21 22:18:26 +00001836 return false;
1837}
1838
1839bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1840 MachineRegisterInfo &MRI,
1841 MachineIRBuilder &B) const {
1842 Register Res = MI.getOperand(0).getReg();
1843 Register LHS = MI.getOperand(1).getReg();
1844 Register RHS = MI.getOperand(2).getReg();
1845
1846 uint16_t Flags = MI.getFlags();
1847
1848 LLT ResTy = MRI.getType(Res);
1849 LLT S32 = LLT::scalar(32);
1850 LLT S64 = LLT::scalar(64);
1851
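  // The rcp-based expansions below are not IEEE-accurate: f64 requires the
  // global unsafe-math flag, and f32 with denormals enabled needs an unsafe
  // flag either globally or on the instruction.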
1852 const MachineFunction &MF = B.getMF();
1853 bool Unsafe =
1854 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1855
1856 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1857 return false;
1858
1859 if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1860 return false;
1861
1862 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1863 // 1 / x -> RCP(x)
1864 if (CLHS->isExactlyValue(1.0)) {
1865 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1866 .addUse(RHS)
1867 .setMIFlags(Flags);
1868
1869 MI.eraseFromParent();
1870 return true;
1871 }
1872
1873 // -1 / x -> RCP( FNEG(x) )
1874 if (CLHS->isExactlyValue(-1.0)) {
1875 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1876 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1877 .addUse(FNeg.getReg(0))
1878 .setMIFlags(Flags);
1879
1880 MI.eraseFromParent();
1881 return true;
1882 }
1883 }
1884
1885 // x / y -> x * (1.0 / y)
1886 if (Unsafe) {
1887 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1888 .addUse(RHS)
1889 .setMIFlags(Flags);
1890 B.buildFMul(Res, LHS, RCP, Flags);
1891
1892 MI.eraseFromParent();
1893 return true;
1894 }
1895
1896 return false;
1897}
1898
Austin Kerbowc35b3582019-10-22 17:39:26 -07001899bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1900 MachineRegisterInfo &MRI,
1901 MachineIRBuilder &B) const {
1902 B.setInstr(MI);
1903 Register Res = MI.getOperand(0).getReg();
1904 Register LHS = MI.getOperand(1).getReg();
1905 Register RHS = MI.getOperand(2).getReg();
1906
1907 uint16_t Flags = MI.getFlags();
1908
1909 LLT S16 = LLT::scalar(16);
1910 LLT S32 = LLT::scalar(32);
1911
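  // Compute the f16 quotient in f32: extend both operands, approximate
  // 1/RHS with the rcp intrinsic, multiply, truncate back to f16, and use
  // div_fixup to patch up special-case inputs.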
1912 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1913 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1914
1915 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1916 .addUse(RHSExt.getReg(0))
1917 .setMIFlags(Flags);
1918
1919 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1920 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1921
1922 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1923 .addUse(RDst.getReg(0))
1924 .addUse(RHS)
1925 .addUse(LHS)
1926 .setMIFlags(Flags);
1927
1928 MI.eraseFromParent();
1929 return true;
1930}
1931
Austin Kerbow97263fa2019-10-21 22:18:26 +00001932bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
1933 MachineRegisterInfo &MRI,
1934 MachineIRBuilder &B) const {
Austin Kerbowc99f62e2019-07-30 18:49:16 +00001935 B.setInstr(MI);
1936 Register Res = MI.getOperand(0).getReg();
1937 Register LHS = MI.getOperand(2).getReg();
1938 Register RHS = MI.getOperand(3).getReg();
1939 uint16_t Flags = MI.getFlags();
1940
1941 LLT S32 = LLT::scalar(32);
1942 LLT S1 = LLT::scalar(1);
1943
1944 auto Abs = B.buildFAbs(S32, RHS, Flags);
1945 const APFloat C0Val(1.0f);
1946
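  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. When
  // |RHS| exceeds 2^96, pre-scale the denominator by 2^-32 so rcp does not
  // underflow, then apply the same scale to the quotient afterwards.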
1947 auto C0 = B.buildConstant(S32, 0x6f800000);
1948 auto C1 = B.buildConstant(S32, 0x2f800000);
1949 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1950
1951 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1952 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1953
1954 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1955
1956 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1957 .addUse(Mul0.getReg(0))
1958 .setMIFlags(Flags);
1959
1960 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1961
1962 B.buildFMul(Res, Sel, Mul1, Flags);
1963
1964 MI.eraseFromParent();
1965 return true;
1966}
1967
Matt Arsenault9e8e8c62019-07-01 18:49:01 +00001968bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1969 MachineRegisterInfo &MRI,
1970 MachineIRBuilder &B) const {
1971 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1972 if (!MFI->isEntryFunction()) {
1973 return legalizePreloadedArgIntrin(MI, MRI, B,
1974 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1975 }
1976
1977 B.setInstr(MI);
1978
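  // In a kernel the implicit arguments are laid out directly after the
  // explicit kernel arguments, so the pointer is simply the kernarg segment
  // pointer plus the implicit parameter offset.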
1979 uint64_t Offset =
1980 ST.getTargetLowering()->getImplicitParameterOffset(
1981 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1982 Register DstReg = MI.getOperand(0).getReg();
1983 LLT DstTy = MRI.getType(DstReg);
1984 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1985
1986 const ArgDescriptor *Arg;
1987 const TargetRegisterClass *RC;
1988 std::tie(Arg, RC)
1989 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1990 if (!Arg)
1991 return false;
1992
1993 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1994 if (!loadInputValue(KernargPtrReg, B, Arg))
1995 return false;
1996
1997 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1998 MI.eraseFromParent();
1999 return true;
2000}
2001
Matt Arsenaultf581d572019-09-05 02:20:39 +00002002bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2003 MachineRegisterInfo &MRI,
2004 MachineIRBuilder &B,
2005 unsigned AddrSpace) const {
2006 B.setInstr(MI);
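  // A flat pointer lies in the given segment iff the high 32 bits of its
  // 64-bit value equal that segment's aperture base.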
2007 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2008 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2009 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2010 MI.eraseFromParent();
2011 return true;
2012}
2013
Matt Arsenault3ecab8e2019-09-19 16:26:14 +00002014/// Handle register layout difference for f16 images for some subtargets.
2015Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2016 MachineRegisterInfo &MRI,
2017 Register Reg) const {
2018 if (!ST.hasUnpackedD16VMem())
2019 return Reg;
2020
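  // With unpacked D16 memory instructions each 16-bit element occupies a
  // full 32-bit register, so widen every element with anyext and rebuild
  // the value as an <N x s32> vector.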
2021 const LLT S16 = LLT::scalar(16);
2022 const LLT S32 = LLT::scalar(32);
2023 LLT StoreVT = MRI.getType(Reg);
2024 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2025
2026 auto Unmerge = B.buildUnmerge(S16, Reg);
2027
2028 SmallVector<Register, 4> WideRegs;
2029 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2030 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2031
2032 int NumElts = StoreVT.getNumElements();
2033
2034 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2035}
2036
2037bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2038 MachineRegisterInfo &MRI,
2039 MachineIRBuilder &B,
2040 bool IsFormat) const {
2041 // TODO: Reject f16 format on targets where unsupported.
2042 Register VData = MI.getOperand(1).getReg();
2043 LLT Ty = MRI.getType(VData);
2044
2045 B.setInstr(MI);
2046
2047 const LLT S32 = LLT::scalar(32);
2048 const LLT S16 = LLT::scalar(16);
2049
2050 // Fixup illegal register types for i8 stores.
2051 if (Ty == LLT::scalar(8) || Ty == S16) {
2052 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2053 MI.getOperand(1).setReg(AnyExt);
2054 return true;
2055 }
2056
2057 if (Ty.isVector()) {
2058 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2059 if (IsFormat)
2060 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2061 return true;
2062 }
2063
2064 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2065 }
2066
2067 return Ty == S32;
2068}
2069
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002070bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2071 MachineRegisterInfo &MRI,
2072 MachineIRBuilder &B) const {
2073  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
Matt Arsenault86f864da2019-10-02 01:02:27 +00002074 switch (MI.getIntrinsicID()) {
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002075 case Intrinsic::amdgcn_if: {
2076 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2077 const SIRegisterInfo *TRI
2078 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2079
2080 B.setInstr(*BrCond);
2081 Register Def = MI.getOperand(1).getReg();
2082 Register Use = MI.getOperand(3).getReg();
2083 B.buildInstr(AMDGPU::SI_IF)
2084 .addDef(Def)
2085 .addUse(Use)
2086 .addMBB(BrCond->getOperand(1).getMBB());
2087
2088 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2089 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2090 MI.eraseFromParent();
2091 BrCond->eraseFromParent();
2092 return true;
2093 }
2094
2095 return false;
2096 }
2097 case Intrinsic::amdgcn_loop: {
2098 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2099 const SIRegisterInfo *TRI
2100 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2101
2102 B.setInstr(*BrCond);
2103 Register Reg = MI.getOperand(2).getReg();
2104 B.buildInstr(AMDGPU::SI_LOOP)
2105 .addUse(Reg)
2106 .addMBB(BrCond->getOperand(1).getMBB());
2107 MI.eraseFromParent();
2108 BrCond->eraseFromParent();
2109 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2110 return true;
2111 }
2112
2113 return false;
2114 }
Matt Arsenault9e8e8c62019-07-01 18:49:01 +00002115 case Intrinsic::amdgcn_kernarg_segment_ptr:
2116 return legalizePreloadedArgIntrin(
2117 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2118 case Intrinsic::amdgcn_implicitarg_ptr:
2119 return legalizeImplicitArgPtr(MI, MRI, B);
Matt Arsenaulte2c86cc2019-07-01 18:45:36 +00002120 case Intrinsic::amdgcn_workitem_id_x:
2121 return legalizePreloadedArgIntrin(MI, MRI, B,
2122 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2123 case Intrinsic::amdgcn_workitem_id_y:
2124 return legalizePreloadedArgIntrin(MI, MRI, B,
2125 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2126 case Intrinsic::amdgcn_workitem_id_z:
2127 return legalizePreloadedArgIntrin(MI, MRI, B,
2128 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
Matt Arsenault756d8192019-07-01 18:47:22 +00002129 case Intrinsic::amdgcn_workgroup_id_x:
2130 return legalizePreloadedArgIntrin(MI, MRI, B,
2131 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2132 case Intrinsic::amdgcn_workgroup_id_y:
2133 return legalizePreloadedArgIntrin(MI, MRI, B,
2134 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2135 case Intrinsic::amdgcn_workgroup_id_z:
2136 return legalizePreloadedArgIntrin(MI, MRI, B,
2137 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
Matt Arsenaultbae36362019-07-01 18:50:50 +00002138 case Intrinsic::amdgcn_dispatch_ptr:
2139 return legalizePreloadedArgIntrin(MI, MRI, B,
2140 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2141 case Intrinsic::amdgcn_queue_ptr:
2142 return legalizePreloadedArgIntrin(MI, MRI, B,
2143 AMDGPUFunctionArgInfo::QUEUE_PTR);
2144 case Intrinsic::amdgcn_implicit_buffer_ptr:
2145 return legalizePreloadedArgIntrin(
2146 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2147 case Intrinsic::amdgcn_dispatch_id:
2148 return legalizePreloadedArgIntrin(MI, MRI, B,
2149 AMDGPUFunctionArgInfo::DISPATCH_ID);
Austin Kerbowc99f62e2019-07-30 18:49:16 +00002150 case Intrinsic::amdgcn_fdiv_fast:
Austin Kerbow97263fa2019-10-21 22:18:26 +00002151 return legalizeFDIVFastIntrin(MI, MRI, B);
Matt Arsenaultf581d572019-09-05 02:20:39 +00002152 case Intrinsic::amdgcn_is_shared:
2153 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2154 case Intrinsic::amdgcn_is_private:
2155 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
Matt Arsenault8e3bc9b2019-09-09 15:20:49 +00002156 case Intrinsic::amdgcn_wavefrontsize: {
2157 B.setInstr(MI);
2158 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2159 MI.eraseFromParent();
2160 return true;
2161 }
Matt Arsenault3ecab8e2019-09-19 16:26:14 +00002162 case Intrinsic::amdgcn_raw_buffer_store:
2163 return legalizeRawBufferStore(MI, MRI, B, false);
2164 case Intrinsic::amdgcn_raw_buffer_store_format:
2165 return legalizeRawBufferStore(MI, MRI, B, true);
Matt Arsenaulte15770a2019-07-01 18:40:23 +00002166 default:
2167 return true;
2168 }
2169
2170 return true;
2171}