//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
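
// Illustrative example of the mapping above: with the default values, VGPR7
// occupies scoreboard slot 7, the EXTRA_VGPR_LDS token occupies slot 256
// (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS), and SGPR5 occupies slot
// NUM_ALL_VGPRS + 5 = 262 in the flattened index space.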

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
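
// Illustrative usage: merging a required vmcnt(1) into a Wait that already
// demands vmcnt(3) yields vmcnt(1), since the smaller count is the stricter
// wait (fewer outstanding operations are allowed to remain).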

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen within the brackets,
// the wait count may get decremented out of order; therefore we need to put
// in an "s_waitcnt 0" before use.
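//
// Illustrative example: after two buffer loads, the VM_CNT bracket is
// (LB=0, UB=2); a use of a register scored 1 then needs at most
// "s_waitcnt vmcnt(1)", because UB - score = 1 operation may still be
// outstanding, and completing that wait raises the lower bound accordingly.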
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW (partial-write) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
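
// Illustrative example: for a 64-bit VGPR pair starting at v4, Size is 64,
// so the returned half-open interval is {4, 6}; callers iterating
// [first, second) cover the two 32-bit slots v4 and v5.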

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; // TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                           unsigned &Count) const {
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
    return true;

  Count = ~0u;
  return false;
}
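
// Illustrative example: with a VM_CNT bracket of (LB=2, UB=5), a requested
// vmcnt(1) is kept (5 - 1 = 4 > 2: it waits for something the bracket has not
// already guaranteed), while vmcnt(3) is redundant (5 - 3 = 2 <= LB) and is
// reset to ~0u, meaning "no wait needed" for that counter.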

void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a
    // conservative value of 0 for the counter.
    addWait(Wait, T, 0);
    return;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}

void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);

  if (Wait.ExpCnt == 0)
    setMixedExpTypes(false);
}

void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const int32_t UB = getScoreUB(T);
  if (Count >= (unsigned)UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
  } else {
    setScoreLB(T, UB);
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}
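
// Illustrative example: if the current LGKM_CNT bracket contains both a
// pending LDS access and a pending GDS access (two event types), lgkmcnt
// decrements may not match program order, so callers fall back to a
// conservative "s_waitcnt lgkmcnt(0)".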

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
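
// Illustrative example: readsVCCZ matches a branch such as
// "s_cbranch_vccz BB0_2", which reads the VCCZ status bit. On older targets
// the code below forces lgkmcnt(0) before such branches while an SMEM access
// is pending, since VCCZ may be stale (see the SEA_ISLANDS work-around in
// generateWaitcntInstBefore).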

/// Generate s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_UB respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  AMDGPU::Waitcnt Wait;

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgkm_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  if (OldWait.dominates(Wait))
    return;

  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}
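
// Note: the waitcnt is placed ahead of a trailing branch so that the wait
// executes regardless of which way the branch goes; for a block that ends in
// a fall-through, appending at the end is equivalent.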

// This is a flat memory operation. Check to see if it could access LDS:
// either it has an LDS or generic (flat) memory operand, or it has no
// memory operands at all, in which case we conservatively assume it may.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
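
// Illustrative consequence: a flat store whose only memoperand is known to be
// in the global address space returns false here, so it does not set the
// pending-flat state that would otherwise force conservative VM/LGKM flushes
// (see the setPendingFlat() call below).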

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
1293
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate it, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // Merge the pending-event spans of every visited predecessor. Predecessors
  // that are unvisited, or that wait at their beginning, are skipped.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }
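
  // Informally, a counter's "pending span" is getScoreUB(T) - getScoreLB(T):
  // the number of events issued but not yet waited on. For example, a
  // predecessor with LB = 2 and UB = 5 still has three outstanding events,
  // so the merged bracket must be at least that wide.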

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

#if 0
  // LC does not (unlike this pass) add a waitcnt at the beginning of a block.
  // Left here as a marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
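        // The merged bracket for T spans [0, MaxPending[T]], so rebase each
        // predecessor score S to S + (MaxPending[T] - PredUB); a score at the
        // predecessor's UB then lands exactly on the merged UB.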
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }

  // If a single-block loop, update the stored score brackets. Not needed for
  // other blocks, as we did this in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}

/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}

/// Count the number of "bottom" basic blocks of a loop.
unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
  unsigned Count = 0;
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB->isSuccessor(Loop->getHeader())) {
      Count++;
    }
  }
  return Count;
}
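
// Example (informal): in a loop over blocks {H, A, B} where both A and B
// branch back to the header H, A and B are each a "bottom" block, so
// countNumBottomBlocks() returns 2 and the natural convergence threshold
// used below becomes 2 + 1 = 3 iterations.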

// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Handle previously existing waitcnts: waitcnts generated by this pass
    // on an earlier iteration are removed and regenerated, while pre-existing
    // waitcnts are preserved and folded into the score brackets.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (OldWaitcntInstr) {
        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
          TrackedWaitcntSet.erase(OldWaitcntInstr);
          OldWaitcntInstr->eraseFromParent();
          OldWaitcntInstr = nullptr;
        } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnt's, both of which are pre-existing and
          // are therefore preserved.
          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
          ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        } else {
          ++Iter;
          Inst.eraseFromParent();
          continue;
        }
      }

      OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }
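
    // A hedged note on the hazard: on SI/CI an outstanding scalar memory
    // load may still write VCC, so the vccz bit can be stale at the point of
    // this read; the VCCZBugWorkAround flag set above triggers the repair
    // sequence emitted further below.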

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at the loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each loop
    // should take at most (n+1) iterations to converge naturally, where n is
    // the number of bottom blocks. If this threshold is reached and the
    // result hasn't converged, then we force convergence by inserting an
    // s_waitcnt at the end of the loop footer.
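    // Forcing convergence caps the brackets: once the footer waits on 0,
    // later iterations cannot observe a larger pending span, so the merge
    // reaches a fixed point (an informal argument, not a proof).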
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }
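      // A single pending counter is enough to require the forced wait, hence
      // the early break above; the s_waitcnt 0 emitted below waits on every
      // counter anyway.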

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
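
  // The scoreboard indexes VGPRs at [0, NumVGPRsMax) and SGPRs at an offset
  // of NUM_ALL_VGPRS; the encoding bounds above map physical registers into
  // that flat index space.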

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}