//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, void ***args, kmp_int32 nArgs, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
  /// ***args, int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
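
// Typical usage, mirroring the SeqGen lambda in emitGenericParallelCall below:
// bracket a region code-gen callback with matching enter/exit runtime calls.
//   NVPTXActionTy Action(EnterRTLFn, {RTLoc, ThreadID},
//                        ExitRTLFn, {RTLoc, ThreadID});
//   RCG.setAction(Action);
//   RCG(CGF); // Emits the enter call, the region body, then the exit call.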

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
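
// Worked example for the two helpers above: thread id 37 has warp id
// 37 >> 5 == 1 and lane id 37 & 31 == 5.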

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}
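
// E.g. for a 128-thread CTA in generic mode, thread_limit is 128 - 32 == 96:
// threads 0..95 act as workers and the last warp (threads 96..127) is
// reserved for the master.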

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
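
// In closed form this computes (NumThreads - 1) & ~(WarpSize - 1), i.e. it
// rounds NumThreads - 1 down to the nearest multiple of the warp size.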

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode
             ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
             : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
}
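
// OpenMPCUDAMode corresponds to the -fopenmp-cuda-mode flag: when set, every
// target region is compiled as an SPMD kernel instead of the default
// master/worker generic scheme.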

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitOutlinedFunctionCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
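
// Taken together, the entry header and footer give a generic-mode kernel the
// following shape (sketch):
//   if (tid < thread_limit) { worker(); goto exit; }  // .worker
//   if (tid != master_tid) goto exit;                 // .mastercheck
//   __kmpc_kernel_init(...);                          // .master
//   <target region>
//   __kmpc_kernel_deinit(...);                        // .termination.notifier
//   barrier();  // release workers so they observe termination
// exit: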

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
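
// The SPMD-mode kernel is flat by comparison (sketch): every thread executes
//   __kmpc_spmd_kernel_init(...); <target region> __kmpc_spmd_kernel_deinit();
// with no worker loop and no warp reserved for the master.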

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to setup the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}
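
// For a kernel entry named "foo" this emits IR of the form (illustrative
// name):
//   @foo_exec_mode = weak constant i8 <Mode>
// which the offload library inspects before launching the kernel.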

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // Set up shared arguments.
  Address SharedArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Current context.
  ASTContext &Ctx = CGF.getContext();

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes exactly three arguments:
    //   - the parallelism level;
    //   - the master thread ID;
    //   - the list of references to shared arguments.
    //
    // TODO: Assert that the function is a wrapper function.
    Address Capture = CGF.EmitLoadOfPointer(
        SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
                        .castAs<PointerType>());
    emitOutlinedFunctionCall(CGF, WST.Loc, W,
                             {Bld.getInt16(/*ParallelLevel=*/0),
                              getMasterThreadID(CGF), Capture.getPointer()});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
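
// In pseudo-code, each worker therefore executes (sketch):
//   for (;;) {
//     barrier();                                            // .await.work
//     active = __kmpc_kernel_parallel(&work_fn, &shared_args, 1);
//     if (!work_fn) break;                                  // termination
//     if (active) {
//       <match work_fn against Work>;
//       wrapper(/*level=*/0, master_tid, shared_args);
//       __kmpc_kernel_end_parallel();
//     }
//     barrier();                                            // .barrier.parallel
//   }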

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    // Build void __kmpc_kernel_prepare_parallel(
    // void *outlined_function, void ***args, kmp_int32 nArgs, int16_t
    // IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty,
                                CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    // Build bool __kmpc_kernel_parallel(void **outlined_function, void
    // ***args, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    // Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}
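
// The resulting IR marks the entry point as a kernel for the NVPTX backend,
// e.g. (illustrative function name):
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @target_entry_fn, !"kernel", i32 1}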

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  if (!isInSpmdExecutionMode()) {
    llvm::Function *WrapperFun = createDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}
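
// The two ZeroAddr arguments above stand in for the global and bound thread
// id pointer parameters that every outlined function expects before the
// captured variables.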

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  llvm::Function *WFn = WrapperFunctionsMap[Fn];
  assert(WFn && "Wrapper function does not exist!");

  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
                                                    PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    if (!CapturedVars.empty()) {
      // There's something to share, add the attribute.
      CGF.CurFn->addFnAttr("has-nvptx-shared-depot");
      // Prepare for parallel region. Indicate the outlined function.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_args");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {ID, SharedArgsPtr,
                             Bld.getInt32(CapturedVars.size()),
                             /*RequiresOMPRuntime=*/Bld.getInt16(1)};

      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(
              OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);

      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            CGF.EmitLoadOfPointer(
                SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
                                .castAs<PointerType>()),
            Idx, CGF.getPointerSize());
        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        Idx++;
      }
    } else {
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {
          ID, llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
          /*nArgs=*/Bld.getInt32(0), /*RequiresOMPRuntime=*/Bld.getInt16(1)};
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(
              OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
                                                    PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}
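
// Master-side protocol of L0ParallelGen, matching the worker loop above
// (sketch):
//   __kmpc_kernel_prepare_parallel(wrapper_id, &shared_args, nArgs, 1);
//   <store each captured variable pointer into shared_args>
//   barrier();  // release the workers
//   barrier();  // wait until the workers are done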

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

  Address ZeroAddr = CGF.CreateMemTemp(
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
      ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

/// Cast value to the specified type.
static llvm::Value *
castValueToType(CodeGenFunction &CGF, llvm::Value *Val, llvm::Type *CastTy,
                llvm::Optional<bool> IsSigned = llvm::None) {
  if (Val->getType() == CastTy)
    return Val;
  if (Val->getType()->getPrimitiveSizeInBits() > 0 &&
      CastTy->getPrimitiveSizeInBits() > 0 &&
      Val->getType()->getPrimitiveSizeInBits() ==
          CastTy->getPrimitiveSizeInBits())
    return CGF.Builder.CreateBitCast(Val, CastTy);
  if (IsSigned.hasValue() && CastTy->isIntegerTy() &&
      Val->getType()->isIntegerTy())
    return CGF.Builder.CreateIntCast(Val, CastTy, *IsSigned);
  Address CastItem = CGF.CreateTempAlloca(
      CastTy,
      CharUnits::fromQuantity(
          CGF.CGM.getDataLayout().getPrefTypeAlignment(Val->getType())));
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
  CGF.Builder.CreateStore(Val, ValCastItem);
  return CGF.Builder.CreateLoad(CastItem);
}
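
// The three strategies above, in order: a float is bitcast to i32 (same
// width), an i16 is sign-/zero-extended to i32 (both integer types), and
// anything else is stored to a temporary and reloaded at the target type.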

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getDataLayout().getTypeStoreSize(Elem->getType());
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  llvm::Type *CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  llvm::Value *ElemCast = castValueToType(CGF, Elem, CastTy, /*isSigned=*/true);
  auto *WarpSize =
      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, Elem->getType(), /*isSigned=*/true);
}
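
// For example, a double travels as: bitcast to i64, __kmpc_shuffle_int64 from
// the lane at the given offset, then bitcast back to double.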

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element by element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001201 SrcElementAddr = CGF.EmitLoadOfPointer(
1202 SrcElementPtrAddr,
1203 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001204
1205 // Step 1.2: Get the address for dest element:
1206 // address = base + index * ElementSizeInChars.
1207 unsigned ElementSizeInChars =
1208 C.getTypeSizeInChars(Private->getType()).getQuantity();
1209 auto *CurrentOffset =
1210 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1211 ScratchpadIndex);
1212 auto *ScratchPadElemAbsolutePtrVal =
1213 Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
1214 ScratchPadElemAbsolutePtrVal =
1215 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00001216 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1217 C.getTypeAlignInChars(Private->getType()));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001218 IncrScratchpadDest = true;
1219 break;
1220 }
1221 case ScratchpadToThread: {
1222 // Step 1.1: Get the address for the src element in the scratchpad.
1223 // address = base + index * ElementSizeInChars.
1224 unsigned ElementSizeInChars =
1225 C.getTypeSizeInChars(Private->getType()).getQuantity();
1226 auto *CurrentOffset =
1227 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1228 ScratchpadIndex);
1229 auto *ScratchPadElemAbsolutePtrVal =
1230 Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
1231 ScratchPadElemAbsolutePtrVal =
1232 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1233 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1234 C.getTypeAlignInChars(Private->getType()));
1235 IncrScratchpadSrc = true;
1236
1237 // Step 1.2: Create a temporary to store the element in the destination
1238 // Reduce list.
1239 DestElementPtrAddr =
1240 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1241 DestElementAddr =
1242 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1243 UpdateDestListPtr = true;
1244 break;
1245 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001246 }
1247
1248 // Regardless of src and dest of copy, we emit the load of src
1249 // element as this is required in all directions
1250 SrcElementAddr = Bld.CreateElementBitCast(
1251 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1252 llvm::Value *Elem =
1253 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001254 Private->getType(), Private->getExprLoc());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001255
1256 // Now that all active lanes have read the element in the
1257 // Reduce list, shuffle over the value from the remote lane.
Alexey Bataevb2575932018-01-04 20:18:55 +00001258 if (ShuffleInElement)
1259 Elem = createRuntimeShuffleFunction(CGF, Elem, RemoteLaneOffset);
1260
1261 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1262 SrcElementAddr.getElementType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001263
1264 // Store the source element value to the dest element address.
1265 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
1266 Private->getType());
1267
1268 // Step 3.1: Modify reference in dest Reduce list as needed.
1269 // Modifying the reference in Reduce list to point to the newly
1270 // created element. The element is live in the current function
1271 // scope and that of functions it invokes (i.e., reduce_function).
1272 // RemoteReduceData[i] = (void*)&RemoteElem
1273 if (UpdateDestListPtr) {
1274 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1275 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1276 DestElementPtrAddr, /*Volatile=*/false,
1277 C.VoidPtrTy);
1278 }
1279
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001280 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1281 // address of the next element in scratchpad memory, unless we're currently
1282 // processing the last one. Memory alignment is also taken care of here.
1283 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1284 llvm::Value *ScratchpadBasePtr =
1285 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1286 unsigned ElementSizeInChars =
1287 C.getTypeSizeInChars(Private->getType()).getQuantity();
1288 ScratchpadBasePtr = Bld.CreateAdd(
1289 ScratchpadBasePtr,
1290 Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
1291 CGM.SizeTy, ElementSizeInChars)));
1292
1293 // Take care of global memory alignment for performance
1294 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1295 llvm::ConstantInt::get(CGM.SizeTy, 1));
1296 ScratchpadBasePtr = Bld.CreateSDiv(
1297 ScratchpadBasePtr,
1298 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1299 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1300 llvm::ConstantInt::get(CGM.SizeTy, 1));
1301 ScratchpadBasePtr = Bld.CreateMul(
1302 ScratchpadBasePtr,
1303 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
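      // For example, with a GlobalMemoryAlignment of 256 (the value is
      // illustrative only), the sub/div/add/mul sequence above rounds a
      // base pointer of 1000 up to 1024: ((1000 - 1) / 256 + 1) * 256.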

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    Idx++;
  }
}

/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
///  load_and_reduce(local, scratchpad, index, width, should_reduce)
///   reduce_data remote;
///   for elem in remote:
///     remote.elem = Scratchpad[elem_id][index]
///   if (should_reduce)
///     local = local @ remote
///   else
///     local = remote
static llvm::Value *emitReduceScratchpadFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // If should_reduce == 1, the scratchpad element is loaded AND reduced;
  // if should_reduce == 0 (or any other value), it is only loaded (i.e.,
  // copied). The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                    Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}
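
// A plausible way the runtime uses the helper above (a sketch, not the
// actual runtime source): worker thread k of the last team accumulates the
// scratchpad rows it owns into its local Reduce list by calling
//
//   _omp_reduction_load_and_reduce(local, scratchpad, /*index=*/i,
//                                  /*width=*/num_teams,
//                                  /*should_reduce=*/1)
//
// for each row i = k, k + num_workers, ... (see the inter-team reduction
// notes before emitReduction below).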

/// This function emits a helper that stores reduced data from the team
/// master to a scratchpad array in global memory.
///
///  for elem in Reduce List:
///    scratchpad[elem_id][index] = elem
///
static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                         ArrayRef<const Expr *> Privates,
                                         QualType ReductionArrayTy,
                                         SourceLocation Loc) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Source of the copy.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // A destination index into the scratchpad array, typically the team
  // identifier.
  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address SrcDataAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
      CGF.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, DestDataAddr,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  CGF.FinishFunction();
  return Fn;
}
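
// The scratchpad written by the helper above is laid out element-major:
// all teams' copies of reduce element 0 come first, padded up to
// GlobalMemoryAlignment, followed by all teams' copies of element 1, and
// so on. Team i writes its copy of an element at byte offset
// i * ElementSizeInChars within that element's row, matching the address
// arithmetic in emitReductionListCopy.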

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (auto &Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    auto IsWarpMaster =
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // elem = *elemptr
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *MediumPtr = elem
    Bld.CreateStore(Elem, MediumPtr);

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    Idx++;
  }

  CGF.FinishFunction();
  return Fn;
}
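
// Example with two active warps: lane 0 of warp 0 and lane 0 of warp 1
// store their partially reduced element into TransferMedium[0] and
// TransferMedium[1] respectively; after the named barrier, threads 0 and 1
// of warp 0 load TransferMedium[0] and TransferMedium[1] back into their
// own Reduce lists. This turns the reduction across warp masters into a
// single-warp reduction.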

/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a super set of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Value *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  auto &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The action to be performed on the Remote Reduce list depends on the
  // algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  // The thread local variable Reduce list is mutated in place to host the
  // reduced data, which is the aggregated value produced from local and
  // remote lanes.
  //
  // Note that AlgoVer is expected to be a constant integer known at compile
  // time.
  // When AlgoVer==0, the first conjunction evaluates to true, making
  // the entire predicate true at compile time.
  // When AlgoVer==1, only the second part of the second conjunction needs
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
  // When AlgoVer==2, only the second part of the third conjunction needs
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));

  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Algo2,
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
                       Bld.getInt16(0)));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}
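
// Worked example of the contiguous algorithm (AlgoVer == 1) with five
// active lanes: the runtime first passes Offset == 2, so lanes 0 and 1
// aggregate the values shuffled down from lanes 2 and 3, while the copy
// step makes lane 2 inherit lane 4's Reduce list, keeping the live data
// contiguous. Two further rounds with Offset == 1 leave the fully reduced
// value in lane 0. (Derived from the pseudocode documented below.)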

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They aid the runtime in performing
///     algorithmic steps in a data-agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
///     foo += reduceData.foo; bar *= reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
/// represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset>0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp master" (zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
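/// For the example program above, the parallel-reduction codegen below
/// emits, in rough C-like pseudocode (a sketch of the generated IR, with
/// names abbreviated):
///
///   void *RedList[2] = {&foo_private, &bar_private};
///   ret = __kmpc_nvptx_parallel_reduce_nowait(
///       gtid, /*num_vars=*/2, sizeof(RedList), RedList,
///       _omp_reduction_shuffle_and_reduce_func,
///       _omp_reduction_inter_warp_copy_func);
///   switch (ret) {
///   case 1:
///     foo += foo_private; bar *= bar_private;
///     __kmpc_end_reduce_nowait(gtid);
///     break;
///   default:
///     break;
///   }
///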
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002180void CGOpenMPRuntimeNVPTX::emitReduction(
2181 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2182 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2183 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2184 if (!CGF.HaveInsertPoint())
2185 return;
2186
2187 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002188 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2189 // FIXME: Add support for simd reduction.
2190 assert((TeamsReduction || ParallelReduction) &&
2191 "Invalid reduction selection in emitReduction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002192
2193 auto &C = CGM.getContext();
2194
2195 // 1. Build a list of reduction variables.
2196 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2197 auto Size = RHSExprs.size();
2198 for (auto *E : Privates) {
2199 if (E->getType()->isVariablyModifiedType())
2200 // Reserve place for array size.
2201 ++Size;
2202 }
2203 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2204 QualType ReductionArrayTy =
2205 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
2206 /*IndexTypeQuals=*/0);
2207 Address ReductionList =
2208 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2209 auto IPriv = Privates.begin();
2210 unsigned Idx = 0;
2211 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2212 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2213 CGF.getPointerSize());
2214 CGF.Builder.CreateStore(
2215 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2216 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
2217 Elem);
2218 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2219 // Store array size.
2220 ++Idx;
2221 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2222 CGF.getPointerSize());
2223 llvm::Value *Size = CGF.Builder.CreateIntCast(
2224 CGF.getVLASize(
2225 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
Sander de Smalen891af03a2018-02-03 13:55:59 +00002226 .NumElts,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002227 CGF.SizeTy, /*isSigned=*/false);
2228 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2229 Elem);
2230 }
2231 }
2232
2233 // 2. Emit reduce_func().
2234 auto *ReductionFn = emitReductionFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002235 CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
2236 Privates, LHSExprs, RHSExprs, ReductionOps);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002237
2238 // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2239 // RedList, shuffle_reduce_func, interwarp_copy_func);
2240 auto *ThreadId = getThreadID(CGF, Loc);
2241 auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2242 auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2243 ReductionList.getPointer(), CGF.VoidPtrTy);
2244
2245 auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002246 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002247 auto *InterWarpCopyFn =
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002248 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002249
2250 llvm::Value *Res = nullptr;
2251 if (ParallelReduction) {
2252 llvm::Value *Args[] = {ThreadId,
2253 CGF.Builder.getInt32(RHSExprs.size()),
2254 ReductionArrayTySize,
2255 RL,
2256 ShuffleAndReduceFn,
2257 InterWarpCopyFn};
2258
2259 Res = CGF.EmitRuntimeCall(
2260 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
2261 Args);
2262 }
2263
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002264 if (TeamsReduction) {
2265 auto *ScratchPadCopyFn =
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002266 emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002267 auto *LoadAndReduceFn = emitReduceScratchpadFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002268 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002269
2270 llvm::Value *Args[] = {ThreadId,
2271 CGF.Builder.getInt32(RHSExprs.size()),
2272 ReductionArrayTySize,
2273 RL,
2274 ShuffleAndReduceFn,
2275 InterWarpCopyFn,
2276 ScratchPadCopyFn,
2277 LoadAndReduceFn};
2278 Res = CGF.EmitRuntimeCall(
2279 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
2280 Args);
2281 }

  // 4. Build switch(res).
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
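
// For orientation, an illustrative source-level construct (names are made
// up) that funnels into the reduction emission above:
//   #pragma omp target teams distribute parallel for reduction(+ : sum)
//   for (int i = 0; i < n; ++i)
//     sum += a[i];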

const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam)) {
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  }
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getLocStart(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
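
// Illustrative effect (a sketch, types are made up): a map-captured
// reference parameter such as 'double &d' is translated into a
// restrict-qualified pointer whose pointee carries the global address
// space, so device code reads it through the right NVPTX address space.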

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
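
// The double cast above goes through the generic address space, e.g.
// 'T *' (target) -> 'T *' in AS 0 -> 'T addrspace(N) *'; on NVPTX such
// conversions are normally routed through the generic space rather than
// cast directly between two specific address spaces.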

void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
                       /*AddrSpace=*/0));
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
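
// E.g. (illustrative) an 'i32 addrspace(3)*' argument headed for an 'i32*'
// parameter is first flattened to the generic 'i32*' and only then bitcast
// to the exact parameter type the outlined function expects.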

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments that are passed to it.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
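/// Roughly, for an outlined region 'foo' with two captures, the wrapper
/// looks like this (a sketch; names and capture types are illustrative):
/// \code
/// void foo_wrapper(uint16_t parallel_level, uint32_t tid, void **args) {
///   foo(/*global_tid=*/nullptr, /*bound_tid=*/nullptr, args[0], args[1]);
/// }
/// \endcode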
llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
  QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  ImplicitParamDecl SharedArgsList(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                   /*Id=*/nullptr, VoidPtrPtrQTy,
                                   ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);
  WrapperArgs.emplace_back(&SharedArgsList);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getLocStart(), D.getLocStart());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // TODO: support SIMD and pass actual values.
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));
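  // The two nulls above stand in for the global and bound thread-id
  // pointers that an outlined parallel function takes as its first two
  // parameters.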

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Load the start of the shared-arguments array.
  auto SharedArgs =
      CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList),
                            VoidPtrPtrQTy->castAs<PointerType>());

  // For each captured variable:
  for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
    // Name of the captured variable.
    StringRef Name;
    if (CI->capturesThis())
      Name = "this";
    else
      Name = CI->getCapturedVar()->getName();

    // Retrieve the Clang type of the argument; creating an alloca from it
    // gives us the corresponding LLVM type.
    QualType ElemTy = CurField->getType();
    // If this is a capture by copy, the element type has to be the pointer
    // to the data.
    if (CI->capturesVariableByCopy())
      ElemTy = Ctx.getPointerType(ElemTy);

    // Get the shared address of the captured variable.
    Address ArgAddress = Bld.CreateConstInBoundsGEP(
        SharedArgs, I, CGF.getPointerSize());
    Address TypedArgAddress = Bld.CreateBitCast(
        ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy)));
    llvm::Value *Arg = CGF.EmitLoadOfScalar(
        TypedArgAddress, /*Volatile=*/false, Int32PtrQTy, SourceLocation());
    Args.emplace_back(Arg);
  }

  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}