// Copyright 2015 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/backend/instruction-scheduler.h"

#include "src/base/iterator.h"
#include "src/base/optional.h"
#include "src/base/utils/random-number-generator.h"

namespace v8 {
namespace internal {
namespace compiler {

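// Overview: this file implements the architecture-independent part of the
// instruction scheduler, a per-basic-block list scheduler. AddInstruction()
// builds a dependency graph over the instructions of a block, and Schedule()
// re-emits them in an order chosen by one of the scheduling queues below.
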
void InstructionScheduler::SchedulingQueueBase::AddNode(
    ScheduleGraphNode* node) {
  // We keep the ready list sorted by total latency so that we can quickly find
  // the next best candidate to schedule.
  auto it = nodes_.begin();
  while ((it != nodes_.end()) &&
         ((*it)->total_latency() >= node->total_latency())) {
    ++it;
  }
  nodes_.insert(it, node);
}

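// Since AddNode() keeps the ready list sorted by decreasing total latency,
// the first node whose operands are ready is also the one with the longest
// remaining path to the end of the block, i.e. the critical-path-first
// choice.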
InstructionScheduler::ScheduleGraphNode*
InstructionScheduler::CriticalPathFirstQueue::PopBestCandidate(int cycle) {
  DCHECK(!IsEmpty());
  auto candidate = nodes_.end();
  for (auto iterator = nodes_.begin(); iterator != nodes_.end(); ++iterator) {
    // We only consider instructions that have all their operands ready.
    if (cycle >= (*iterator)->start_cycle()) {
      candidate = iterator;
      break;
    }
  }

  if (candidate != nodes_.end()) {
    ScheduleGraphNode* result = *candidate;
    nodes_.erase(candidate);
    return result;
  }

  return nullptr;
}

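// The stress scheduler ignores latencies and cycle counts altogether and
// picks a random node whose predecessors have all been scheduled. Used under
// --turbo-stress-instruction-scheduling, this helps surface missing
// dependency edges that the critical-path-first order might otherwise hide.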
InstructionScheduler::ScheduleGraphNode*
InstructionScheduler::StressSchedulerQueue::PopBestCandidate(int cycle) {
  DCHECK(!IsEmpty());
  // Choose a random element from the ready list.
  auto candidate = nodes_.begin();
  std::advance(candidate, random_number_generator()->NextInt(
                              static_cast<int>(nodes_.size())));
  ScheduleGraphNode* result = *candidate;
  nodes_.erase(candidate);
  return result;
}

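// total_latency_ and start_cycle_ start out as -1: the former is filled in by
// ComputeTotalLatencies() before scheduling, the latter is raised in
// Schedule() as the node's predecessors get scheduled.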
InstructionScheduler::ScheduleGraphNode::ScheduleGraphNode(Zone* zone,
                                                           Instruction* instr)
    : instr_(instr),
      successors_(zone),
      unscheduled_predecessors_count_(0),
      latency_(GetInstructionLatency(instr)),
      total_latency_(-1),
      start_cycle_(-1) {}

void InstructionScheduler::ScheduleGraphNode::AddSuccessor(
    ScheduleGraphNode* node) {
  successors_.push_back(node);
  node->unscheduled_predecessors_count_++;
}

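// Rough usage sketch. The scheduler is driven externally, typically by the
// code generation pipeline; the names below are illustrative only:
//
//   InstructionScheduler scheduler(zone, sequence);
//   scheduler.StartBlock(rpo);
//   for (Instruction* instr : block_body) {  // hypothetical iteration
//     scheduler.AddInstruction(instr);
//   }
//   scheduler.AddTerminator(terminator);
//   scheduler.EndBlock(rpo);  // runs Schedule() and emits into |sequence|.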
InstructionScheduler::InstructionScheduler(Zone* zone,
                                           InstructionSequence* sequence)
    : zone_(zone),
      sequence_(sequence),
      graph_(zone),
      last_side_effect_instr_(nullptr),
      pending_loads_(zone),
      last_live_in_reg_marker_(nullptr),
      last_deopt_or_trap_(nullptr),
      operands_map_(zone) {
  if (FLAG_turbo_stress_instruction_scheduling) {
    random_number_generator_ =
        base::Optional<base::RandomNumberGenerator>(FLAG_random_seed);
  }
}

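// StartBlock() and EndBlock() bracket the instructions of a single basic
// block. The dependency graph is built incrementally by AddInstruction() and
// AddTerminator(); EndBlock() then runs the scheduler over that graph and
// emits the (possibly reordered) instructions into the sequence.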
void InstructionScheduler::StartBlock(RpoNumber rpo) {
  DCHECK(graph_.empty());
  DCHECK_NULL(last_side_effect_instr_);
  DCHECK(pending_loads_.empty());
  DCHECK_NULL(last_live_in_reg_marker_);
  DCHECK_NULL(last_deopt_or_trap_);
  DCHECK(operands_map_.empty());
  sequence()->StartBlock(rpo);
}

void InstructionScheduler::EndBlock(RpoNumber rpo) {
  if (FLAG_turbo_stress_instruction_scheduling) {
    Schedule<StressSchedulerQueue>();
  } else {
    Schedule<CriticalPathFirstQueue>();
  }
  sequence()->EndBlock(rpo);
}

void InstructionScheduler::AddTerminator(Instruction* instr) {
  ScheduleGraphNode* new_node = zone()->New<ScheduleGraphNode>(zone(), instr);
  // Make sure that basic block terminators are not moved by adding them
  // as a successor of every instruction already in the graph.
  for (ScheduleGraphNode* node : graph_) {
    node->AddSuccessor(new_node);
  }
  graph_.push_back(new_node);
}

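// AddInstruction() appends a node for |instr| to the dependency graph and adds
// edges so that scheduling preserves: the relative order of side-effecting
// instructions and loads, the order of deopts/traps relative to side effects,
// the position of fixed-register parameter markers, and def-use dependencies
// between virtual registers.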
void InstructionScheduler::AddInstruction(Instruction* instr) {
  if (IsBarrier(instr)) {
    if (FLAG_turbo_stress_instruction_scheduling) {
      Schedule<StressSchedulerQueue>();
    } else {
      Schedule<CriticalPathFirstQueue>();
    }
    sequence()->AddInstruction(instr);
    return;
  }

  ScheduleGraphNode* new_node = zone()->New<ScheduleGraphNode>(zone(), instr);

  // We should not have branches in the middle of a block.
  DCHECK_NE(instr->flags_mode(), kFlags_branch);
  DCHECK_NE(instr->flags_mode(), kFlags_branch_and_poison);

  if (IsFixedRegisterParameter(instr)) {
    if (last_live_in_reg_marker_ != nullptr) {
      last_live_in_reg_marker_->AddSuccessor(new_node);
    }
    last_live_in_reg_marker_ = new_node;
  } else {
    if (last_live_in_reg_marker_ != nullptr) {
      last_live_in_reg_marker_->AddSuccessor(new_node);
    }

    // Make sure that instructions are not scheduled before the last
    // deoptimization or trap point when they depend on it.
    if ((last_deopt_or_trap_ != nullptr) && DependsOnDeoptOrTrap(instr)) {
      last_deopt_or_trap_->AddSuccessor(new_node);
    }

    // Instructions with side effects and memory operations can't be
    // reordered with respect to each other.
    if (HasSideEffect(instr)) {
      if (last_side_effect_instr_ != nullptr) {
        last_side_effect_instr_->AddSuccessor(new_node);
      }
      for (ScheduleGraphNode* load : pending_loads_) {
        load->AddSuccessor(new_node);
      }
      pending_loads_.clear();
      last_side_effect_instr_ = new_node;
    } else if (IsLoadOperation(instr)) {
      // Load operations can't be reordered with side-effecting instructions,
      // but independent loads can be reordered with respect to each other.
      if (last_side_effect_instr_ != nullptr) {
        last_side_effect_instr_->AddSuccessor(new_node);
      }
      pending_loads_.push_back(new_node);
    } else if (instr->IsDeoptimizeCall() || instr->IsTrap()) {
      // Ensure that deopts or traps are not reordered with respect to
      // side-effecting instructions.
      if (last_side_effect_instr_ != nullptr) {
        last_side_effect_instr_->AddSuccessor(new_node);
      }
      last_deopt_or_trap_ = new_node;
    }

    // Look for operand dependencies.
    for (size_t i = 0; i < instr->InputCount(); ++i) {
      const InstructionOperand* input = instr->InputAt(i);
      if (input->IsUnallocated()) {
        int32_t vreg = UnallocatedOperand::cast(input)->virtual_register();
        auto it = operands_map_.find(vreg);
        if (it != operands_map_.end()) {
          it->second->AddSuccessor(new_node);
        }
      }
    }

    // Record the virtual registers defined by this instruction.
    for (size_t i = 0; i < instr->OutputCount(); ++i) {
      const InstructionOperand* output = instr->OutputAt(i);
      if (output->IsUnallocated()) {
        operands_map_[UnallocatedOperand::cast(output)->virtual_register()] =
            new_node;
      } else if (output->IsConstant()) {
        operands_map_[ConstantOperand::cast(output)->virtual_register()] =
            new_node;
      }
    }
  }

  graph_.push_back(new_node);
}

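// Schedule() is a standard list-scheduling loop. A minimal sketch of the
// algorithm, in terms of the graph and queues defined above:
//
//   compute total latencies (longest path from each node to the block end);
//   put all nodes without predecessors on the ready list;
//   while the ready list is not empty:
//     pop the queue's best candidate for the current cycle (may be none);
//     if there is one, emit it and add to the ready list any successor
//       whose predecessors have now all been scheduled;
//     advance the cycle counter.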
template <typename QueueType>
void InstructionScheduler::Schedule() {
  QueueType ready_list(this);

  // Compute total latencies so that we can schedule the critical path first.
  ComputeTotalLatencies();

  // Add nodes which don't have dependencies to the ready list.
  for (ScheduleGraphNode* node : graph_) {
    if (!node->HasUnscheduledPredecessor()) {
      ready_list.AddNode(node);
    }
  }

  // Go through the ready list and schedule the instructions.
  int cycle = 0;
  while (!ready_list.IsEmpty()) {
    ScheduleGraphNode* candidate = ready_list.PopBestCandidate(cycle);

    if (candidate != nullptr) {
      sequence()->AddInstruction(candidate->instruction());

      for (ScheduleGraphNode* successor : candidate->successors()) {
        successor->DropUnscheduledPredecessor();
        successor->set_start_cycle(
            std::max(successor->start_cycle(), cycle + candidate->latency()));

        if (!successor->HasUnscheduledPredecessor()) {
          ready_list.AddNode(successor);
        }
      }
    }

    cycle++;
  }

  // Reset own state.
  graph_.clear();
  operands_map_.clear();
  pending_loads_.clear();
  last_deopt_or_trap_ = nullptr;
  last_live_in_reg_marker_ = nullptr;
  last_side_effect_instr_ = nullptr;
}

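// Maps an instruction to the scheduler's coarse dependency classes:
// kNoOpcodeFlags (freely reorderable), kIsLoadOperation (must not move across
// side effects), kHasSideEffect (keeps its order relative to other side
// effects and to loads), and kIsBarrier (forces everything before it to be
// scheduled and is emitted in place).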
int InstructionScheduler::GetInstructionFlags(const Instruction* instr) const {
  switch (instr->arch_opcode()) {
    case kArchNop:
    case kArchStackCheckOffset:
    case kArchFramePointer:
    case kArchParentFramePointer:
    case kArchStackSlot:  // Despite its name this opcode will produce a
                          // reference to a frame slot, so it is not affected
                          // by the arm64 dual stack issues mentioned below.
    case kArchComment:
    case kArchDeoptimize:
    case kArchJmp:
    case kArchBinarySearchSwitch:
    case kArchRet:
    case kArchTableSwitch:
    case kArchThrowTerminator:
      return kNoOpcodeFlags;

    case kArchTruncateDoubleToI:
    case kIeee754Float64Acos:
    case kIeee754Float64Acosh:
    case kIeee754Float64Asin:
    case kIeee754Float64Asinh:
    case kIeee754Float64Atan:
    case kIeee754Float64Atanh:
    case kIeee754Float64Atan2:
    case kIeee754Float64Cbrt:
    case kIeee754Float64Cos:
    case kIeee754Float64Cosh:
    case kIeee754Float64Exp:
    case kIeee754Float64Expm1:
    case kIeee754Float64Log:
    case kIeee754Float64Log1p:
    case kIeee754Float64Log10:
    case kIeee754Float64Log2:
    case kIeee754Float64Pow:
    case kIeee754Float64Sin:
    case kIeee754Float64Sinh:
    case kIeee754Float64Tan:
    case kIeee754Float64Tanh:
      return kNoOpcodeFlags;

    case kArchStackPointerGreaterThan:
      // The ArchStackPointerGreaterThan instruction loads the current stack
      // pointer value and must not be reordered with instructions with side
      // effects.
      return kIsLoadOperation;

    case kArchWordPoisonOnSpeculation:
      // While poisoning operations have no side effect, they must not be
      // reordered relative to branches.
      return kHasSideEffect;

    case kArchPrepareCallCFunction:
    case kArchPrepareTailCall:
    case kArchTailCallCodeObjectFromJSFunction:
    case kArchTailCallCodeObject:
    case kArchTailCallAddress:
    case kArchTailCallWasm:
    case kArchAbortCSAAssert:
      return kHasSideEffect;

    case kArchDebugBreak:
      return kIsBarrier;

    case kArchSaveCallerRegisters:
    case kArchRestoreCallerRegisters:
      return kIsBarrier;

    case kArchCallCFunction:
    case kArchCallCodeObject:
    case kArchCallJSFunction:
    case kArchCallWasmFunction:
    case kArchCallBuiltinPointer:
      // Calls can cause GC and GC may relocate objects. If a pure instruction
      // operates on a tagged pointer that was cast to a word then it may be
      // incorrect to move the instruction across the call. Hence we mark all
      // (non-tail-)calls as barriers.
      return kIsBarrier;

    case kArchStoreWithWriteBarrier:
      return kHasSideEffect;

    case kWord32AtomicLoadInt8:
    case kWord32AtomicLoadUint8:
    case kWord32AtomicLoadInt16:
    case kWord32AtomicLoadUint16:
    case kWord32AtomicLoadWord32:
      return kIsLoadOperation;

    case kWord32AtomicStoreWord8:
    case kWord32AtomicStoreWord16:
    case kWord32AtomicStoreWord32:
      return kHasSideEffect;

    case kWord32AtomicExchangeInt8:
    case kWord32AtomicExchangeUint8:
    case kWord32AtomicExchangeInt16:
    case kWord32AtomicExchangeUint16:
    case kWord32AtomicExchangeWord32:
    case kWord32AtomicCompareExchangeInt8:
    case kWord32AtomicCompareExchangeUint8:
    case kWord32AtomicCompareExchangeInt16:
    case kWord32AtomicCompareExchangeUint16:
    case kWord32AtomicCompareExchangeWord32:
    case kWord32AtomicAddInt8:
    case kWord32AtomicAddUint8:
    case kWord32AtomicAddInt16:
    case kWord32AtomicAddUint16:
    case kWord32AtomicAddWord32:
    case kWord32AtomicSubInt8:
    case kWord32AtomicSubUint8:
    case kWord32AtomicSubInt16:
    case kWord32AtomicSubUint16:
    case kWord32AtomicSubWord32:
    case kWord32AtomicAndInt8:
    case kWord32AtomicAndUint8:
    case kWord32AtomicAndInt16:
    case kWord32AtomicAndUint16:
    case kWord32AtomicAndWord32:
    case kWord32AtomicOrInt8:
    case kWord32AtomicOrUint8:
    case kWord32AtomicOrInt16:
    case kWord32AtomicOrUint16:
    case kWord32AtomicOrWord32:
    case kWord32AtomicXorInt8:
    case kWord32AtomicXorUint8:
    case kWord32AtomicXorInt16:
    case kWord32AtomicXorUint16:
    case kWord32AtomicXorWord32:
      return kHasSideEffect;

#define CASE(Name) case k##Name:
      TARGET_ARCH_OPCODE_LIST(CASE)
#undef CASE
      return GetTargetInstructionFlags(instr);
  }

  UNREACHABLE();
}

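// graph_ holds the nodes in the order the instructions were added, and every
// dependency edge points from an earlier node to a later one. Walking the
// graph in reverse therefore visits all successors of a node before the node
// itself, so their total latencies are already known (see the DCHECK below).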
void InstructionScheduler::ComputeTotalLatencies() {
  for (ScheduleGraphNode* node : base::Reversed(graph_)) {
    int max_latency = 0;

    for (ScheduleGraphNode* successor : node->successors()) {
      DCHECK_NE(-1, successor->total_latency());
      if (successor->total_latency() > max_latency) {
        max_latency = successor->total_latency();
      }
    }

    node->set_total_latency(max_latency + node->latency());
  }
}

}  // namespace compiler
}  // namespace internal
}  // namespace v8