Blame - compiler/optimizing/scheduler.h - platform/art

blob: 9236a0e4fa79fc7ec54f19f58144addc5fafeaae [file] [log] [blame]

Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2016 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
				18	#define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
				19
				20	#include <fstream>
				21
				22	#include "base/time_utils.h"
				23	#include "driver/compiler_driver.h"
				24	#include "nodes.h"
				25	#include "optimization.h"
				26
				27	namespace art {
				28
				29	// General description of instruction scheduling.
				30	//
				31	// This pass tries to improve the quality of the generated code by reordering
				32	// instructions in the graph to avoid execution delays caused by execution
				33	// dependencies.
				34	// Currently, scheduling is performed at the block level, so no `HInstruction`
				35	// ever leaves its block in this pass.
				36	//
				37	// The scheduling process iterates through blocks in the graph. For blocks that
				38	// we can and want to schedule:
				39	// 1) Build a dependency graph for instructions.
				40	// It includes data dependencies (inputs/uses), but also environment
				41	// dependencies and side-effect dependencies.
				42	// 2) Schedule the dependency graph.
				43	// This is a topological sort of the dependency graph, using heuristics to
				44	// decide what node to scheduler first when there are multiple candidates.
				45	//
				46	// A few factors impacting the quality of the scheduling are:
				47	// - The heuristics used to decide what node to schedule in the topological sort
				48	// when there are multiple valid candidates. There is a wide range of
				49	// complexity possible here, going from a simple model only considering
				50	// latencies, to a super detailed CPU pipeline model.
				51	// - Fewer dependencies in the dependency graph give more freedom for the
				52	// scheduling heuristics. For example de-aliasing can allow possibilities for
				53	// reordering of memory accesses.
				54	// - The level of abstraction of the IR. It is easier to evaluate scheduling for
				55	// IRs that translate to a single assembly instruction than for IRs
				56	// that generate multiple assembly instructions or generate different code
				57	// depending on properties of the IR.
				58	// - Scheduling is performed before register allocation, it is not aware of the
				59	// impact of moving instructions on register allocation.
				60	//
				61	//
				62	// The scheduling code uses the terms predecessors, successors, and dependencies.
				63	// This can be confusing at times, so here are clarifications.
				64	// These terms are used from the point of view of the program dependency graph. So
				65	// the inputs of an instruction are part of its dependencies, and hence part its
				66	// predecessors. So the uses of an instruction are (part of) its successors.
				67	// (Side-effect dependencies can yield predecessors or successors that are not
				68	// inputs or uses.)
				69	//
				70	// Here is a trivial example. For the Java code:
				71	//
				72	// int a = 1 + 2;
				73	//
				74	// we would have the instructions
				75	//
				76	// i1 HIntConstant 1
				77	// i2 HIntConstant 2
				78	// i3 HAdd [i1,i2]
				79	//
				80	// `i1` and `i2` are predecessors of `i3`.
				81	// `i3` is a successor of `i1` and a successor of `i2`.
				82	// In a scheduling graph for this code we would have three nodes `n1`, `n2`,
				83	// and `n3` (respectively for instructions `i1`, `i1`, and `i3`).
				84	// Conceptually the program dependency graph for this would contain two edges
				85	//
				86	// n1 -> n3
				87	// n2 -> n3
				88	//
				89	// Since we schedule backwards (starting from the last instruction in each basic
				90	// block), the implementation of nodes keeps a list of pointers their
				91	// predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
				92	//
				93	// Node dependencies are also referred to from the program dependency graph
				94	// point of view: we say that node `B` immediately depends on `A` if there is an
				95	// edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
				96	// `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
				97	// `n2`.
				98	// Since nodes in the scheduling graph keep a list of their predecessors, node
				99	// `B` will have a pointer to its predecessor `A`.
				100	// As we schedule backwards, `B` will be selected for scheduling before `A` is.
				101	//
				102	// So the scheduling for the example above could happen as follow
				103	//
				104	// \|---------------------------+------------------------\|
				105	// \| candidates for scheduling \| instructions scheduled \|
				106	// \| --------------------------+------------------------\|
				107	//
				108	// The only node without successors is `n3`, so it is the only initial
				109	// candidate.
				110	//
				111	// \| n3 \| (none) \|
				112	//
				113	// We schedule `n3` as the last (and only) instruction. All its predecessors
				114	// that do not have any unscheduled successors become candidate. That is, `n1`
				115	// and `n2` become candidates.
				116	//
				117	// \| n1, n2 \| n3 \|
				118	//
				119	// One of the candidates is selected. In practice this is where scheduling
				120	// heuristics kick in, to decide which of the candidates should be selected.
				121	// In this example, let it be `n1`. It is scheduled before previously scheduled
				122	// nodes (in program order). There are no other nodes to add to the list of
				123	// candidates.
				124	//
				125	// \| n2 \| n1 \|
				126	// \| \| n3 \|
				127	//
				128	// The only candidate available for scheduling is `n2`. Schedule it before
				129	// (in program order) the previously scheduled nodes.
				130	//
				131	// \| (none) \| n2 \|
				132	// \| \| n1 \|
				133	// \| \| n3 \|
				134	// \|---------------------------+------------------------\|
				135	//
				136	// So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
				137	// In this trivial example, it does not matter which of `i1` and `i2` is
				138	// scheduled first since they are constants. However the same process would
				139	// apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
				140
				141	// Set to true to have instruction scheduling dump scheduling graphs to the file
				142	// `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
				143	static constexpr bool kDumpDotSchedulingGraphs = false;
				144
				145	// Typically used as a default instruction latency.
				146	static constexpr uint32_t kGenericInstructionLatency = 1;
				147
				148	class HScheduler;
				149
				150	/**
				151	* A node representing an `HInstruction` in the `SchedulingGraph`.
				152	*/
				153	class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
				154	public:
				155	SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
				156	: latency_(0),
				157	internal_latency_(0),
				158	critical_path_(0),
				159	instruction_(instr),
				160	is_scheduling_barrier_(is_scheduling_barrier),
				161	data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
				162	other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
				163	num_unscheduled_successors_(0) {
				164	data_predecessors_.reserve(kPreallocatedPredecessors);
				165	}
				166
				167	void AddDataPredecessor(SchedulingNode* predecessor) {
				168	data_predecessors_.push_back(predecessor);
				169	predecessor->num_unscheduled_successors_++;
				170	}
				171
				172	void AddOtherPredecessor(SchedulingNode* predecessor) {
				173	other_predecessors_.push_back(predecessor);
				174	predecessor->num_unscheduled_successors_++;
				175	}
				176
				177	void DecrementNumberOfUnscheduledSuccessors() {
				178	num_unscheduled_successors_--;
				179	}
				180
				181	void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
				182	critical_path_ = std::max(critical_path_, other_critical_path);
				183	}
				184
				185	bool HasUnscheduledSuccessors() const {
				186	return num_unscheduled_successors_ != 0;
				187	}
				188
				189	HInstruction* GetInstruction() const { return instruction_; }
				190	uint32_t GetLatency() const { return latency_; }
				191	void SetLatency(uint32_t latency) { latency_ = latency; }
				192	uint32_t GetInternalLatency() const { return internal_latency_; }
				193	void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
				194	uint32_t GetCriticalPath() const { return critical_path_; }
				195	bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
				196	const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
				197	const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }
				198
				199	private:
				200	// The latency of this node. It represents the latency between the moment the
				201	// last instruction for this node has executed to the moment the result
				202	// produced by this node is available to users.
				203	uint32_t latency_;
				204	// This represents the time spent within the generated code for this node.
				205	// It should be zero for nodes that only generate a single instruction.
				206	uint32_t internal_latency_;
				207
				208	// The critical path from this instruction to the end of scheduling. It is
				209	// used by the scheduling heuristics to measure the priority of this instruction.
				210	// It is defined as
				211	// critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
				212	// (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
				213	// `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
				214	uint32_t critical_path_;
				215
				216	// The instruction that this node represents.
				217	HInstruction* const instruction_;
				218
				219	// If a node is scheduling barrier, other nodes cannot be scheduled before it.
				220	const bool is_scheduling_barrier_;
				221
				222	// The lists of predecessors. They cannot be scheduled before this node. Once
				223	// this node is scheduled, we check whether any of its predecessors has become a
				224	// valid candidate for scheduling.
				225	// Predecessors in `data_predecessors_` are data dependencies. Those in
				226	// `other_predecessors_` contain side-effect dependencies, environment
				227	// dependencies, and scheduling barrier dependencies.
				228	ArenaVector<SchedulingNode*> data_predecessors_;
				229	ArenaVector<SchedulingNode*> other_predecessors_;
				230
				231	// The number of unscheduled successors for this node. This number is
				232	// decremented as successors are scheduled. When it reaches zero this node
				233	// becomes a valid candidate to schedule.
				234	uint32_t num_unscheduled_successors_;
				235
				236	static constexpr size_t kPreallocatedPredecessors = 4;
				237	};
				238
				239	/*
				240	* Directed acyclic graph for scheduling.
				241	*/
				242	class SchedulingGraph : public ValueObject {
				243	public:
				244	SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
				245	: scheduler_(scheduler),
				246	arena_(arena),
				247	contains_scheduling_barrier_(false),
				248	nodes_map_(arena_->Adapter(kArenaAllocScheduler)) {}
				249
				250	SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
				251	SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
				252	nodes_map_.Insert(std::make_pair(instr, node));
				253	contains_scheduling_barrier_ \|= is_scheduling_barrier;
				254	AddDependencies(instr, is_scheduling_barrier);
				255	return node;
				256	}
				257
				258	void Clear() {
				259	nodes_map_.Clear();
				260	contains_scheduling_barrier_ = false;
				261	}
				262
				263	SchedulingNode* GetNode(const HInstruction* instr) const {
				264	auto it = nodes_map_.Find(instr);
				265	if (it == nodes_map_.end()) {
				266	return nullptr;
				267	} else {
				268	return it->second;
				269	}
				270	}
				271
				272	bool IsSchedulingBarrier(const HInstruction* instruction) const;
				273
				274	bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
				275	bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
				276	bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
				277	bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
				278
				279	size_t Size() const {
				280	return nodes_map_.Size();
				281	}
				282
				283	// Dump the scheduling graph, in dot file format, appending it to the file
				284	// `scheduling_graphs.dot`.
				285	void DumpAsDotGraph(const std::string& description,
				286	const ArenaVector<SchedulingNode*>& initial_candidates);
				287
				288	protected:
				289	void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
				290	void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
				291	AddDependency(node, dependency, /is_data_dependency/true);
				292	}
				293	void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
				294	AddDependency(node, dependency, /is_data_dependency/false);
				295	}
				296
				297	// Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
				298	void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
				299
				300	const HScheduler* const scheduler_;
				301
				302	ArenaAllocator* const arena_;
				303
				304	bool contains_scheduling_barrier_;
				305
				306	ArenaHashMap<const HInstruction, SchedulingNode> nodes_map_;
				307	};
				308
				309	/*
				310	* The visitors derived from this base class are used by schedulers to evaluate
				311	* the latencies of `HInstruction`s.
				312	*/
				313	class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
				314	public:
				315	// This class and its sub-classes will never be used to drive a visit of an
				316	// `HGraph` but only to visit `HInstructions` one at a time, so we do not need
				317	// to pass a valid graph to `HGraphDelegateVisitor()`.
Andreas Gampe	d9911ee	2017-03-27 13:27:24 -0700	[diff] [blame^]	318	SchedulingLatencyVisitor()
				319	: HGraphDelegateVisitor(nullptr),
				320	last_visited_latency_(0),
				321	last_visited_internal_latency_(0) {}
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	322
				323	void VisitInstruction(HInstruction* instruction) OVERRIDE {
				324	LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
				325	"Architecture-specific scheduling latency visitors must handle all instructions"
				326	" (potentially by overriding the generic `VisitInstruction()`.";
				327	UNREACHABLE();
				328	}
				329
				330	void Visit(HInstruction* instruction) {
				331	instruction->Accept(this);
				332	}
				333
				334	void CalculateLatency(SchedulingNode* node) {
				335	// By default nodes have no internal latency.
				336	last_visited_internal_latency_ = 0;
				337	Visit(node->GetInstruction());
				338	}
				339
				340	uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
				341	uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
				342
				343	protected:
				344	// The latency of the most recent visited SchedulingNode.
				345	// This is for reporting the latency value to the user of this visitor.
				346	uint32_t last_visited_latency_;
				347	// This represents the time spent within the generated code for the most recent visited
				348	// SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
				349	uint32_t last_visited_internal_latency_;
				350	};
				351
				352	class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
				353	public:
				354	virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				355	const SchedulingGraph& graph) = 0;
				356	virtual ~SchedulingNodeSelector() {}
				357	protected:
				358	static void DeleteNodeAtIndex(ArenaVector<SchedulingNode> nodes, size_t index) {
				359	(*nodes)[index] = nodes->back();
				360	nodes->pop_back();
				361	}
				362	};
				363
				364	/*
				365	* Select a `SchedulingNode` at random within the candidates.
				366	*/
				367	class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
				368	public:
				369	explicit RandomSchedulingNodeSelector() : seed_(0) {
				370	seed_ = static_cast<uint32_t>(NanoTime());
				371	srand(seed_);
				372	}
				373
				374	SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				375	const SchedulingGraph& graph) OVERRIDE {
				376	UNUSED(graph);
				377	DCHECK(!nodes->empty());
				378	size_t select = rand_r(&seed_) % nodes->size();
				379	SchedulingNode* select_node = (*nodes)[select];
				380	DeleteNodeAtIndex(nodes, select);
				381	return select_node;
				382	}
				383
				384	uint32_t seed_;
				385	};
				386
				387	/*
				388	* Select a `SchedulingNode` according to critical path information,
				389	* with heuristics to favor certain instruction patterns like materialized condition.
				390	*/
				391	class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
				392	public:
				393	CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
				394
				395	SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				396	const SchedulingGraph& graph) OVERRIDE;
				397
				398	protected:
				399	SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
				400	SchedulingNode* check) const;
				401
				402	SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode> nodes,
				403	const SchedulingGraph& graph) const;
				404
				405	private:
				406	const SchedulingNode* prev_select_;
				407	};
				408
				409	class HScheduler {
				410	public:
				411	HScheduler(ArenaAllocator* arena,
				412	SchedulingLatencyVisitor* latency_visitor,
				413	SchedulingNodeSelector* selector)
				414	: arena_(arena),
				415	latency_visitor_(latency_visitor),
				416	selector_(selector),
				417	only_optimize_loop_blocks_(true),
				418	scheduling_graph_(this, arena),
Andreas Gampe	d9911ee	2017-03-27 13:27:24 -0700	[diff] [blame^]	419	cursor_(nullptr),
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	420	candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
				421	virtual ~HScheduler() {}
				422
				423	void Schedule(HGraph* graph);
				424
				425	void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
				426
				427	// Instructions can not be rescheduled across a scheduling barrier.
				428	virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
				429
				430	protected:
				431	void Schedule(HBasicBlock* block);
				432	void Schedule(SchedulingNode* scheduling_node);
				433	void Schedule(HInstruction* instruction);
				434
				435	// Any instruction returning `false` via this method will prevent its
				436	// containing basic block from being scheduled.
				437	// This method is used to restrict scheduling to instructions that we know are
				438	// safe to handle.
				439	virtual bool IsSchedulable(const HInstruction* instruction) const;
				440	bool IsSchedulable(const HBasicBlock* block) const;
				441
				442	void CalculateLatency(SchedulingNode* node) {
				443	latency_visitor_->CalculateLatency(node);
				444	node->SetLatency(latency_visitor_->GetLastVisitedLatency());
				445	node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
				446	}
				447
				448	ArenaAllocator* const arena_;
				449	SchedulingLatencyVisitor* const latency_visitor_;
				450	SchedulingNodeSelector* const selector_;
				451	bool only_optimize_loop_blocks_;
				452
				453	// We instantiate the members below as part of this class to avoid
				454	// instantiating them locally for every chunk scheduled.
				455	SchedulingGraph scheduling_graph_;
				456	// A pointer indicating where the next instruction to be scheduled will be inserted.
				457	HInstruction* cursor_;
				458	// The list of candidates for scheduling. A node becomes a candidate when all
				459	// its predecessors have been scheduled.
				460	ArenaVector<SchedulingNode*> candidates_;
				461
				462	private:
				463	DISALLOW_COPY_AND_ASSIGN(HScheduler);
				464	};
				465
				466	inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
				467	return scheduler_->IsSchedulingBarrier(instruction);
				468	}
				469
				470	class HInstructionScheduling : public HOptimization {
				471	public:
				472	HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
				473	: HOptimization(graph, kInstructionScheduling),
				474	instruction_set_(instruction_set) {}
				475
				476	void Run() {
				477	Run(/only_optimize_loop_blocks/ true, /schedule_randomly/ false);
				478	}
				479	void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
				480
				481	static constexpr const char* kInstructionScheduling = "scheduler";
				482
				483	const InstructionSet instruction_set_;
				484
				485	private:
				486	DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
				487	};
				488
				489	} // namespace art
				490
				491	#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_H_