Blame - compiler/optimizing/scheduler.h - platform/art

blob: 73e8087cd049b701246f1c44a99d07f863bd7301 [file] [log] [blame]

Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2016 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
				18	#define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
				19
				20	#include <fstream>
				21
				22	#include "base/time_utils.h"
				23	#include "driver/compiler_driver.h"
				24	#include "nodes.h"
				25	#include "optimization.h"
xueliang.zhong	f7caf68	2017-03-01 16:07:02 +0000	[diff] [blame^]	26	#include "code_generator.h"
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	27
				28	namespace art {
				29
				30	// General description of instruction scheduling.
				31	//
				32	// This pass tries to improve the quality of the generated code by reordering
				33	// instructions in the graph to avoid execution delays caused by execution
				34	// dependencies.
				35	// Currently, scheduling is performed at the block level, so no `HInstruction`
				36	// ever leaves its block in this pass.
				37	//
				38	// The scheduling process iterates through blocks in the graph. For blocks that
				39	// we can and want to schedule:
				40	// 1) Build a dependency graph for instructions.
				41	// It includes data dependencies (inputs/uses), but also environment
				42	// dependencies and side-effect dependencies.
				43	// 2) Schedule the dependency graph.
				44	// This is a topological sort of the dependency graph, using heuristics to
				45	// decide what node to schedule first when there are multiple candidates.
				46	//
				47	// A few factors impacting the quality of the scheduling are:
				48	// - The heuristics used to decide what node to schedule in the topological sort
				49	// when there are multiple valid candidates. There is a wide range of
				50	// complexity possible here, going from a simple model only considering
				51	// latencies, to a super detailed CPU pipeline model.
				52	// - Fewer dependencies in the dependency graph give more freedom for the
				53	// scheduling heuristics. For example de-aliasing can allow possibilities for
				54	// reordering of memory accesses.
				55	// - The level of abstraction of the IR. It is easier to evaluate scheduling for
				56	// IRs that translate to a single assembly instruction than for IRs
				57	// that generate multiple assembly instructions or generate different code
				58	// depending on properties of the IR.
				59	// - Scheduling is performed before register allocation, it is not aware of the
				60	// impact of moving instructions on register allocation.
				61	//
				62	//
				63	// The scheduling code uses the terms predecessors, successors, and dependencies.
				64	// This can be confusing at times, so here are clarifications.
				65	// These terms are used from the point of view of the program dependency graph. So
				66	// the inputs of an instruction are part of its dependencies, and hence part of its
				67	// predecessors. So the uses of an instruction are (part of) its successors.
				68	// (Side-effect dependencies can yield predecessors or successors that are not
				69	// inputs or uses.)
				70	//
				71	// Here is a trivial example. For the Java code:
				72	//
				73	// int a = 1 + 2;
				74	//
				75	// we would have the instructions
				76	//
				77	// i1 HIntConstant 1
				78	// i2 HIntConstant 2
				79	// i3 HAdd [i1,i2]
				80	//
				81	// `i1` and `i2` are predecessors of `i3`.
				82	// `i3` is a successor of `i1` and a successor of `i2`.
				83	// In a scheduling graph for this code we would have three nodes `n1`, `n2`,
				84	// and `n3` (respectively for instructions `i1`, `i2`, and `i3`).
				85	// Conceptually the program dependency graph for this would contain two edges
				86	//
				87	// n1 -> n3
				88	// n2 -> n3
				89	//
				90	// Since we schedule backwards (starting from the last instruction in each basic
				91	// block), the implementation of nodes keeps a list of pointers to their
				92	// predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
				93	//
				94	// Node dependencies are also referred to from the program dependency graph
				95	// point of view: we say that node `B` immediately depends on `A` if there is an
				96	// edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
				97	// `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
				98	// `n2`.
				99	// Since nodes in the scheduling graph keep a list of their predecessors, node
				100	// `B` will have a pointer to its predecessor `A`.
				101	// As we schedule backwards, `B` will be selected for scheduling before `A` is.
				102	//
				103	// So the scheduling for the example above could happen as follows:
				104	//
				105	//    |---------------------------+------------------------|
				106	//    | candidates for scheduling | instructions scheduled |
				107	//    | --------------------------+------------------------|
				108	//
				109	// The only node without successors is `n3`, so it is the only initial
				110	// candidate.
				111	//
				112	//    | n3                        | (none)                 |
				113	//
				114	// We schedule `n3` as the last (and only) instruction. All its predecessors
				115	// that do not have any unscheduled successors become candidate. That is, `n1`
				116	// and `n2` become candidates.
				117	//
				118	//    | n1, n2                    | n3                     |
				119	//
				120	// One of the candidates is selected. In practice this is where scheduling
				121	// heuristics kick in, to decide which of the candidates should be selected.
				122	// In this example, let it be `n1`. It is scheduled before previously scheduled
				123	// nodes (in program order). There are no other nodes to add to the list of
				124	// candidates.
				125	//
				126	//    | n2                        | n1                     |
				127	//    |                           | n3                     |
				128	//
				129	// The only candidate available for scheduling is `n2`. Schedule it before
				130	// (in program order) the previously scheduled nodes.
				131	//
				132	//    | (none)                    | n2                     |
				133	//    |                           | n1                     |
				134	//    |                           | n3                     |
				135	//    |---------------------------+------------------------|
				136	//
				137	// So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
				138	// In this trivial example, it does not matter which of `i1` and `i2` is
				139	// scheduled first since they are constants. However the same process would
				140	// apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
				141
				142	// Set to true to have instruction scheduling dump scheduling graphs to the file
				143	// `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
					// This is a compile-time constant, so changing it requires rebuilding.
				144	static constexpr bool kDumpDotSchedulingGraphs = false;
				145
				146	// Typically used as a default instruction latency.
					// NOTE(review): the unit is not stated in this header; presumably an abstract
					// scheduling cost rather than a wall-clock time - confirm against the
					// architecture-specific latency visitors.
				147	static constexpr uint32_t kGenericInstructionLatency = 1;
				148
					// Forward declaration: `SchedulingGraph` below stores a `const HScheduler*`
					// and is defined before `HScheduler` itself.
				149	class HScheduler;
				150
				151	/**
				152	* A node representing an `HInstruction` in the `SchedulingGraph`.
				153	*/
				154	class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
				155	public:
				156	SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
				157	: latency_(0),
				158	internal_latency_(0),
				159	critical_path_(0),
				160	instruction_(instr),
				161	is_scheduling_barrier_(is_scheduling_barrier),
				162	data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
				163	other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
				164	num_unscheduled_successors_(0) {
				165	data_predecessors_.reserve(kPreallocatedPredecessors);
				166	}
				167
				168	void AddDataPredecessor(SchedulingNode* predecessor) {
				169	data_predecessors_.push_back(predecessor);
				170	predecessor->num_unscheduled_successors_++;
				171	}
				172
				173	void AddOtherPredecessor(SchedulingNode* predecessor) {
				174	other_predecessors_.push_back(predecessor);
				175	predecessor->num_unscheduled_successors_++;
				176	}
				177
				178	void DecrementNumberOfUnscheduledSuccessors() {
				179	num_unscheduled_successors_--;
				180	}
				181
				182	void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
				183	critical_path_ = std::max(critical_path_, other_critical_path);
				184	}
				185
				186	bool HasUnscheduledSuccessors() const {
				187	return num_unscheduled_successors_ != 0;
				188	}
				189
				190	HInstruction* GetInstruction() const { return instruction_; }
				191	uint32_t GetLatency() const { return latency_; }
				192	void SetLatency(uint32_t latency) { latency_ = latency; }
				193	uint32_t GetInternalLatency() const { return internal_latency_; }
				194	void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
				195	uint32_t GetCriticalPath() const { return critical_path_; }
				196	bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
				197	const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
				198	const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }
				199
				200	private:
				201	// The latency of this node. It represents the latency between the moment the
				202	// last instruction for this node has executed to the moment the result
				203	// produced by this node is available to users.
				204	uint32_t latency_;
				205	// This represents the time spent within the generated code for this node.
				206	// It should be zero for nodes that only generate a single instruction.
				207	uint32_t internal_latency_;
				208
				209	// The critical path from this instruction to the end of scheduling. It is
				210	// used by the scheduling heuristics to measure the priority of this instruction.
				211	// It is defined as
				212	// critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
				213	// (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
				214	// `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
				215	uint32_t critical_path_;
				216
				217	// The instruction that this node represents.
				218	HInstruction* const instruction_;
				219
				220	// If a node is scheduling barrier, other nodes cannot be scheduled before it.
				221	const bool is_scheduling_barrier_;
				222
				223	// The lists of predecessors. They cannot be scheduled before this node. Once
				224	// this node is scheduled, we check whether any of its predecessors has become a
				225	// valid candidate for scheduling.
				226	// Predecessors in `data_predecessors_` are data dependencies. Those in
				227	// `other_predecessors_` contain side-effect dependencies, environment
				228	// dependencies, and scheduling barrier dependencies.
				229	ArenaVector<SchedulingNode*> data_predecessors_;
				230	ArenaVector<SchedulingNode*> other_predecessors_;
				231
				232	// The number of unscheduled successors for this node. This number is
				233	// decremented as successors are scheduled. When it reaches zero this node
				234	// becomes a valid candidate to schedule.
				235	uint32_t num_unscheduled_successors_;
				236
				237	static constexpr size_t kPreallocatedPredecessors = 4;
				238	};
				239
				240	/*
				241	* Directed acyclic graph for scheduling.
				242	*/
				243	class SchedulingGraph : public ValueObject {
				244	public:
				245	SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
				246	: scheduler_(scheduler),
				247	arena_(arena),
				248	contains_scheduling_barrier_(false),
				249	nodes_map_(arena_->Adapter(kArenaAllocScheduler)) {}
				250
				251	SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
				252	SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
				253	nodes_map_.Insert(std::make_pair(instr, node));
				254	contains_scheduling_barrier_ \|= is_scheduling_barrier;
				255	AddDependencies(instr, is_scheduling_barrier);
				256	return node;
				257	}
				258
				259	void Clear() {
				260	nodes_map_.Clear();
				261	contains_scheduling_barrier_ = false;
				262	}
				263
				264	SchedulingNode* GetNode(const HInstruction* instr) const {
				265	auto it = nodes_map_.Find(instr);
				266	if (it == nodes_map_.end()) {
				267	return nullptr;
				268	} else {
				269	return it->second;
				270	}
				271	}
				272
				273	bool IsSchedulingBarrier(const HInstruction* instruction) const;
				274
				275	bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
				276	bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
				277	bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
				278	bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
				279
				280	size_t Size() const {
				281	return nodes_map_.Size();
				282	}
				283
				284	// Dump the scheduling graph, in dot file format, appending it to the file
				285	// `scheduling_graphs.dot`.
				286	void DumpAsDotGraph(const std::string& description,
				287	const ArenaVector<SchedulingNode*>& initial_candidates);
				288
				289	protected:
				290	void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
				291	void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
				292	AddDependency(node, dependency, /is_data_dependency/true);
				293	}
				294	void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
				295	AddDependency(node, dependency, /is_data_dependency/false);
				296	}
				297
				298	// Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
				299	void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
				300
				301	const HScheduler* const scheduler_;
				302
				303	ArenaAllocator* const arena_;
				304
				305	bool contains_scheduling_barrier_;
				306
				307	ArenaHashMap<const HInstruction, SchedulingNode> nodes_map_;
				308	};
				309
				310	/*
				311	* The visitors derived from this base class are used by schedulers to evaluate
				312	* the latencies of `HInstruction`s.
				313	*/
				314	class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
				315	public:
				316	// This class and its sub-classes will never be used to drive a visit of an
				317	// `HGraph` but only to visit `HInstructions` one at a time, so we do not need
				318	// to pass a valid graph to `HGraphDelegateVisitor()`.
Andreas Gampe	d9911ee	2017-03-27 13:27:24 -0700	[diff] [blame]	319	SchedulingLatencyVisitor()
				320	: HGraphDelegateVisitor(nullptr),
				321	last_visited_latency_(0),
				322	last_visited_internal_latency_(0) {}
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	323
				324	void VisitInstruction(HInstruction* instruction) OVERRIDE {
				325	LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
				326	"Architecture-specific scheduling latency visitors must handle all instructions"
				327	" (potentially by overriding the generic `VisitInstruction()`.";
				328	UNREACHABLE();
				329	}
				330
				331	void Visit(HInstruction* instruction) {
				332	instruction->Accept(this);
				333	}
				334
				335	void CalculateLatency(SchedulingNode* node) {
				336	// By default nodes have no internal latency.
				337	last_visited_internal_latency_ = 0;
				338	Visit(node->GetInstruction());
				339	}
				340
				341	uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
				342	uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
				343
				344	protected:
				345	// The latency of the most recent visited SchedulingNode.
				346	// This is for reporting the latency value to the user of this visitor.
				347	uint32_t last_visited_latency_;
				348	// This represents the time spent within the generated code for the most recent visited
				349	// SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
				350	uint32_t last_visited_internal_latency_;
				351	};
				352
				353	class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
				354	public:
				355	virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				356	const SchedulingGraph& graph) = 0;
				357	virtual ~SchedulingNodeSelector() {}
				358	protected:
				359	static void DeleteNodeAtIndex(ArenaVector<SchedulingNode> nodes, size_t index) {
				360	(*nodes)[index] = nodes->back();
				361	nodes->pop_back();
				362	}
				363	};
				364
				365	/*
				366	* Select a `SchedulingNode` at random within the candidates.
				367	*/
				368	class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
				369	public:
				370	explicit RandomSchedulingNodeSelector() : seed_(0) {
				371	seed_ = static_cast<uint32_t>(NanoTime());
				372	srand(seed_);
				373	}
				374
				375	SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				376	const SchedulingGraph& graph) OVERRIDE {
				377	UNUSED(graph);
				378	DCHECK(!nodes->empty());
				379	size_t select = rand_r(&seed_) % nodes->size();
				380	SchedulingNode* select_node = (*nodes)[select];
				381	DeleteNodeAtIndex(nodes, select);
				382	return select_node;
				383	}
				384
				385	uint32_t seed_;
				386	};
				387
				388	/*
				389	* Select a `SchedulingNode` according to critical path information,
				390	* with heuristics to favor certain instruction patterns like materialized condition.
				391	*/
				392	class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
				393	public:
				394	CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
				395
				396	SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode> nodes,
				397	const SchedulingGraph& graph) OVERRIDE;
				398
				399	protected:
				400	SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
				401	SchedulingNode* check) const;
				402
				403	SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode> nodes,
				404	const SchedulingGraph& graph) const;
				405
				406	private:
				407	const SchedulingNode* prev_select_;
				408	};
				409
				410	class HScheduler {
				411	public:
				412	HScheduler(ArenaAllocator* arena,
				413	SchedulingLatencyVisitor* latency_visitor,
				414	SchedulingNodeSelector* selector)
				415	: arena_(arena),
				416	latency_visitor_(latency_visitor),
				417	selector_(selector),
				418	only_optimize_loop_blocks_(true),
				419	scheduling_graph_(this, arena),
Andreas Gampe	d9911ee	2017-03-27 13:27:24 -0700	[diff] [blame]	420	cursor_(nullptr),
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	421	candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
				422	virtual ~HScheduler() {}
				423
				424	void Schedule(HGraph* graph);
				425
				426	void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
				427
				428	// Instructions can not be rescheduled across a scheduling barrier.
				429	virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
				430
				431	protected:
				432	void Schedule(HBasicBlock* block);
				433	void Schedule(SchedulingNode* scheduling_node);
				434	void Schedule(HInstruction* instruction);
				435
				436	// Any instruction returning `false` via this method will prevent its
				437	// containing basic block from being scheduled.
				438	// This method is used to restrict scheduling to instructions that we know are
				439	// safe to handle.
				440	virtual bool IsSchedulable(const HInstruction* instruction) const;
				441	bool IsSchedulable(const HBasicBlock* block) const;
				442
				443	void CalculateLatency(SchedulingNode* node) {
				444	latency_visitor_->CalculateLatency(node);
				445	node->SetLatency(latency_visitor_->GetLastVisitedLatency());
				446	node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
				447	}
				448
				449	ArenaAllocator* const arena_;
				450	SchedulingLatencyVisitor* const latency_visitor_;
				451	SchedulingNodeSelector* const selector_;
				452	bool only_optimize_loop_blocks_;
				453
				454	// We instantiate the members below as part of this class to avoid
				455	// instantiating them locally for every chunk scheduled.
				456	SchedulingGraph scheduling_graph_;
				457	// A pointer indicating where the next instruction to be scheduled will be inserted.
				458	HInstruction* cursor_;
				459	// The list of candidates for scheduling. A node becomes a candidate when all
				460	// its predecessors have been scheduled.
				461	ArenaVector<SchedulingNode*> candidates_;
				462
				463	private:
				464	DISALLOW_COPY_AND_ASSIGN(HScheduler);
				465	};
				466
				467	inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
				468	return scheduler_->IsSchedulingBarrier(instruction);
				469	}
				470
				471	class HInstructionScheduling : public HOptimization {
				472	public:
xueliang.zhong	f7caf68	2017-03-01 16:07:02 +0000	[diff] [blame^]	473	HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr)
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	474	: HOptimization(graph, kInstructionScheduling),
xueliang.zhong	f7caf68	2017-03-01 16:07:02 +0000	[diff] [blame^]	475	codegen_(cg),
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	476	instruction_set_(instruction_set) {}
				477
				478	void Run() {
				479	Run(/only_optimize_loop_blocks/ true, /schedule_randomly/ false);
				480	}
				481	void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
				482
				483	static constexpr const char* kInstructionScheduling = "scheduler";
				484
xueliang.zhong	f7caf68	2017-03-01 16:07:02 +0000	[diff] [blame^]	485	CodeGenerator* const codegen_;
Alexandre Rames	22aa54b	2016-10-18 09:32:29 +0100	[diff] [blame]	486	const InstructionSet instruction_set_;
				487
				488	private:
				489	DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
				490	};
				491
				492	} // namespace art
				493
				494	#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_H_