/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
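/*
 * Illustrative example: BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 * BLKIO_THROTL_read_bps_device) packs the policy id into bits 16-31 and the
 * attribute into bits 0-15; BLKIOFILE_POLICY() and BLKIOFILE_ATTR() recover
 * the two halves when the file operation runs.
 */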

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);

static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
					  int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   unsigned int iops, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
				   unsigned long unaccounted_time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg->stats.unaccounted_time += unaccounted_time;
#endif
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
				direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	if (blkg) {
		free_percpu(blkg->stats_cpu);
		kfree(blkg->pd);
		kfree(blkg);
	}
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @pol: policy the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q for @pol.
 *
 * FIXME: Should be called with queue locked but currently isn't due to
 *        percpu stat breakage.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q,
				      struct blkio_policy_type *pol)
{
	struct blkio_group *blkg;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->q, q);
	blkg->blkcg = blkcg;
	blkg->plid = pol->plid;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	/* alloc per-policy data */
	blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
				q->node);
	if (!blkg->pd) {
		blkg_free(blkg);
		return NULL;
	}

	/* broken, read comment in the callsite */
	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	if (!blkg->stats_cpu) {
		blkg_free(blkg);
		return NULL;
	}

	/* attach pd to blkg and invoke per-policy init */
	blkg->pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
	return blkg;
}

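/*
 * blkg_lookup_create - look up the blkg of @blkcg on @q for policy @plid,
 * allocating and linking a new one if it doesn't exist yet.  Called with
 * the queue lock held and under rcu_read_lock(); both are dropped and
 * reacquired around the allocation (see the FIXME below).  Returns an
 * ERR_PTR() for bypassing or dead queues and on allocation failure.
 */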
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg, *new_blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q, plid);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 *
	 * FIXME: The following is broken. Percpu memory allocation
	 * requires %GFP_KERNEL context and can't be performed from IO
	 * path. Allocation here should inherently be atomic and the
	 * following lock dancing can be removed once the broken percpu
	 * allocation is fixed.
	 */
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	new_blkg = blkg_alloc(blkcg, q, pol);

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	/* did bypass get turned on in between? */
	if (unlikely(blk_queue_bypass(q)) && !for_root) {
		blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
		goto out;
	}

	/* did someone beat us to it? */
	blkg = blkg_lookup(blkcg, q, plid);
	if (unlikely(blkg))
		goto out;

	/* did alloc fail? */
	if (unlikely(!new_blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	swap(blkg, new_blkg);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	pol->ops.blkio_link_group_fn(q, blkg);
	spin_unlock(&blkcg->lock);
out:
	blkg_free(new_blkg);
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
}

/*
 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
 * indicating that blk_group was unhashed by the time we got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg = blkg->blkcg;
	unsigned long flags;
	int ret = 1;

	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q,
				enum blkio_policy_id plid)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q && blkg->plid == plid)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

void blkg_destroy_all(struct request_queue *q)
{
	struct blkio_policy_type *pol;

	while (true) {
		bool done = true;

		spin_lock(&blkio_list_lock);
		spin_lock_irq(q->queue_lock);

		/*
		 * clear_queue_fn() might return with non-empty group list
		 * if it raced cgroup removal and lost. cgroup removal is
		 * guaranteed to make forward progress and retrying after a
		 * while is enough. This ugliness is scheduled to be
		 * removed after locking update.
		 */
		list_for_each_entry(pol, &blkio_list, list)
			if (!pol->ops.blkio_clear_queue_fn(q))
				done = false;

		spin_unlock_irq(q->queue_lock);
		spin_unlock(&blkio_list_lock);

		if (done)
			break;

		msleep(10);	/* just some random duration I like */
	}
}

static void blkio_reset_stats_cpu(struct blkio_group *blkg)
{
	struct blkio_group_stats_cpu *stats_cpu;
	int i, j, k;
	/*
	 * Note: On 64 bit arch this should not be an issue. This has the
	 * possibility of returning some inconsistent value on 32bit arch
	 * as 64bit update on 32bit is non atomic. Taking care of this
	 * corner case makes code very complicated, like sending IPIs to
	 * cpus, taking care of stats of offline cpus etc.
	 *
	 * reset stats is anyway more of a debug feature and this sounds a
	 * corner case. So I am not complicating the code yet until and
	 * unless this becomes a real issue.
	 */
	for_each_possible_cpu(i) {
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
		stats_cpu->sectors = 0;
		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
				stats_cpu->stat_arr_cpu[j][k] = 0;
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);

		/* Reset Per cpu stats which don't take blkg->stats_lock */
		blkio_reset_stats_cpu(blkg);
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}
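
/*
 * For example (illustrative values): with dname "8:16", the keys built above
 * are "8:16 Read", "8:16 Write", "8:16 Sync", "8:16 Async" and "8:16 Total",
 * or just "8:16" when diskname_only is true.
 */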

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, const char *dname)
{
	blkio_get_key_name(0, dname, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}


static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}
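
/*
 * blkio_read_stat_cpu() relies on the u64_stats_fetch_begin()/_retry() pair
 * to reread a cpu's counters when an update races with the read; that is
 * what keeps the summed 64-bit values consistent on 32-bit machines, where
 * 64-bit loads are not atomic.
 */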

static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, type, 0);
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
				       dname);
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
		blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dname);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.unaccounted_time, cb, dname);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       sum, cb, dname);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dname);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dname);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dname);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dname);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

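/*
 * Parse a per-device rule of the form "<major>:<minor> <value>", e.g.
 * "8:16 1048576" (illustrative), look up or create the blkg for that device
 * and apply the value for the given policy and file id.
 */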
static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent from inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		blkg->conf.weight = temp;
		blkio_update_group_weight(blkg, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			blkg->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			blkg->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			blkg->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			blkg->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry. Do so after a short
	 * msleep(). It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	const char *dname = blkg_dev_name(blkg);
	int fileid = BLKIOFILE_ATTR(cft->private);
	int rw = WRITE;

	if (!dname)
		return;

	switch (blkg->plid) {
	case BLKIO_POLICY_PROP:
		if (blkg->conf.weight)
			seq_printf(m, "%s\t%u\n",
				   dname, blkg->conf.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			rw = READ;
		case BLKIO_THROTL_write_bps_device:
			if (blkg->conf.bps[rw])
				seq_printf(m, "%s\t%llu\n",
					   dname, blkg->conf.bps[rw]);
			break;
		case BLKIO_THROTL_read_iops_device:
			rw = READ;
		case BLKIO_THROTL_write_iops_device:
			if (blkg->conf.iops[rw])
				seq_printf(m, "%s\t%u\n",
					   dname, blkg->conf.iops[rw]);
			break;
		}
		break;
	default:
		BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
			blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);

		if (!dname || BLKIOFILE_POLICY(cft->private) != blkg->plid)
			continue;
		if (pcpu)
			cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
							   type);
		else {
			spin_lock_irq(&blkg->stats_lock);
			cgroup_total += blkio_get_stat(blkg, cb, dname, type);
			spin_unlock_irq(&blkg->stats_lock);
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	return 0;
}

/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_MERGED, 1, 1);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->plid == plid && !blkg->conf.weight)
			blkio_update_group_weight(blkg, blkcg->weight);

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1414 BLKIO_PROP_empty_time),
1415 .read_map = blkiocg_file_read_map,
Divyesh Shah812df482010-04-08 21:15:35 -07001416 },
1417 {
Vivek Goyal22084192009-12-03 12:59:49 -05001418 .name = "dequeue",
Vivek Goyal062a6442010-09-15 17:06:33 -04001419 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1420 BLKIO_PROP_dequeue),
1421 .read_map = blkiocg_file_read_map,
Divyesh Shahcdc11842010-04-08 21:15:10 -07001422 },
Justin TerAvest9026e522011-03-22 21:26:54 +01001423 {
1424 .name = "unaccounted_time",
1425 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1426 BLKIO_PROP_unaccounted_time),
1427 .read_map = blkiocg_file_read_map,
1428 },
Vivek Goyal22084192009-12-03 12:59:49 -05001429#endif
Vivek Goyal31e4c282009-12-03 12:59:42 -05001430};
1431
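/*
 * The table above defines the control files that appear in each blkio
 * cgroup directory.  For example, assuming the blkio controller is mounted
 * at /sys/fs/cgroup/blkio (the mount point, group name and 8:16 device
 * number are illustrative only):
 *
 *	# cat /sys/fs/cgroup/blkio/grp1/blkio.weight
 *	# echo 300 > /sys/fs/cgroup/blkio/grp1/blkio.weight
 *	# echo "8:16 1048576" > \
 *		/sys/fs/cgroup/blkio/grp1/blkio.throttle.read_bps_device
 *	# cat /sys/fs/cgroup/blkio/grp1/blkio.io_serviced
 *
 * "weight" is handled by the read_u64/write_u64 callbacks above, while the
 * per-device files use the string read/write callbacks and take a
 * "major:minor value" format.
 */
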
1432static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1433{
1434 return cgroup_add_files(cgroup, subsys, blkio_files,
1435 ARRAY_SIZE(blkio_files));
1436}
1437
Tejun Heo7ee9c562012-03-05 13:15:11 -08001438static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
1439 struct cgroup *cgroup)
Vivek Goyal31e4c282009-12-03 12:59:42 -05001440{
1441 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
Vivek Goyalb1c35762009-12-03 12:59:47 -05001442 unsigned long flags;
1443 struct blkio_group *blkg;
Tejun Heoca32aef2012-03-05 13:15:03 -08001444 struct request_queue *q;
Vivek Goyal3e252062009-12-04 10:36:42 -05001445 struct blkio_policy_type *blkiop;
Vivek Goyal31e4c282009-12-03 12:59:42 -05001446
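	/*
	 * Unlink groups one at a time: pick the first group under
	 * blkcg->lock, unlink it, drop the lock, and only then notify the
	 * owning policy.  blkg->q is fetched under rcu_read_lock() since
	 * the group's queue association is RCU-protected.
	 */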
Vivek Goyalb1c35762009-12-03 12:59:47 -05001447 rcu_read_lock();
Tejun Heo7ee9c562012-03-05 13:15:11 -08001448
Jens Axboe0f3942a2010-05-03 14:28:55 +02001449 do {
1450 spin_lock_irqsave(&blkcg->lock, flags);
Vivek Goyalb1c35762009-12-03 12:59:47 -05001451
Jens Axboe0f3942a2010-05-03 14:28:55 +02001452 if (hlist_empty(&blkcg->blkg_list)) {
1453 spin_unlock_irqrestore(&blkcg->lock, flags);
1454 break;
1455 }
1456
1457 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1458 blkcg_node);
Tejun Heoca32aef2012-03-05 13:15:03 -08001459 q = rcu_dereference(blkg->q);
Jens Axboe0f3942a2010-05-03 14:28:55 +02001460 __blkiocg_del_blkio_group(blkg);
1461
Vivek Goyalb1c35762009-12-03 12:59:47 -05001462 spin_unlock_irqrestore(&blkcg->lock, flags);
Vivek Goyalb1c35762009-12-03 12:59:47 -05001463
Jens Axboe0f3942a2010-05-03 14:28:55 +02001464 /*
 1465		 * This blkio_group is being unlinked as the associated cgroup is
1466 * going away. Let all the IO controlling policies know about
Vivek Goyal61014e92010-10-01 14:49:44 +02001467 * this event.
Jens Axboe0f3942a2010-05-03 14:28:55 +02001468 */
1469 spin_lock(&blkio_list_lock);
Vivek Goyal61014e92010-10-01 14:49:44 +02001470 list_for_each_entry(blkiop, &blkio_list, list) {
1471 if (blkiop->plid != blkg->plid)
1472 continue;
Tejun Heoca32aef2012-03-05 13:15:03 -08001473 blkiop->ops.blkio_unlink_group_fn(q, blkg);
Vivek Goyal61014e92010-10-01 14:49:44 +02001474 }
Jens Axboe0f3942a2010-05-03 14:28:55 +02001475 spin_unlock(&blkio_list_lock);
1476 } while (1);
Vivek Goyalb1c35762009-12-03 12:59:47 -05001477
Vivek Goyalb1c35762009-12-03 12:59:47 -05001478 rcu_read_unlock();
Tejun Heo7ee9c562012-03-05 13:15:11 -08001479
1480 return 0;
1481}
1482
1483static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1484{
1485 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1486
Ben Blum67523c42010-03-10 15:22:11 -08001487 if (blkcg != &blkio_root_cgroup)
1488 kfree(blkcg);
Vivek Goyal31e4c282009-12-03 12:59:42 -05001489}
1490
1491static struct cgroup_subsys_state *
1492blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1493{
Li Zefan03415092010-05-07 08:57:00 +02001494 struct blkio_cgroup *blkcg;
1495 struct cgroup *parent = cgroup->parent;
Vivek Goyal31e4c282009-12-03 12:59:42 -05001496
Li Zefan03415092010-05-07 08:57:00 +02001497 if (!parent) {
Vivek Goyal31e4c282009-12-03 12:59:42 -05001498 blkcg = &blkio_root_cgroup;
1499 goto done;
1500 }
1501
Vivek Goyal31e4c282009-12-03 12:59:42 -05001502 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1503 if (!blkcg)
1504 return ERR_PTR(-ENOMEM);
1505
1506 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1507done:
1508 spin_lock_init(&blkcg->lock);
1509 INIT_HLIST_HEAD(&blkcg->blkg_list);
1510
1511 return &blkcg->css;
1512}
1513
Tejun Heo5efd6112012-03-05 13:15:12 -08001514/**
1515 * blkcg_init_queue - initialize blkcg part of request queue
1516 * @q: request_queue to initialize
1517 *
 1518 * Called from blk_alloc_queue_node(). Responsible for initializing the
 1519 * blkcg part of the new request_queue @q.
1520 *
1521 * RETURNS:
1522 * 0 on success, -errno on failure.
1523 */
1524int blkcg_init_queue(struct request_queue *q)
1525{
Tejun Heo923adde2012-03-05 13:15:13 -08001526 int ret;
1527
Tejun Heo5efd6112012-03-05 13:15:12 -08001528 might_sleep();
1529
Tejun Heo923adde2012-03-05 13:15:13 -08001530 ret = blk_throtl_init(q);
1531 if (ret)
1532 return ret;
1533
1534 mutex_lock(&all_q_mutex);
1535 INIT_LIST_HEAD(&q->all_q_node);
1536 list_add_tail(&q->all_q_node, &all_q_list);
1537 mutex_unlock(&all_q_mutex);
1538
1539 return 0;
Tejun Heo5efd6112012-03-05 13:15:12 -08001540}
1541
1542/**
1543 * blkcg_drain_queue - drain blkcg part of request_queue
1544 * @q: request_queue to drain
1545 *
 1546 * Called from blk_drain_queue(). Responsible for draining the blkcg part.
1547 */
1548void blkcg_drain_queue(struct request_queue *q)
1549{
1550 lockdep_assert_held(q->queue_lock);
1551
1552 blk_throtl_drain(q);
1553}
1554
1555/**
1556 * blkcg_exit_queue - exit and release blkcg part of request_queue
1557 * @q: request_queue being released
1558 *
 1559 * Called from blk_release_queue(). Responsible for exiting the blkcg part.
1560 */
1561void blkcg_exit_queue(struct request_queue *q)
1562{
Tejun Heo923adde2012-03-05 13:15:13 -08001563 mutex_lock(&all_q_mutex);
1564 list_del_init(&q->all_q_node);
1565 mutex_unlock(&all_q_mutex);
1566
Tejun Heo5efd6112012-03-05 13:15:12 -08001567 blk_throtl_exit(q);
1568}
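
/*
 * Taken together, the three hooks above tie blkcg state to the life of a
 * request_queue.  A rough sketch of the ordering, following the kernel-doc
 * comments (the callers shown are the ones named there):
 *
 *	blk_alloc_queue_node()
 *		blkcg_init_queue(q)	-- blk_throtl_init() + add to all_q_list
 *	...queue is in use; policies may attach groups...
 *	blk_drain_queue()
 *		blkcg_drain_queue(q)	-- blk_throtl_drain() under queue_lock
 *	blk_release_queue()
 *		blkcg_exit_queue(q)	-- drop from all_q_list, blk_throtl_exit()
 */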
1569
Vivek Goyal31e4c282009-12-03 12:59:42 -05001570/*
 1571 * We cannot support shared io contexts, as we have no means to support
1572 * two tasks with the same ioc in two different groups without major rework
1573 * of the main cic data structures. For now we allow a task to change
1574 * its cgroup only if it's the only owner of its ioc.
1575 */
Tejun Heobb9d97b2011-12-12 18:12:21 -08001576static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1577 struct cgroup_taskset *tset)
Vivek Goyal31e4c282009-12-03 12:59:42 -05001578{
Tejun Heobb9d97b2011-12-12 18:12:21 -08001579 struct task_struct *task;
Vivek Goyal31e4c282009-12-03 12:59:42 -05001580 struct io_context *ioc;
1581 int ret = 0;
1582
1583 /* task_lock() is needed to avoid races with exit_io_context() */
Tejun Heobb9d97b2011-12-12 18:12:21 -08001584 cgroup_taskset_for_each(task, cgrp, tset) {
1585 task_lock(task);
1586 ioc = task->io_context;
1587 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1588 ret = -EINVAL;
1589 task_unlock(task);
1590 if (ret)
1591 break;
1592 }
Vivek Goyal31e4c282009-12-03 12:59:42 -05001593 return ret;
1594}
1595
Tejun Heobb9d97b2011-12-12 18:12:21 -08001596static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1597 struct cgroup_taskset *tset)
Vivek Goyal31e4c282009-12-03 12:59:42 -05001598{
Tejun Heobb9d97b2011-12-12 18:12:21 -08001599 struct task_struct *task;
Vivek Goyal31e4c282009-12-03 12:59:42 -05001600 struct io_context *ioc;
1601
Tejun Heobb9d97b2011-12-12 18:12:21 -08001602 cgroup_taskset_for_each(task, cgrp, tset) {
Linus Torvaldsb3c9dd12012-01-15 12:24:45 -08001603 /* we don't lose anything even if ioc allocation fails */
1604 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1605 if (ioc) {
1606 ioc_cgroup_changed(ioc);
Tejun Heo11a31222012-02-07 07:51:30 +01001607 put_io_context(ioc);
Linus Torvaldsb3c9dd12012-01-15 12:24:45 -08001608 }
Tejun Heobb9d97b2011-12-12 18:12:21 -08001609 }
Vivek Goyal31e4c282009-12-03 12:59:42 -05001610}
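
/*
 * For instance, two threads created with CLONE_IO share a single
 * io_context, so moving just one of them into another blkio cgroup fails
 * the nr_tasks check above with -EINVAL.  A task that is the sole owner of
 * its ioc is moved, and ioc_cgroup_changed() flags the context so that the
 * I/O scheduler can re-associate it with the new group.
 */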
1611
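/*
 * Policy (un)registration must not race with normal request processing.
 * blkcg_bypass_start() therefore puts every known request_queue into
 * bypass mode and destroys its existing groups; blkcg_bypass_end() lifts
 * the bypass again once the blkio_list/blkio_policy[] update in the
 * register/unregister paths below is done.
 */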
Tejun Heo923adde2012-03-05 13:15:13 -08001612static void blkcg_bypass_start(void)
1613 __acquires(&all_q_mutex)
1614{
1615 struct request_queue *q;
1616
1617 mutex_lock(&all_q_mutex);
1618
1619 list_for_each_entry(q, &all_q_list, all_q_node) {
1620 blk_queue_bypass_start(q);
1621 blkg_destroy_all(q);
1622 }
1623}
1624
1625static void blkcg_bypass_end(void)
1626 __releases(&all_q_mutex)
1627{
1628 struct request_queue *q;
1629
1630 list_for_each_entry(q, &all_q_list, all_q_node)
1631 blk_queue_bypass_end(q);
1632
1633 mutex_unlock(&all_q_mutex);
1634}
1635
Vivek Goyal3e252062009-12-04 10:36:42 -05001636void blkio_policy_register(struct blkio_policy_type *blkiop)
1637{
Tejun Heo923adde2012-03-05 13:15:13 -08001638 blkcg_bypass_start();
Vivek Goyal3e252062009-12-04 10:36:42 -05001639 spin_lock(&blkio_list_lock);
Tejun Heo035d10b2012-03-05 13:15:04 -08001640
1641 BUG_ON(blkio_policy[blkiop->plid]);
1642 blkio_policy[blkiop->plid] = blkiop;
Vivek Goyal3e252062009-12-04 10:36:42 -05001643 list_add_tail(&blkiop->list, &blkio_list);
Tejun Heo035d10b2012-03-05 13:15:04 -08001644
Vivek Goyal3e252062009-12-04 10:36:42 -05001645 spin_unlock(&blkio_list_lock);
Tejun Heo923adde2012-03-05 13:15:13 -08001646 blkcg_bypass_end();
Vivek Goyal3e252062009-12-04 10:36:42 -05001647}
1648EXPORT_SYMBOL_GPL(blkio_policy_register);
1649
1650void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1651{
Tejun Heo923adde2012-03-05 13:15:13 -08001652 blkcg_bypass_start();
Vivek Goyal3e252062009-12-04 10:36:42 -05001653 spin_lock(&blkio_list_lock);
Tejun Heo035d10b2012-03-05 13:15:04 -08001654
1655 BUG_ON(blkio_policy[blkiop->plid] != blkiop);
1656 blkio_policy[blkiop->plid] = NULL;
Vivek Goyal3e252062009-12-04 10:36:42 -05001657 list_del_init(&blkiop->list);
Tejun Heo035d10b2012-03-05 13:15:04 -08001658
Vivek Goyal3e252062009-12-04 10:36:42 -05001659 spin_unlock(&blkio_list_lock);
Tejun Heo923adde2012-03-05 13:15:13 -08001660 blkcg_bypass_end();
Vivek Goyal3e252062009-12-04 10:36:42 -05001661}
1662EXPORT_SYMBOL_GPL(blkio_policy_unregister);
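
/*
 * A policy module uses the two calls above from its init/exit paths.  A
 * minimal sketch (the "foo" policy and its callback are hypothetical; only
 * the blkio_policy_type fields used in this file are shown):
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_unlink_group_fn	= foo_unlink_blkio_group,
 *		},
 *		.plid = BLKIO_POLICY_PROP,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		blkio_policy_register(&blkio_policy_foo);
 *		return 0;
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		blkio_policy_unregister(&blkio_policy_foo);
 *	}
 *
 *	module_init(foo_init);
 *	module_exit(foo_exit);
 */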