blkcg: move blkio_group_stats_cpu and friends to blk-throttle.c

blkio_group_stats_cpu is used only by blk-throtl and has no reason to
be defined in blkcg core.

* Move blkio_group_stats_cpu to blk-throttle.c and rename it to
  tg_stats_cpu.

* blkg_policy_data->stats_cpu is replaced with throtl_grp->stats_cpu.
  prfill functions updated accordingly.

* All related macros / functions are renamed so that they have tg_
  prefix and the unnecessary @pol arguments are dropped.

* Per-cpu stats allocation code is also moved from blk-cgroup.c to
  blk-throttle.c and gets simplified to only deal with
  BLKIO_POLICY_THROTL.  percpu stat free is performed by the exit
  method throtl_exit_blkio_group().

* throtl_reset_group_stats() implemented for
  blkio_reset_group_stats_fn method so that tg->stats_cpu can be
  reset.

Signed-off-by: Tejun Heo <tj@kernel.org>
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cb259bc4..27f7960 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -40,6 +40,14 @@
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
+/* Per-cpu group stats */
+struct tg_stats_cpu {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+};
+
 struct throtl_grp {
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
@@ -76,6 +84,12 @@
 
 	/* Some throttle limits got updated for the group */
 	int limits_changed;
+
+	/* Per cpu stats pointer */
+	struct tg_stats_cpu __percpu *stats_cpu;
+
+	/* List of tgs waiting for per cpu stats memory to be allocated */
+	struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -100,6 +114,13 @@
 	int limits_changed;
 };
 
+/* list and work item to allocate percpu group stats */
+static DEFINE_SPINLOCK(tg_stats_alloc_lock);
+static LIST_HEAD(tg_stats_alloc_list);
+
+static void tg_stats_alloc_fn(struct work_struct *);
+static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
+
 static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 {
 	return blkg_to_pdata(blkg, &blkio_policy_throtl);
@@ -142,6 +163,44 @@
 	return td->nr_queued[0] + td->nr_queued[1];
 }
 
+/*
+ * Worker for allocating per cpu stat for tgs. This is scheduled on the
+ * system_nrt_wq once there are some groups on the alloc_list waiting for
+ * allocation.
+ */
+static void tg_stats_alloc_fn(struct work_struct *work)
+{
+	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
+	struct delayed_work *dwork = to_delayed_work(work);
+	bool empty = false;
+
+alloc_stats:
+	if (!stats_cpu) {
+		stats_cpu = alloc_percpu(struct tg_stats_cpu);
+		if (!stats_cpu) {
+			/* allocation failed, try again after some time */
+			queue_delayed_work(system_nrt_wq, dwork,
+					   msecs_to_jiffies(10));
+			return;
+		}
+	}
+
+	spin_lock_irq(&tg_stats_alloc_lock);
+
+	if (!list_empty(&tg_stats_alloc_list)) {
+		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
+							 struct throtl_grp,
+							 stats_alloc_node);
+		swap(tg->stats_cpu, stats_cpu);
+		list_del_init(&tg->stats_alloc_node);
+	}
+
+	empty = list_empty(&tg_stats_alloc_list);
+	spin_unlock_irq(&tg_stats_alloc_lock);
+	if (!empty)
+		goto alloc_stats;
+}
+
 static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -155,6 +214,43 @@
 	tg->bps[WRITE] = -1;
 	tg->iops[READ] = -1;
 	tg->iops[WRITE] = -1;
+
+	/*
+	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
+	 * but percpu allocator can't be called from IO path.  Queue tg on
+	 * tg_stats_alloc_list and allocate from work item.
+	 */
+	spin_lock(&tg_stats_alloc_lock);
+	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
+	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
+	spin_unlock(&tg_stats_alloc_lock);
+}
+
+static void throtl_exit_blkio_group(struct blkio_group *blkg)
+{
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+
+	spin_lock(&tg_stats_alloc_lock);
+	list_del_init(&tg->stats_alloc_node);
+	spin_unlock(&tg_stats_alloc_lock);
+
+	free_percpu(tg->stats_cpu);
+}
+
+static void throtl_reset_group_stats(struct blkio_group *blkg)
+{
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	int cpu;
+
+	if (tg->stats_cpu == NULL)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
+
+		blkg_rwstat_reset(&sc->service_bytes);
+		blkg_rwstat_reset(&sc->serviced);
+	}
 }
 
 static struct
@@ -565,12 +661,12 @@
 static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
 					 int rw)
 {
-	struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_THROTL];
-	struct blkio_group_stats_cpu *stats_cpu;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct tg_stats_cpu *stats_cpu;
 	unsigned long flags;
 
 	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (pd->stats_cpu == NULL)
+	if (tg->stats_cpu == NULL)
 		return;
 
 	/*
@@ -580,7 +676,7 @@
 	 */
 	local_irq_save(flags);
 
-	stats_cpu = this_cpu_ptr(pd->stats_cpu);
+	stats_cpu = this_cpu_ptr(tg->stats_cpu);
 
 	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
 	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
@@ -842,15 +938,15 @@
 	throtl_schedule_delayed_work(td, 0);
 }
 
-static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
-				  struct blkg_policy_data *pd, int off)
+static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
 {
+	struct throtl_grp *tg = (void *)pd->pdata;
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct blkio_group_stats_cpu *sc =
-			per_cpu_ptr(pd->stats_cpu, cpu);
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
 
 		tmp = blkg_rwstat_read((void *)sc + off);
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
@@ -861,12 +957,12 @@
 }
 
 /* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
-static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-				  struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			       struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat,
 			  BLKCG_STAT_POL(cft->private),
 			  BLKCG_STAT_OFF(cft->private), true);
 	return 0;
@@ -1012,14 +1108,14 @@
 	{
 		.name = "throttle.io_service_bytes",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, service_bytes)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct tg_stats_cpu, service_bytes)),
+		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
 		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
-				offsetof(struct blkio_group_stats_cpu, serviced)),
-		.read_seq_string = blkcg_print_cpu_rwstat,
+				offsetof(struct tg_stats_cpu, serviced)),
+		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{ }	/* terminate */
 };
@@ -1034,6 +1130,8 @@
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_init_group_fn = throtl_init_blkio_group,
+		.blkio_exit_group_fn = throtl_exit_blkio_group,
+		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
 	},
 	.plid = BLKIO_POLICY_THROTL,
 	.pdata_size = sizeof(struct throtl_grp),