blkio: Add io controller stats

Add the following io controller stats:

- io_service_time
- io_wait_time
- io_serviced
- io_service_bytes

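These stats are accumulated per operation type, which lets us distinguish
between read and write, and between sync and async IO. Writing to the time,
sectors or any of the new stat files resets all stats for every group in the
cgroup. This patch does not increment any of these stats.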

Signed-off-by: Divyesh Shah <dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5be3981..ad6843f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,12 +55,15 @@
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-						unsigned long time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
 {
-	blkg->time += time;
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkg->stats.time += time;
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 			struct blkio_group *blkg, void *key, dev_t dev)
@@ -170,13 +173,121 @@
 	return 0;
 }
 
-#define SHOW_FUNCTION_PER_GROUP(__VAR)					\
+/* Write handler: zero all stats of every group in this cgroup (val unused). */
+static int
+blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct blkio_group *grp;
+	struct hlist_node *pos;
+
+	spin_lock_irq(&blkcg->lock);
+	hlist_for_each_entry(grp, pos, &blkcg->blkg_list, blkcg_node) {
+		/* irqs are already off here, so no _irq variant is used */
+		spin_lock(&grp->stats_lock);
+		memset(&grp->stats, 0, sizeof(grp->stats));
+		spin_unlock(&grp->stats_lock);
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	return 0;
+}
+
+/* Build key "<disk_id> <op>" in str; chars_left is the total size of str. */
+void get_key_name(int type, char *disk_id, char *str, int chars_left)
+{
+	/* strlcpy()/strlcat() take str's full size; ret >= size = truncated */
+	if (strlcpy(str, disk_id, chars_left) >= (size_t)chars_left) {
+		printk(KERN_WARNING
+			"Possibly incorrect cgroup stat display format\n");
+		return;
+	}
+	switch (type) {
+	case IO_READ:
+		strlcat(str, " Read", chars_left);
+		break;
+	case IO_WRITE:
+		strlcat(str, " Write", chars_left);
+		break;
+	case IO_SYNC:
+		strlcat(str, " Sync", chars_left);
+		break;
+	case IO_ASYNC:
+		strlcat(str, " Async", chars_left);
+		break;
+	case IO_TYPE_MAX:
+		strlcat(str, " Total", chars_left);
+		break;
+	default:
+		strlcat(str, " Invalid", chars_left);
+	}
+}
+
+typedef uint64_t (get_var) (struct blkio_group *, int);
+
+#define MAX_KEY_LEN 100
+uint64_t get_typed_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb,
+		get_var *getvar, char *disk_id)
+{
+	uint64_t disk_total;
+	char key_str[MAX_KEY_LEN];
+	int type;
+
+	for (type = 0; type < IO_TYPE_MAX; type++) {
+		get_key_name(type, disk_id, key_str, MAX_KEY_LEN);
+		cb->fill(cb, key_str, getvar(blkg, type));
+	}
+	disk_total = getvar(blkg, IO_READ) + getvar(blkg, IO_WRITE);
+	get_key_name(IO_TYPE_MAX, disk_id, key_str, MAX_KEY_LEN);
+	cb->fill(cb, key_str, disk_total);
+	return disk_total;
+}
+
+uint64_t get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb,
+		get_var *getvar, char *disk_id)
+{
+	uint64_t var = getvar(blkg, 0);
+	cb->fill(cb, disk_id, var);
+	return var;
+}
+
+#define GET_STAT_INDEXED(__VAR)						\
+uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int type)		\
+{									\
+	return blkg->stats.__VAR[type];					\
+}									\
+
+GET_STAT_INDEXED(io_service_bytes);
+GET_STAT_INDEXED(io_serviced);
+GET_STAT_INDEXED(io_service_time);
+GET_STAT_INDEXED(io_wait_time);
+#undef GET_STAT_INDEXED
+
+#define GET_STAT(__VAR, __CONV)						\
+uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int dummy)	\
+{									\
+	uint64_t data = blkg->stats.__VAR;				\
+	if (__CONV)							\
+		data = (uint64_t)jiffies_to_msecs(data) * NSEC_PER_MSEC;\
+	return data;							\
+}
+
+GET_STAT(time, 1);
+GET_STAT(sectors, 0);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+GET_STAT(dequeue, 0);
+#endif
+#undef GET_STAT
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR, get_stats, getvar, show_total)	\
 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
-			struct cftype *cftype, struct seq_file *m)	\
+		struct cftype *cftype, struct cgroup_map_cb *cb)	\
 {									\
 	struct blkio_cgroup *blkcg;					\
 	struct blkio_group *blkg;					\
 	struct hlist_node *n;						\
+	uint64_t cgroup_total = 0;					\
+	char disk_id[24]; /* "%u:%u": up to 10+1+10 chars + NUL */	\
 									\
 	if (!cgroup_lock_live_group(cgroup))				\
 		return -ENODEV;						\
@@ -184,19 +295,32 @@
 	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
 	rcu_read_lock();						\
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
-		if (blkg->dev)						\
-			seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev),	\
-				 MINOR(blkg->dev), blkg->__VAR);	\
+		if (blkg->dev) {					\
+			spin_lock_irq(&blkg->stats_lock);		\
+			/* sizeof keeps the bound tied to disk_id[] */	\
+			snprintf(disk_id, sizeof(disk_id), "%u:%u",	\
+				MAJOR(blkg->dev), MINOR(blkg->dev));	\
+			cgroup_total += get_stats(blkg, cb, getvar, disk_id); \
+			spin_unlock_irq(&blkg->stats_lock);		\
+		}							\
 	}								\
+	if (show_total)							\
+		cb->fill(cb, "Total", cgroup_total);			\
 	rcu_read_unlock();						\
 	cgroup_unlock();						\
 	return 0;							\
 }
 
-SHOW_FUNCTION_PER_GROUP(time);
-SHOW_FUNCTION_PER_GROUP(sectors);
+SHOW_FUNCTION_PER_GROUP(time, get_stat, get_time_stat, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, get_stat, get_sectors_stat, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, get_typed_stat,
+			get_io_service_bytes_stat, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, get_typed_stat, get_io_serviced_stat, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, get_typed_stat,
+			get_io_service_time_stat, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, get_typed_stat, get_io_wait_time_stat, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue);
+SHOW_FUNCTION_PER_GROUP(dequeue, get_stat, get_dequeue_stat, 0);
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
 
@@ -204,7 +328,7 @@
 void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
 			unsigned long dequeue)
 {
-	blkg->dequeue += dequeue;
+	blkg->stats.dequeue += dequeue;
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
 #endif
@@ -217,16 +341,38 @@
 	},
 	{
 		.name = "time",
-		.read_seq_string = blkiocg_time_read,
+		.read_map = blkiocg_time_read,
+		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "sectors",
-		.read_seq_string = blkiocg_sectors_read,
+		.read_map = blkiocg_sectors_read,
+		.write_u64 = blkiocg_reset_write,
+	},
+	{
+		.name = "io_service_bytes",
+		.read_map = blkiocg_io_service_bytes_read,
+		.write_u64 = blkiocg_reset_write,
+	},
+	{
+		.name = "io_serviced",
+		.read_map = blkiocg_io_serviced_read,
+		.write_u64 = blkiocg_reset_write,
+	},
+	{
+		.name = "io_service_time",
+		.read_map = blkiocg_io_service_time_read,
+		.write_u64 = blkiocg_reset_write,
+	},
+	{
+		.name = "io_wait_time",
+		.read_map = blkiocg_io_wait_time_read,
+		.write_u64 = blkiocg_reset_write,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
        {
 		.name = "dequeue",
-		.read_seq_string = blkiocg_dequeue_read,
+		.read_map = blkiocg_dequeue_read,
        },
 #endif
 };