Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block layer fixes from Jens Axboe:
"A final set of fixes for 4.3.
It is (again) bigger than I would have liked, but it's all been
through the testing mill and has been carefully reviewed by multiple
parties. Each fix is either a regression fix for this cycle, or is
marked stable. You can scold me at KS. The pull request contains:
- Three simple fixes for NVMe, fixing regressions since 4.3. From
Arnd, Christoph, and Keith.
- A single xen-blkfront fix from Cathy, fixing a NULL dereference if
an error is returned through the staste change callback.
- Fixup for some bad/sloppy code in nbd that got introduced earlier
in this cycle. From Markus Pargmann.
- A blk-mq tagset use-after-free fix from Junichi.
- A backing device lifetime fix from Tejun, fixing a crash.
- And finally, a set of regression/stable fixes for cgroup writeback
from Tejun"
* 'for-linus' of git://git.kernel.dk/linux-block:
writeback: remove broken rbtree_postorder_for_each_entry_safe() usage in cgwb_bdi_destroy()
NVMe: Fix memory leak on retried commands
block: don't release bdi while request_queue has live references
nvme: use an integer value to Linux errno values
blk-mq: fix use-after-free in blk_mq_free_tag_set()
nvme: fix 32-bit build warning
writeback: fix incorrect calculation of available memory for memcg domains
writeback: memcg dirty_throttle_control should be initialized with wb->memcg_completions
writeback: bdi_writeback iteration must not skip dying ones
writeback: fix bdi_writeback iteration in wakeup_dirtytime_writeback()
writeback: laptop_mode_timer_fn() needs rcu_read_lock() around bdi_writeback iteration
nbd: Add locking for tasks
xen-blkfront: check for null drvdata in blkback_changed (XenbusStateClosing)
diff --git a/block/blk-core.c b/block/blk-core.c
index 2eb722d..18e92a6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -576,7 +576,7 @@
q->queue_lock = &q->__queue_lock;
spin_unlock_irq(lock);
- bdi_destroy(&q->backing_dev_info);
+ bdi_unregister(&q->backing_dev_info);
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ed96474..ec2d119 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -641,6 +641,7 @@
{
bt_free(&tags->bitmap_tags);
bt_free(&tags->breserved_tags);
+ free_cpumask_var(tags->cpumask);
kfree(tags);
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7785ae9..85f0143 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2296,10 +2296,8 @@
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- if (set->tags[i]) {
+ if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
- free_cpumask_var(set->tags[i]->cpumask);
- }
}
kfree(set->tags);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3e44a9d..07b42f5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -540,6 +540,7 @@
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
+ bdi_exit(&q->backing_dev_info);
blkcg_exit_queue(q);
if (q->elevator) {
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 293495a..1b87623 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -60,6 +60,7 @@
bool disconnect; /* a disconnect has been requested by user */
struct timer_list timeout_timer;
+ spinlock_t tasks_lock;
struct task_struct *task_recv;
struct task_struct *task_send;
@@ -140,21 +141,23 @@
static void nbd_xmit_timeout(unsigned long arg)
{
struct nbd_device *nbd = (struct nbd_device *)arg;
- struct task_struct *task;
+ unsigned long flags;
if (list_empty(&nbd->queue_head))
return;
nbd->disconnect = true;
- task = READ_ONCE(nbd->task_recv);
- if (task)
- force_sig(SIGKILL, task);
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
- task = READ_ONCE(nbd->task_send);
- if (task)
+ if (nbd->task_recv)
+ force_sig(SIGKILL, nbd->task_recv);
+
+ if (nbd->task_send)
force_sig(SIGKILL, nbd->task_send);
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
}
@@ -403,17 +406,24 @@
{
struct request *req;
int ret;
+ unsigned long flags;
BUG_ON(nbd->magic != NBD_MAGIC);
sk_set_memalloc(nbd->sock->sk);
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
nbd->task_recv = current;
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (ret) {
dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
nbd->task_recv = NULL;
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
return ret;
}
@@ -429,7 +439,9 @@
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
nbd->task_recv = NULL;
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
if (signal_pending(current)) {
siginfo_t info;
@@ -534,8 +546,11 @@
{
struct nbd_device *nbd = data;
struct request *req;
+ unsigned long flags;
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
nbd->task_send = current;
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
set_user_nice(current, MIN_NICE);
while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
@@ -572,7 +587,15 @@
nbd_handle_req(nbd, req);
}
+ spin_lock_irqsave(&nbd->tasks_lock, flags);
nbd->task_send = NULL;
+ spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
+ /* Clear maybe pending signals */
+ if (signal_pending(current)) {
+ siginfo_t info;
+ dequeue_signal_lock(current, ¤t->blocked, &info);
+ }
return 0;
}
@@ -1052,6 +1075,7 @@
nbd_dev[i].magic = NBD_MAGIC;
INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
spin_lock_init(&nbd_dev[i].queue_lock);
+ spin_lock_init(&nbd_dev[i].tasks_lock);
INIT_LIST_HEAD(&nbd_dev[i].queue_head);
mutex_init(&nbd_dev[i].tx_lock);
init_timer(&nbd_dev[i].timeout_timer);
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 6f04771..ccc0c1f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -603,27 +603,31 @@
struct nvme_iod *iod = ctx;
struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ bool requeue = false;
+ int error = 0;
if (unlikely(status)) {
if (!(status & NVME_SC_DNR || blk_noretry_request(req))
&& (jiffies - req->start_time) < req->timeout) {
unsigned long flags;
+ requeue = true;
blk_mq_requeue_request(req);
spin_lock_irqsave(req->q->queue_lock, flags);
if (!blk_queue_stopped(req->q))
blk_mq_kick_requeue_list(req->q);
spin_unlock_irqrestore(req->q->queue_lock, flags);
- return;
+ goto release_iod;
}
if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
if (cmd_rq->ctx == CMD_CTX_CANCELLED)
- status = -EINTR;
+ error = -EINTR;
+ else
+ error = status;
} else {
- status = nvme_error_status(status);
+ error = nvme_error_status(status);
}
}
@@ -635,8 +639,9 @@
if (cmd_rq->aborted)
dev_warn(nvmeq->dev->dev,
"completing aborted command with status:%04x\n",
- status);
+ error);
+release_iod:
if (iod->nents) {
dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
@@ -649,7 +654,8 @@
}
nvme_free_iod(nvmeq->dev, iod);
- blk_mq_complete_request(req, status);
+ if (likely(!requeue))
+ blk_mq_complete_request(req, error);
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -1804,7 +1810,7 @@
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(unsigned long)io.metadata;
+ metadata = (void __user *)(uintptr_t)io.metadata;
write = io.opcode & 1;
if (ns->ext) {
@@ -1844,7 +1850,7 @@
c.rw.metadata = cpu_to_le64(meta_dma);
status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
- (void __user *)io.addr, length, NULL, 0);
+ (void __user *)(uintptr_t)io.addr, length, NULL, 0);
unmap:
if (meta) {
if (status == NVME_SC_SUCCESS && !write) {
@@ -1886,7 +1892,7 @@
timeout = msecs_to_jiffies(cmd.timeout_ms);
status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
- NULL, (void __user *)cmd.addr, cmd.data_len,
+ NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
&cmd.result, timeout);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6111708..a69c02d 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1956,7 +1956,8 @@
break;
/* Missed the backend's Closing state -- fallthrough */
case XenbusStateClosing:
- blkfront_closing(info);
+ if (info)
+ blkfront_closing(info);
break;
}
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 091a364..29e4599 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -778,19 +778,24 @@
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- int next_memcg_id = 0;
- struct bdi_writeback *wb;
- struct wb_iter iter;
+ struct bdi_writeback *last_wb = NULL;
+ struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+ struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
+ if (last_wb) {
+ wb_put(last_wb);
+ last_wb = NULL;
+ }
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@
wb_queue_work(wb, work);
- next_memcg_id = wb->memcg_css->id + 1;
+ /*
+ * Pin @wb so that it stays on @bdi->wb_list. This allows
+ * continuing iteration from @wb after dropping and
+ * regrabbing rcu read lock.
+ */
+ wb_get(wb);
+ last_wb = wb;
+
rcu_read_unlock();
wb_wait_for_completion(bdi, &fallback_work_done);
goto restart;
}
rcu_read_unlock();
+
+ if (last_wb)
+ wb_put(last_wb);
}
#else /* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
if (!bdi_has_dirty_io(bdi))
continue;
- bdi_for_each_wb(wb, bdi, &iter, 0)
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
false, reason);
}
@@ -1894,11 +1908,10 @@
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
- bdi_for_each_wb(wb, bdi, &iter, 0)
- if (!list_empty(&bdi->wb.b_dirty_time))
- wb_wakeup(&bdi->wb);
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+ if (!list_empty(&wb->b_dirty_time))
+ wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a23209b..1b4d69f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
+ struct list_head bdi_node; /* anchored at bdi->wb_list */
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@
atomic_long_t tot_write_bandwidth;
struct bdi_writeback wb; /* the root writeback info for this bdi */
+ struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d5eb4ad1..c85f749 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -19,13 +19,17 @@
#include <linux/slab.h>
int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_destroy(struct backing_dev_info *bdi);
+void bdi_exit(struct backing_dev_info *bdi);
__printf(3, 4)
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+void bdi_unregister(struct backing_dev_info *bdi);
+
int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
+void bdi_destroy(struct backing_dev_info *bdi);
+
void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
void wb_start_background_writeback(struct bdi_writeback *wb);
@@ -408,61 +412,6 @@
rcu_read_unlock();
}
-struct wb_iter {
- int start_memcg_id;
- struct radix_tree_iter tree_iter;
- void **slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
- struct backing_dev_info *bdi)
-{
- struct radix_tree_iter *titer = &iter->tree_iter;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (iter->start_memcg_id >= 0) {
- iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
- iter->start_memcg_id = -1;
- } else {
- iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
- }
-
- if (!iter->slot)
- iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
- if (iter->slot)
- return *iter->slot;
- return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
- struct backing_dev_info *bdi,
- int start_memcg_id)
-{
- iter->start_memcg_id = start_memcg_id;
-
- if (start_memcg_id)
- return __wb_iter_next(iter, bdi);
- else
- return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
- * to be used as temp storage during iteration. rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
- for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
- (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
#else /* CONFIG_CGROUP_WRITEBACK */
static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -522,14 +471,6 @@
{
}
-struct wb_iter {
- int next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
- for ((iter)->next_id = (start_blkcg_id); \
- ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
static inline int inode_congested(struct inode *inode, int cong_bits)
{
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6452ff4..3e3318dd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -676,8 +676,9 @@
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
- unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback);
#else /* CONFIG_CGROUP_WRITEBACK */
@@ -687,7 +688,8 @@
}
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
- unsigned long *pavail,
+ unsigned long *pfilepages,
+ unsigned long *pheadroom,
unsigned long *pdirty,
unsigned long *pwriteback)
{
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2df8ddc..619984f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -480,6 +480,10 @@
release_work);
struct backing_dev_info *bdi = wb->bdi;
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+
wb_shutdown(wb);
css_put(wb->memcg_css);
@@ -575,6 +579,7 @@
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
atomic_inc(&bdi->usage_cnt);
+ list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
css_get(memcg_css);
@@ -676,7 +681,7 @@
static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
- struct bdi_writeback_congested *congested, *congested_n;
+ struct rb_node *rbn;
void **slot;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
@@ -686,9 +691,11 @@
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- rbtree_postorder_for_each_entry_safe(congested, congested_n,
- &bdi->cgwb_congested_tree, rb_node) {
- rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
+ while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
+ struct bdi_writeback_congested *congested =
+ rb_entry(rbn, struct bdi_writeback_congested, rb_node);
+
+ rb_erase(rbn, &bdi->cgwb_congested_tree);
congested->bdi = NULL; /* mark @congested unlinked */
}
@@ -764,15 +771,22 @@
int bdi_init(struct backing_dev_info *bdi)
{
+ int ret;
+
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
- return cgwb_bdi_init(bdi);
+ ret = cgwb_bdi_init(bdi);
+
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+
+ return ret;
}
EXPORT_SYMBOL(bdi_init);
@@ -823,7 +837,7 @@
synchronize_rcu_expedited();
}
-void bdi_destroy(struct backing_dev_info *bdi)
+void bdi_unregister(struct backing_dev_info *bdi)
{
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
@@ -835,9 +849,19 @@
device_unregister(bdi->dev);
bdi->dev = NULL;
}
+}
+void bdi_exit(struct backing_dev_info *bdi)
+{
+ WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
}
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ bdi_unregister(bdi);
+ bdi_exit(bdi);
+}
EXPORT_SYMBOL(bdi_destroy);
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d9b5c81..c57c442 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3741,44 +3741,43 @@
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
* @pdirty: out parameter for number of dirty pages
* @pwriteback: out parameter for number of pages under writeback
*
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg. Dirty and writeback are self-explanatory. Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
+ * is a bit more involved.
*
- * A memcg's headroom is "min(max, high) - used". The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages. Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors. Note that this doesn't consider the actual amount of
+ * available memory in the system. The caller should further cap
+ * *@pheadroom accordingly.
*/
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
- unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- unsigned long head_room = PAGE_COUNTER_MAX;
- unsigned long file_pages;
*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ *pheadroom = PAGE_COUNTER_MAX;
- file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(memcg->memory.limit, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);
- head_room = min(head_room, ceiling - min(ceiling, used));
+ *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
}
-
- *pavail = file_pages + head_room;
}
#else /* CONFIG_CGROUP_WRITEBACK */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0a931cd..2c90357 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -145,9 +145,6 @@
unsigned long pos_ratio;
};
-#define DTC_INIT_COMMON(__wb) .wb = (__wb), \
- .wb_completions = &(__wb)->completions
-
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -157,12 +154,16 @@
#ifdef CONFIG_CGROUP_WRITEBACK
-#define GDTC_INIT(__wb) .dom = &global_wb_domain, \
- DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb) .wb = (__wb), \
+ .dom = &global_wb_domain, \
+ .wb_completions = &(__wb)->completions
+
#define GDTC_INIT_NO_WB .dom = &global_wb_domain
-#define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \
- .gdtc = __gdtc, \
- DTC_INIT_COMMON(__wb)
+
+#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
+ .dom = mem_cgroup_wb_domain(__wb), \
+ .wb_completions = &(__wb)->memcg_completions, \
+ .gdtc = __gdtc
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
@@ -213,7 +214,8 @@
#else /* CONFIG_CGROUP_WRITEBACK */
-#define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb) .wb = (__wb), \
+ .wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)
@@ -682,13 +684,19 @@
return max(thresh, dom->dirty_limit);
}
-/* memory available to a memcg domain is capped by system-wide clean memory */
-static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+ unsigned long filepages, unsigned long headroom)
{
struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
- unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+ unsigned long clean = filepages - min(filepages, mdtc->dirty);
+ unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+ unsigned long other_clean = global_clean - min(global_clean, clean);
- mdtc->avail = min(mdtc->avail, clean);
+ mdtc->avail = filepages + min(headroom, other_clean);
}
/**
@@ -1562,16 +1570,16 @@
}
if (mdtc) {
- unsigned long writeback;
+ unsigned long filepages, headroom, writeback;
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
- mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
- &writeback);
- mdtc_cap_avail(mdtc);
+ mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc->dirty, &writeback);
mdtc->dirty += writeback;
+ mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc);
@@ -1893,10 +1901,11 @@
return true;
if (mdtc) {
- unsigned long writeback;
+ unsigned long filepages, headroom, writeback;
- mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
- mdtc_cap_avail(mdtc);
+ mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+ &writeback);
+ mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc); /* ditto, ignore writeback */
if (mdtc->dirty > mdtc->bg_thresh)
@@ -1956,7 +1965,6 @@
int nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
struct bdi_writeback *wb;
- struct wb_iter iter;
/*
* We want to write everything out, not just down to the dirty
@@ -1965,10 +1973,12 @@
if (!bdi_has_dirty_io(&q->backing_dev_info))
return;
- bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+ rcu_read_lock();
+ list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
+ rcu_read_unlock();
}
/*