Merge branch 'percpu_ref-rcu-audit-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc
Pull percpu_ref rcu fixes from Tejun Heo:
"Jann Horn found that aio was depending on the internal RCU grace
periods of percpu-ref and that it's broken because aio uses regular
RCU while percpu_ref uses sched-RCU.
Depending on percpu_ref's internal grace periods isn't a good idea
because
- The RCU type might not match.
- percpu_ref's grace periods are used to switch to atomic mode. They
aren't between the last put and the invocation of the last release.
This is easy to get confused about and can lead to subtle bugs.
- percpu_ref might not have grace periods at all depending on its
current operation mode.
This patchset audits and fixes percpu_ref users for their RCU usages"
[ There's a continuation of this series that clarifies percpu_ref
documentation that the internal grace periods must not be depended
upon, and introduces rcu_work to simplify bouncing to a workqueue
after an RCU grace period.
That will go in for 4.17 - this is just the minimal set with the fixes
that are tagged for -stable ]
* 'percpu_ref-rcu-audit-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc:
RDMAVT: Fix synchronization around percpu_ref
fs/aio: Use RCU accessors for kioctx_table->table[]
fs/aio: Add explicit RCU grace period when freeing kioctx
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 1b2e536..cc429b5 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -489,11 +489,13 @@
unsigned long timeout;
struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
- if (percpu_ref_is_zero(&mr->refcount))
- return 0;
- /* avoid dma mr */
- if (mr->lkey)
+ if (mr->lkey) {
+ /* avoid dma mr */
rvt_dereg_clean_qps(mr);
+ /* @mr was indexed on rcu protected @lkey_table */
+ synchronize_rcu();
+ }
+
timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
if (!timeout) {
rvt_pr_err(rdi,
diff --git a/fs/aio.c b/fs/aio.c
index a062d75..6bcd3fb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@
#define AIO_RING_PAGES 8
struct kioctx_table {
- struct rcu_head rcu;
- unsigned nr;
- struct kioctx *table[];
+ struct rcu_head rcu;
+ unsigned nr;
+ struct kioctx __rcu *table[];
};
struct kioctx_cpu {
@@ -115,7 +115,8 @@
struct page **ring_pages;
long nr_pages;
- struct work_struct free_work;
+ struct rcu_head free_rcu;
+ struct work_struct free_work; /* see free_ioctx() */
/*
* signals when all in-flight requests are done
@@ -329,7 +330,7 @@
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
- ctx = table->table[i];
+ ctx = rcu_dereference(table->table[i]);
if (ctx && ctx->aio_ring_file == file) {
if (!atomic_read(&ctx->dead)) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -588,6 +589,12 @@
return cancel(&kiocb->common);
}
+/*
+ * free_ioctx() should be RCU delayed to synchronize against the RCU
+ * protected lookup_ioctx() and also needs process context to call
+ * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
+ * ->free_work.
+ */
static void free_ioctx(struct work_struct *work)
{
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -601,6 +608,14 @@
kmem_cache_free(kioctx_cachep, ctx);
}
+static void free_ioctx_rcufn(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
+
+ INIT_WORK(&ctx->free_work, free_ioctx);
+ schedule_work(&ctx->free_work);
+}
+
static void free_ioctx_reqs(struct percpu_ref *ref)
{
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -609,8 +624,8 @@
if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
complete(&ctx->rq_wait->comp);
- INIT_WORK(&ctx->free_work, free_ioctx);
- schedule_work(&ctx->free_work);
+ /* Synchronize against RCU protected table->table[] dereferences */
+ call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
}
/*
@@ -651,9 +666,9 @@
while (1) {
if (table)
for (i = 0; i < table->nr; i++)
- if (!table->table[i]) {
+ if (!rcu_access_pointer(table->table[i])) {
ctx->id = i;
- table->table[i] = ctx;
+ rcu_assign_pointer(table->table[i], ctx);
spin_unlock(&mm->ioctx_lock);
/* While kioctx setup is in progress,
@@ -834,11 +849,11 @@
}
table = rcu_dereference_raw(mm->ioctx_table);
- WARN_ON(ctx != table->table[ctx->id]);
- table->table[ctx->id] = NULL;
+ WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
+ RCU_INIT_POINTER(table->table[ctx->id], NULL);
spin_unlock(&mm->ioctx_lock);
- /* percpu_ref_kill() will do the necessary call_rcu() */
+ /* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
/*
@@ -880,7 +895,8 @@
skipped = 0;
for (i = 0; i < table->nr; ++i) {
- struct kioctx *ctx = table->table[i];
+ struct kioctx *ctx =
+ rcu_dereference_protected(table->table[i], true);
if (!ctx) {
skipped++;
@@ -1069,7 +1085,7 @@
if (!table || id >= table->nr)
goto out;
- ctx = table->table[id];
+ ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
percpu_ref_get(&ctx->users);
ret = ctx;