Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
Pull percpu updates from Tejun Heo:
- Nick improved the generic implementations of the percpu operations
  which modify the variable and return the result, so that they
  calculate the percpu address only once (a caller-level sketch
  follows the shortlog below).
- percpu_ref percpu <-> atomic mode switching improvements. The
  patchset was originally posted about a year ago but fell through
  the cracks (a usage sketch follows the lib/percpu-refcount.c diff
  below).
- Miscellaneous non-critical fixes.

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
mm/percpu.c: correct max_distance calculation for pcpu_embed_first_chunk()
percpu: eliminate two sparse warnings
percpu: improve generic percpu modify-return implementation
percpu-refcount: init ->confirm_switch member properly
percpu_ref: allow operation mode switching operations to be called concurrently
percpu_ref: restructure operation mode switching
percpu_ref: unify staggered atomic switching wait behavior
percpu_ref: reorganize __percpu_ref_switch_to_atomic() and relocate percpu_ref_switch_to_atomic()
percpu_ref: remove unnecessary RCU grace period for staggered atomic switching confirmation
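
The rework in the first item targets the "modify and return" percpu
accessors. As a caller-level illustration only (the demo_* names here are
invented and not part of the patch), these are the kinds of operations
whose generic fallbacks now compute the percpu address a single time:

#include <linux/types.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

/* bump this CPU's counter and return the value it ended up at */
static unsigned long demo_account_hit(void)
{
	return this_cpu_add_return(demo_hits, 1);
}

/* claim a per-cpu slot: 0 -> 1, report whether this caller won */
static bool demo_try_claim(void)
{
	return this_cpu_cmpxchg(demo_hits, 0, 1) == 0;
}

On architectures with dedicated percpu instructions (such as x86) these
accessors do not use the generic fallbacks; the change below only affects
the asm-generic expansions.
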
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e02e3f8..84f58de 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -521,7 +521,8 @@
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
const unsigned long __percpu *addr)
{
- unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+ unsigned long __percpu *a =
+ (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
#ifdef CONFIG_X86_64
return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@
asm volatile("bt "__percpu_arg(2)",%1\n\t"
CC_SET(c)
: CC_OUT(c) (oldbit)
- : "m" (*(unsigned long *)addr), "Ir" (nr));
+ : "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
return oldbit;
}
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@
#define PER_CPU_DEF_ATTRIBUTES
#endif
+#define raw_cpu_generic_read(pcp) \
+({ \
+ *raw_cpu_ptr(&(pcp)); \
+})
+
#define raw_cpu_generic_to_op(pcp, val, op) \
do { \
*raw_cpu_ptr(&(pcp)) op val; \
@@ -72,34 +77,39 @@
#define raw_cpu_generic_add_return(pcp, val) \
({ \
- raw_cpu_add(pcp, val); \
- raw_cpu_read(pcp); \
+ typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
+ \
+ *__p += val; \
+ *__p; \
})
#define raw_cpu_generic_xchg(pcp, nval) \
({ \
+ typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
typeof(pcp) __ret; \
- __ret = raw_cpu_read(pcp); \
- raw_cpu_write(pcp, nval); \
+ __ret = *__p; \
+ *__p = nval; \
__ret; \
})
#define raw_cpu_generic_cmpxchg(pcp, oval, nval) \
({ \
+ typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
typeof(pcp) __ret; \
- __ret = raw_cpu_read(pcp); \
+ __ret = *__p; \
if (__ret == (oval)) \
- raw_cpu_write(pcp, nval); \
+ *__p = nval; \
__ret; \
})
#define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
({ \
+ typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \
+ typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \
int __ret = 0; \
- if (raw_cpu_read(pcp1) == (oval1) && \
- raw_cpu_read(pcp2) == (oval2)) { \
- raw_cpu_write(pcp1, nval1); \
- raw_cpu_write(pcp2, nval2); \
+ if (*__p1 == (oval1) && *__p2 == (oval2)) { \
+ *__p1 = nval1; \
+ *__p2 = nval2; \
__ret = 1; \
} \
(__ret); \
@@ -109,7 +119,7 @@
({ \
typeof(pcp) __ret; \
preempt_disable(); \
- __ret = *this_cpu_ptr(&(pcp)); \
+ __ret = raw_cpu_generic_read(pcp); \
preempt_enable(); \
__ret; \
})
@@ -118,17 +128,17 @@
do { \
unsigned long __flags; \
raw_local_irq_save(__flags); \
- *raw_cpu_ptr(&(pcp)) op val; \
+ raw_cpu_generic_to_op(pcp, val, op); \
raw_local_irq_restore(__flags); \
} while (0)
+
#define this_cpu_generic_add_return(pcp, val) \
({ \
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
- raw_cpu_add(pcp, val); \
- __ret = raw_cpu_read(pcp); \
+ __ret = raw_cpu_generic_add_return(pcp, val); \
raw_local_irq_restore(__flags); \
__ret; \
})
@@ -138,8 +148,7 @@
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
- __ret = raw_cpu_read(pcp); \
- raw_cpu_write(pcp, nval); \
+ __ret = raw_cpu_generic_xchg(pcp, nval); \
raw_local_irq_restore(__flags); \
__ret; \
})
@@ -149,9 +158,7 @@
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
- __ret = raw_cpu_read(pcp); \
- if (__ret == (oval)) \
- raw_cpu_write(pcp, nval); \
+ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \
raw_local_irq_restore(__flags); \
__ret; \
})
@@ -168,16 +175,16 @@
})
#ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_write_1
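
The asm-generic changes above boil down to evaluating raw_cpu_ptr() once
per operation. As a rough sketch (these demo_* macros are illustrative,
not the kernel's), the generic add_return fallback effectively went from
deriving the percpu address twice to deriving it once:

/* before: the percpu address is derived twice per operation */
#define demo_add_return_old(pcp, val)					\
({									\
	*raw_cpu_ptr(&(pcp)) += (val);	/* address computed here... */	\
	*raw_cpu_ptr(&(pcp));		/* ...and computed again */	\
})

/* after: the address is derived once and reused for both accesses */
#define demo_add_return_new(pcp, val)					\
({									\
	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
	*__p += (val);							\
	*__p;								\
})
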
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 27fe749..9ac959e 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,6 +33,7 @@
#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1))
+static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@
atomic_long_set(&ref->count, start_count);
ref->release = release;
+ ref->confirm_switch = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
if (percpu_count) {
+ /* non-NULL confirm_switch indicates switching in progress */
+ WARN_ON_ONCE(ref->confirm_switch);
free_percpu(percpu_count);
ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
}
@@ -161,34 +165,67 @@
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
- if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
- /* switching from percpu to atomic */
- ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-
- /*
- * Non-NULL ->confirm_switch is used to indicate that
- * switching is in progress. Use noop one if unspecified.
- */
- WARN_ON_ONCE(ref->confirm_switch);
- ref->confirm_switch =
- confirm_switch ?: percpu_ref_noop_confirm_switch;
-
- percpu_ref_get(ref); /* put after confirmation */
- call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
- } else if (confirm_switch) {
- /*
- * Somebody already set ATOMIC. Switching may still be in
- * progress. @confirm_switch must be invoked after the
- * switching is complete and a full sched RCU grace period
- * has passed. Wait synchronously for the previous
- * switching and schedule @confirm_switch invocation.
- */
- wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
- ref->confirm_switch = confirm_switch;
-
- percpu_ref_get(ref); /* put after confirmation */
- call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
+ if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
+ if (confirm_switch)
+ confirm_switch(ref);
+ return;
}
+
+ /* switching from percpu to atomic */
+ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+
+ /*
+ * Non-NULL ->confirm_switch is used to indicate that switching is
+ * in progress. Use noop one if unspecified.
+ */
+ ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
+
+ percpu_ref_get(ref); /* put after confirmation */
+ call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
+}
+
+static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
+{
+ unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
+ int cpu;
+
+ BUG_ON(!percpu_count);
+
+ if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
+ return;
+
+ atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
+
+ /*
+ * Restore per-cpu operation. smp_store_release() is paired with
+ * smp_read_barrier_depends() in __ref_is_percpu() and guarantees
+ * that the zeroing is visible to all percpu accesses which can see
+ * the following __PERCPU_REF_ATOMIC clearing.
+ */
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(percpu_count, cpu) = 0;
+
+ smp_store_release(&ref->percpu_count_ptr,
+ ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
+}
+
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch)
+{
+ lockdep_assert_held(&percpu_ref_switch_lock);
+
+ /*
+ * If the previous ATOMIC switching hasn't finished yet, wait for
+ * its completion. If the caller ensures that ATOMIC switching
+ * isn't in progress, this function can be called from any context.
+ */
+ wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
+ percpu_ref_switch_lock);
+
+ if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
+ __percpu_ref_switch_to_atomic(ref, confirm_switch);
+ else
+ __percpu_ref_switch_to_percpu(ref);
}
/**
@@ -207,47 +244,21 @@
* operations. Note that @ref will stay in atomic mode across kill/reinit
* cycles until percpu_ref_switch_to_percpu() is called.
*
- * This function normally doesn't block and can be called from any context
- * but it may block if @confirm_kill is specified and @ref is already in
- * the process of switching to atomic mode. In such cases, @confirm_switch
- * will be invoked after the switching is complete.
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * This function may block if @ref is in the process of switching to atomic
+ * mode. If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
ref->force_atomic = true;
- __percpu_ref_switch_to_atomic(ref, confirm_switch);
-}
+ __percpu_ref_switch_mode(ref, confirm_switch);
-static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
-{
- unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
- int cpu;
-
- BUG_ON(!percpu_count);
-
- if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
- return;
-
- wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-
- atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
-
- /*
- * Restore per-cpu operation. smp_store_release() is paired with
- * smp_read_barrier_depends() in __ref_is_percpu() and guarantees
- * that the zeroing is visible to all percpu accesses which can see
- * the following __PERCPU_REF_ATOMIC clearing.
- */
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(percpu_count, cpu) = 0;
-
- smp_store_release(&ref->percpu_count_ptr,
- ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
/**
@@ -264,17 +275,20 @@
* dying or dead, the actual switching takes place on the following
* percpu_ref_reinit().
*
- * This function normally doesn't block and can be called from any context
- * but it may block if @ref is in the process of switching to atomic mode
- * by percpu_ref_switch_atomic().
+ * This function may block if @ref is in the process of switching to atomic
+ * mode. If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
- ref->force_atomic = false;
+ unsigned long flags;
- /* a dying or dead ref can't be switched to percpu mode w/o reinit */
- if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
- __percpu_ref_switch_to_percpu(ref);
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+ ref->force_atomic = false;
+ __percpu_ref_switch_mode(ref, NULL);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
/**
@@ -290,21 +304,23 @@
*
* This function normally doesn't block and can be called from any context
* but it may block if @confirm_kill is specified and @ref is in the
- * process of switching to atomic mode by percpu_ref_switch_atomic().
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * process of switching to atomic mode by percpu_ref_switch_to_atomic().
*/
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
"%s called more than once on %pf!", __func__, ref->release);
ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
- __percpu_ref_switch_to_atomic(ref, confirm_kill);
+ __percpu_ref_switch_mode(ref, confirm_kill);
percpu_ref_put(ref);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
@@ -321,11 +337,16 @@
*/
void percpu_ref_reinit(struct percpu_ref *ref)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
WARN_ON_ONCE(!percpu_ref_is_zero(ref));
ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
percpu_ref_get(ref);
- if (!ref->force_atomic)
- __percpu_ref_switch_to_percpu(ref);
+ __percpu_ref_switch_mode(ref, NULL);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);
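
For context on the lib/percpu-refcount.c changes above, here is a minimal,
hypothetical user of the mode-switching API (all demo_* names are
invented). With this series, the switching operations serialize on
percpu_ref_switch_lock, so they may be called concurrently; a caller may
still block while a previous switch to atomic mode is being confirmed.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

struct demo_obj {
	struct percpu_ref ref;
	/* ... payload ... */
};

static void demo_release(struct percpu_ref *ref)
{
	struct demo_obj *obj = container_of(ref, struct demo_obj, ref);

	percpu_ref_exit(&obj->ref);
	kfree(obj);
}

static void demo_atomic_confirmed(struct percpu_ref *ref)
{
	/* invoked once the switch to atomic mode has taken effect */
}

static void demo_confirm_kill(struct percpu_ref *ref)
{
	/* all percpu references are now folded into the atomic count */
}

static int demo_setup(struct demo_obj *obj)
{
	return percpu_ref_init(&obj->ref, demo_release, 0, GFP_KERNEL);
}

static void demo_quiesce_and_resume(struct demo_obj *obj)
{
	/* may block if a previous mode switch is still in flight */
	percpu_ref_switch_to_atomic(&obj->ref, demo_atomic_confirmed);
	/* ... operate while the ref is in atomic mode ... */
	percpu_ref_switch_to_percpu(&obj->ref);
}

static void demo_teardown(struct demo_obj *obj)
{
	/* mark dead; demo_release() runs when the last reference is put */
	percpu_ref_kill_and_confirm(&obj->ref, demo_confirm_kill);
}
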
diff --git a/mm/percpu.c b/mm/percpu.c
index 9903830..2557143 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1961,8 +1961,9 @@
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
- size_t size_sum, areas_size, max_distance;
- int group, i, rc;
+ size_t size_sum, areas_size;
+ unsigned long max_distance;
+ int group, i, highest_group, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@@ -1978,7 +1979,8 @@
goto out_free;
}
- /* allocate, copy and determine base address */
+ /* allocate, copy and determine base address & max_distance */
+ highest_group = 0;
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
unsigned int cpu = NR_CPUS;
@@ -1999,6 +2001,21 @@
areas[group] = ptr;
base = min(ptr, base);
+ if (ptr > areas[highest_group])
+ highest_group = group;
+ }
+ max_distance = areas[highest_group] - base;
+ max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
+
+ /* warn if maximum distance is further than 75% of vmalloc space */
+ if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+ pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ /* and fail if we have fallback */
+ rc = -EINVAL;
+ goto out_free_areas;
+#endif
}
/*
@@ -2023,23 +2040,8 @@
}
/* base address is now known, determine group base offsets */
- max_distance = 0;
for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
- max_distance = max_t(size_t, max_distance,
- ai->groups[group].base_offset);
- }
- max_distance += ai->unit_size;
-
- /* warn if maximum distance is further than 75% of vmalloc space */
- if (max_distance > VMALLOC_TOTAL * 3 / 4) {
- pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
- max_distance, VMALLOC_TOTAL);
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
- /* and fail if we have fallback */
- rc = -EINVAL;
- goto out_free;
-#endif
}
pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",