rcu: Add RCU-sched flavors of get-state and cond-sync

The get_state_synchronize_rcu() and cond_synchronize_rcu() functions
allow polling for grace-period completion, with an actual wait for a
grace period occurring only when cond_synchronize_rcu() is called too
soon after the corresponding get_state_synchronize_rcu().  However,
these functions work only for vanilla RCU.  This commit adds the
get_state_synchronize_sched() and cond_synchronize_sched(), which provide
the same capability for RCU-sched.

Reported-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 3df6c1e..ff968b7 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -37,6 +37,16 @@
 	might_sleep();
 }
 
+static inline unsigned long get_state_synchronize_sched(void)
+{
+	return 0;
+}
+
+static inline void cond_synchronize_sched(unsigned long oldstate)
+{
+	might_sleep();
+}
+
 static inline void rcu_barrier_bh(void)
 {
 	wait_rcu_gp(call_rcu_bh);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 4568791..5abec82 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -76,6 +76,8 @@
 void rcu_barrier_sched(void);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
+unsigned long get_state_synchronize_sched(void);
+void cond_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e3268..0f2cb55 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@
 	.deferred_free	= rcu_sched_torture_deferred_free,
 	.sync		= synchronize_sched,
 	.exp_sync	= synchronize_sched_expedited,
+	.get_state	= get_state_synchronize_sched,
+	.cond_sync	= cond_synchronize_sched,
 	.call		= call_rcu_sched,
 	.cb_barrier	= rcu_barrier_sched,
 	.fqs		= rcu_sched_force_quiescent_state,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8b5dd8b..9629298 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,6 +3253,58 @@
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
+{
+	/*
+	 * Any prior manipulation of RCU-protected data must happen
+	 * before the load from ->gpnum.
+	 */
+	smp_mb();  /* ^^^ */
+
+	/*
+	 * Make sure this load happens before the purportedly
+	 * time-consuming work between get_state_synchronize_sched()
+	 * and cond_synchronize_sched().
+	 */
+	return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return.  Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.  But
+ * counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	if (ULONG_CMP_GE(oldstate, newstate))
+		synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*