Merge branch 'sched/urgent' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eea2a6f..1ef5e48 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -301,8 +301,6 @@
if (!has_steal_clock)
return;
- memset(st, 0, sizeof(*st));
-
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_info("kvm-stealtime: cpu %d, msr %llx\n",
cpu, (unsigned long long) slow_virt_to_phys(st));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f..b45acfd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,9 @@
__put_task_struct(t);
}
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void task_cputime(struct task_struct *t,
cputime_t *utime, cputime_t *stime);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e135..2fb4d44 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@
}
/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+ struct sighand_struct *sighand;
+ struct task_struct *task;
+
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we can read the already freed and reused memory.
+ */
+retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ return NULL;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ smp_rmb();
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * We've re-checked that "task == *ptask", now we have two different
+ * cases:
+ *
+ * 1. This is actually the same task/task_struct. In this case
+ * sighand != NULL tells us it is still alive.
+ *
+ * 2. This is another task which got the same memory for task_struct.
+ * We can't know this of course, and we can not trust
+ * sighand != NULL.
+ *
+ * In this case we actually return a random value, but this is
+ * correct.
+ *
+ * If we return NULL - we can pretend that we actually noticed that
+ * *ptask was updated when the previous task has exited. Or pretend
+ * that probe_slab_address(&sighand) reads NULL.
+ *
+ * If we return the new task (because sighand is not NULL for any
+ * reason) - this is fine too. This (new) task can't go away before
+ * another gp pass.
+ *
+ * And note: We could even eliminate the false positive if re-read
+ * task->sighand once again to avoid the falsely NULL. But this case
+ * is very unlikely so we don't care.
+ */
+ if (!sighand)
+ return NULL;
+
+ return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = task_rcu_dereference(ptask);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
+
+/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51d7105..e406ba0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7231,7 +7231,6 @@
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5..3d60e5d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -257,7 +257,7 @@
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(¶virt_steal_enabled)) {
@@ -272,14 +272,14 @@
* time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds.
*/
- steal_jiffies = nsecs_to_jiffies(steal);
+ steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
account_steal_time(jiffies_to_cputime(steal_jiffies));
return steal_jiffies;
}
#endif
- return false;
+ return 0;
}
/*
@@ -346,7 +346,7 @@
u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
- if (steal_account_process_tick())
+ if (steal_account_process_tick(ULONG_MAX))
return;
cputime *= ticks;
@@ -477,7 +477,7 @@
return;
}
- if (steal_account_process_tick())
+ if (steal_account_process_tick(ULONG_MAX))
return;
if (user_tick)
@@ -681,12 +681,14 @@
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta = now - tsk->vtime_snap;
+ unsigned long delta_jiffies, steal_jiffies;
+ delta_jiffies = now - tsk->vtime_snap;
+ steal_jiffies = steal_account_process_tick(delta_jiffies);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
- return jiffies_to_cputime(delta);
+ return jiffies_to_cputime(delta_jiffies - steal_jiffies);
}
static void __vtime_account_system(struct task_struct *tsk)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c39..2a0a999 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations);
+#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8c5d2d..7306356 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1303,6 +1303,8 @@
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1370,31 +1372,11 @@
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1459,6 @@
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1503,16 +1484,9 @@
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -3698,7 +3672,7 @@
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3836,13 +3810,11 @@
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c5aeedf..9fb873c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -201,6 +201,8 @@
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7cbeb92..71ce986 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1809,16 +1809,3 @@
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
-#endif
-}