[PATCH] spufs: Improved SPU preemptability.

This patch makes it easier to preempt an SPU context by
having the scheduler hold ctx->state_sema for much shorter
periods of time.

As part of this restructuring, the control logic for the "run"
operation is moved from arch/powerpc/platforms/cell/spu_base.c
to arch/powerpc/platforms/cell/spufs/file.c.  The base code
retains the "bottom half" handlers for class 0 and class 1
interrupts, which it now exports.  The new run loop re-acquires
an SPU if the context was preempted while waiting.

From: Mark Nutter <mnutter@us.ibm.com>
Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
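---

The restructured run loop in spufs_run_spu() looks roughly like the
following (condensed sketch; the full version, with error handling,
is in the file.c hunks below):

	ret = spu_run_init(ctx, npc, status);	/* acquire ctx, write npc, start SPU */
	do {
		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, status));
		if (unlikely(ret))
			break;
		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
			/* the scheduler took our SPU away while we
			 * slept; get another one and keep going */
			ret = spu_reacquire_runnable(ctx, npc, status);
			if (ret)
				return ret;
			continue;
		}
		ret = spu_process_events(ctx);	/* faults, errors, signals */
	} while (!ret && !(*status & (SPU_STATUS_STOPPED_BY_STOP |
				      SPU_STATUS_STOPPED_BY_HALT)));
	/* then: stop the SPU, read back npc/status, spu_yield(ctx) */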
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 167580c..8abd4bd 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -130,7 +130,8 @@
 	spu->dar = ea;
 	spu->dsisr = dsisr;
 	mb();
-	wake_up(&spu->stop_wq);
+	if (spu->stop_callback)
+		spu->stop_callback(spu);
 	return 0;
 }
 
@@ -151,7 +152,8 @@
 {
 	pr_debug("%s\n", __FUNCTION__);
 	spu->stop_code = in_be32(&spu->problem->spu_status_R);
-	wake_up(&spu->stop_wq);
+	if (spu->stop_callback)
+		spu->stop_callback(spu);
 	return 0;
 }
 
@@ -159,7 +161,8 @@
 {
 	pr_debug("%s\n", __FUNCTION__);
 	spu->stop_code = in_be32(&spu->problem->spu_status_R);
-	wake_up(&spu->stop_wq);
+	if (spu->stop_callback)
+		spu->stop_callback(spu);
 	return 0;
 }
 
@@ -190,12 +193,13 @@
 
 	spu = data;
 	spu->class_0_pending = 1;
-	wake_up(&spu->stop_wq);
+	if (spu->stop_callback)
+		spu->stop_callback(spu);
 
 	return IRQ_HANDLED;
 }
 
-static int
+int
 spu_irq_class_0_bottom(struct spu *spu)
 {
 	unsigned long stat;
@@ -214,8 +218,10 @@
 		__spu_trap_error(spu);
 
 	out_be64(&spu->priv1->int_stat_class0_RW, stat);
-	return 0;
+
+	return (stat & 0x7) ? -EIO : 0;
 }
+EXPORT_SYMBOL_GPL(spu_irq_class_0_bottom);
 
 static irqreturn_t
 spu_irq_class_1(int irq, void *data, struct pt_regs *regs)
@@ -250,6 +256,7 @@
 
 	return stat ? IRQ_HANDLED : IRQ_NONE;
 }
+EXPORT_SYMBOL_GPL(spu_irq_class_1_bottom);
 
 static irqreturn_t
 spu_irq_class_2(int irq, void *data, struct pt_regs *regs)
@@ -478,7 +485,7 @@
 	return -EFAULT;
 }
 
-static int spu_handle_pte_fault(struct spu *spu)
+int spu_irq_class_1_bottom(struct spu *spu)
 {
 	u64 ea, dsisr, access, error = 0UL;
 	int ret = 0;
@@ -508,76 +515,6 @@
 	return ret;
 }
 
-static inline int spu_pending(struct spu *spu, u32 * stat)
-{
-	struct spu_problem __iomem *prob = spu->problem;
-	u64 pte_fault;
-
-	*stat = in_be32(&prob->spu_status_R);
-	pte_fault = spu->dsisr &
-		    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
-	return (!(*stat & 0x1) || pte_fault || spu->class_0_pending) ? 1 : 0;
-}
-
-int spu_run(struct spu *spu)
-{
-	struct spu_problem __iomem *prob;
-	struct spu_priv1 __iomem *priv1;
-	struct spu_priv2 __iomem *priv2;
-	u32 status;
-	int ret;
-
-	prob = spu->problem;
-	priv1 = spu->priv1;
-	priv2 = spu->priv2;
-
-	/* Let SPU run.  */
-	eieio();
-	out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_RUNNABLE);
-
-	do {
-		ret = wait_event_interruptible(spu->stop_wq,
-					       spu_pending(spu, &status));
-
-		if (spu->dsisr &
-		    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
-			ret = spu_handle_pte_fault(spu);
-
-		if (spu->class_0_pending)
-			spu_irq_class_0_bottom(spu);
-
-		if (!ret && signal_pending(current))
-			ret = -ERESTARTSYS;
-
-	} while (!ret && !(status &
-			   (SPU_STATUS_STOPPED_BY_STOP |
-			    SPU_STATUS_STOPPED_BY_HALT)));
-
-	/* Ensure SPU is stopped.  */
-	out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_STOP);
-	eieio();
-	while (in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING)
-		cpu_relax();
-
-	out_be64(&priv2->slb_invalidate_all_W, 0);
-	out_be64(&priv1->tlb_invalidate_entry_W, 0UL);
-	eieio();
-
-	/* Check for SPU breakpoint.  */
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		status = in_be32(&prob->spu_status_R);
-
-		if ((status & SPU_STATUS_STOPPED_BY_STOP)
-		    && status >> SPU_STOP_STATUS_SHIFT == 0x3fff) {
-			force_sig(SIGTRAP, current);
-			ret = -ERESTARTSYS;
-		}
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(spu_run);
-
 static void __iomem * __init map_spe_prop(struct device_node *n,
 						 const char *name)
 {
@@ -693,9 +630,9 @@
 	out_be64(&spu->priv1->mfc_sdr_RW, mfspr(SPRN_SDR1));
 	out_be64(&spu->priv1->mfc_sr1_RW, 0x33);
 
-	init_waitqueue_head(&spu->stop_wq);
 	spu->ibox_callback = NULL;
 	spu->wbox_callback = NULL;
+	spu->stop_callback = NULL;
 
 	down(&spu_mutex);
 	spu->number = number++;
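
The base code no longer wakes a private per-SPU wait queue; it calls
an optional stop callback that spufs installs for as long as a context
is bound to the SPU.  In outline (the registration itself is in the
bind_context()/unbind_context() hunks in sched.c below):

	/* spufs, in bind_context(): */
	spu->stop_callback = spufs_stop_callback;

	/* base, from the interrupt handlers and bottom halves: */
	if (spu->stop_callback)
		spu->stop_callback(spu);	/* wakes ctx->stop_wq */

	/* spufs, in unbind_context(): */
	spu->stop_callback = NULL;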
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index caf0984..66567c1 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -232,6 +232,23 @@
 	return ctx->csa.lscsa->ls;
 }
 
+static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
+{
+	spin_lock(&ctx->csa.register_lock);
+	ctx->csa.prob.spu_runcntl_RW = val;
+	if (val & SPU_RUNCNTL_RUNNABLE) {
+		ctx->csa.prob.spu_status_R |= SPU_STATUS_RUNNING;
+	} else {
+		ctx->csa.prob.spu_status_R &= ~SPU_STATUS_RUNNING;
+	}
+	spin_unlock(&ctx->csa.register_lock);
+}
+
+static void spu_backing_runcntl_stop(struct spu_context *ctx)
+{
+	spu_backing_runcntl_write(ctx, SPU_RUNCNTL_STOP);
+}
+
 struct spu_context_ops spu_backing_ops = {
 	.mbox_read = spu_backing_mbox_read,
 	.mbox_stat_read = spu_backing_mbox_stat_read,
@@ -249,4 +266,6 @@
 	.npc_write = spu_backing_npc_write,
 	.status_read = spu_backing_status_read,
 	.get_ls = spu_backing_get_ls,
+	.runcntl_write = spu_backing_runcntl_write,
+	.runcntl_stop = spu_backing_runcntl_stop,
 };
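
With runcntl_write and runcntl_stop present in both ops tables, the
run loop in file.c can start and stop a context without caring whether
it is currently loaded on an SPU or saved:

	/* valid in both SPU_STATE_RUNNABLE and SPU_STATE_SAVED */
	ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
	...
	ctx->ops->runcntl_stop(ctx);

For a saved context the backing variant only updates the checkpointed
runcntl and status registers under csa.register_lock, so a later
restore resumes the SPU in the requested run state.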
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 5d6195f..0d88a1c 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -45,6 +45,7 @@
 	init_rwsem(&ctx->state_sema);
 	init_waitqueue_head(&ctx->ibox_wq);
 	init_waitqueue_head(&ctx->wbox_wq);
+	init_waitqueue_head(&ctx->stop_wq);
 	ctx->ibox_fasync = NULL;
 	ctx->wbox_fasync = NULL;
 	ctx->state = SPU_STATE_SAVED;
@@ -105,7 +106,7 @@
 	up_read(&ctx->state_sema);
 }
 
-static void spu_unmap_mappings(struct spu_context *ctx)
+void spu_unmap_mappings(struct spu_context *ctx)
 {
 	unmap_mapping_range(ctx->local_store, 0, LS_SIZE, 1);
 }
@@ -126,7 +127,6 @@
 
 	down_write(&ctx->state_sema);
 	if (ctx->state == SPU_STATE_SAVED) {
-		spu_unmap_mappings(ctx);
 		ret = spu_activate(ctx, 0);
 		ctx->state = SPU_STATE_RUNNABLE;
 	}
@@ -154,7 +154,6 @@
 	down_write(&ctx->state_sema);
 
 	if (ctx->state == SPU_STATE_RUNNABLE) {
-		spu_unmap_mappings(ctx);
 		spu_deactivate(ctx);
 		ctx->state = SPU_STATE_SAVED;
 	}
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 0fe1fec..af5adc3 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/poll.h>
+#include <linux/ptrace.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>
@@ -540,26 +541,122 @@
 	.read	= spufs_wbox_stat_read,
 };
 
-long spufs_run_spu(struct file *file, struct spu_context *ctx,
-				u32 *npc, u32 *status)
+/* interrupt-level stop callback function. */
+void spufs_stop_callback(struct spu *spu)
+{
+	struct spu_context *ctx = spu->ctx;
+
+	wake_up_all(&ctx->stop_wq);
+}
+
+static inline int spu_stopped(struct spu_context *ctx, u32 * stat)
+{
+	struct spu *spu;
+	u64 pte_fault;
+
+	*stat = ctx->ops->status_read(ctx);
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		return 1;
+	spu = ctx->spu;
+	pte_fault = spu->dsisr &
+	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
+	return (!(*stat & 0x1) || pte_fault || spu->class_0_pending) ? 1 : 0;
+}
+
+static inline int spu_run_init(struct spu_context *ctx, u32 * npc,
+			       u32 * status)
 {
 	int ret;
 
-	ret = spu_acquire_runnable(ctx);
-	if (ret)
+	if ((ret = spu_acquire_runnable(ctx)) != 0)
+		return ret;
+	ctx->ops->npc_write(ctx, *npc);
+	ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
+	return 0;
+}
+
+static inline int spu_run_fini(struct spu_context *ctx, u32 * npc,
+			       u32 * status)
+{
+	int ret = 0;
+
+	*status = ctx->ops->status_read(ctx);
+	*npc = ctx->ops->npc_read(ctx);
+	spu_release(ctx);
+
+	if (signal_pending(current))
+		ret = -ERESTARTSYS;
+	if (unlikely(current->ptrace & PT_PTRACED)) {
+		if ((*status & SPU_STATUS_STOPPED_BY_STOP)
+		    && (*status >> SPU_STOP_STATUS_SHIFT) == 0x3fff) {
+			force_sig(SIGTRAP, current);
+			ret = -ERESTARTSYS;
+		}
+	}
+	return ret;
+}
+
+static inline int spu_reacquire_runnable(struct spu_context *ctx, u32 *npc,
+				         u32 *status)
+{
+	int ret;
+
+	if ((ret = spu_run_fini(ctx, npc, status)) != 0)
+		return ret;
+	if (*status & (SPU_STATUS_STOPPED_BY_STOP |
+		       SPU_STATUS_STOPPED_BY_HALT)) {
+		return *status;
+	}
+	if ((ret = spu_run_init(ctx, npc, status)) != 0)
+		return ret;
+	return 0;
+}
+
+static inline int spu_process_events(struct spu_context *ctx)
+{
+	struct spu *spu = ctx->spu;
+	u64 pte_fault = MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED;
+	int ret = 0;
+
+	if (spu->dsisr & pte_fault)
+		ret = spu_irq_class_1_bottom(spu);
+	if (spu->class_0_pending)
+		ret = spu_irq_class_0_bottom(spu);
+	if (!ret && signal_pending(current))
+		ret = -ERESTARTSYS;
+	return ret;
+}
+
+long spufs_run_spu(struct file *file, struct spu_context *ctx,
+		   u32 * npc, u32 * status)
+{
+	int ret;
+
+	if ((ret = spu_run_init(ctx, npc, status)) != 0)
 		return ret;
 
-	ctx->ops->npc_write(ctx, *npc);
+	do {
+		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, status));
+		if (unlikely(ret))
+			break;
+		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
+			ret = spu_reacquire_runnable(ctx, npc, status);
+			if (ret) {
+				return ret;
+			}
+			continue;
+		}
+		ret = spu_process_events(ctx);
 
-	ret = spu_run(ctx->spu);
+	} while (!ret && !(*status & (SPU_STATUS_STOPPED_BY_STOP |
+				      SPU_STATUS_STOPPED_BY_HALT)));
 
+	ctx->ops->runcntl_stop(ctx);
+	ret = spu_run_fini(ctx, npc, status);
 	if (!ret)
-		ret = ctx->ops->status_read(ctx);
-
-	*npc = ctx->ops->npc_read(ctx);
-
-	spu_release(ctx);
+		ret = *status;
 	spu_yield(ctx);
+
 	return ret;
 }
 
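
Note the ordering in spu_stopped() above: ctx->state is tested before
ctx->spu is dereferenced.  The scheduler may have unbound the context
while the caller slept on ctx->stop_wq, in which case the context
reports itself as stopped and the run loop goes through
spu_reacquire_runnable() rather than touching an SPU that may already
belong to someone else.  A sketch of the invariant, not extra code:

	*stat = ctx->ops->status_read(ctx);	/* safe in either state */
	if (ctx->state != SPU_STATE_RUNNABLE)
		return 1;			/* preempted: treat as stopped */
	spu = ctx->spu;				/* only valid while RUNNABLE */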
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index 2e90cae..6881241 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -186,6 +186,21 @@
 	return ctx->spu->local_store;
 }
 
+static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
+{
+	eieio();
+	out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
+}
+
+static void spu_hw_runcntl_stop(struct spu_context *ctx)
+{
+	spin_lock_irq(&ctx->spu->register_lock);
+	out_be32(&ctx->spu->problem->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+	while (in_be32(&ctx->spu->problem->spu_status_R) & SPU_STATUS_RUNNING)
+		cpu_relax();
+	spin_unlock_irq(&ctx->spu->register_lock);
+}
+
 struct spu_context_ops spu_hw_ops = {
 	.mbox_read = spu_hw_mbox_read,
 	.mbox_stat_read = spu_hw_mbox_stat_read,
@@ -203,4 +218,6 @@
 	.npc_write = spu_hw_npc_write,
 	.status_read = spu_hw_status_read,
 	.get_ls = spu_hw_get_ls,
+	.runcntl_write = spu_hw_runcntl_write,
+	.runcntl_stop = spu_hw_runcntl_stop,
 };
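
Unlike its backing counterpart, the hardware runcntl_stop is
synchronous: it holds the SPU's register_lock with interrupts disabled
and spins until the status register shows the SPU halted, so the
context is quiescent when it returns.  A minimal sketch of the
contract a caller may rely on (illustrative only):

	ctx->ops->runcntl_stop(ctx);
	/* SPU_STATUS_RUNNING is now clear; spu_save() may proceed */
	BUG_ON(ctx->ops->status_read(ctx) & SPU_STATUS_RUNNING);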
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 5750270..e2f10b5 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -119,7 +119,8 @@
 	}
 }
 
-static void prio_wait(struct spu_runqueue *rq, u64 flags)
+static void prio_wait(struct spu_runqueue *rq, struct spu_context *ctx,
+		      u64 flags)
 {
 	int prio = current->prio;
 	wait_queue_head_t *wq = &rq->prio.waitq[prio];
@@ -130,9 +131,11 @@
 	prepare_to_wait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);
 	if (!signal_pending(current)) {
 		up(&rq->sem);
+		up_write(&ctx->state_sema);
 		pr_debug("%s: pid=%d prio=%d\n", __FUNCTION__,
 			 current->pid, current->prio);
 		schedule();
+		down_write(&ctx->state_sema);
 		down(&rq->sem);
 	}
 	finish_wait(wq, &wait);
@@ -173,7 +176,9 @@
 	mm_needs_global_tlbie(spu->mm);
 	spu->ibox_callback = spufs_ibox_callback;
 	spu->wbox_callback = spufs_wbox_callback;
+	spu->stop_callback = spufs_stop_callback;
 	mb();
+	spu_unmap_mappings(ctx);
 	spu_restore(&ctx->csa, spu);
 }
 
@@ -181,10 +186,12 @@
 {
 	pr_debug("%s: unbind pid=%d SPU=%d\n", __FUNCTION__,
 		 spu->pid, spu->number);
+	spu_unmap_mappings(ctx);
 	spu_save(&ctx->csa, spu);
 	ctx->state = SPU_STATE_SAVED;
 	spu->ibox_callback = NULL;
 	spu->wbox_callback = NULL;
+	spu->stop_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
 	spu->prio = MAX_PRIO;
@@ -196,37 +203,35 @@
 static struct spu *preempt_active(struct spu_runqueue *rq)
 {
 	struct list_head *p;
-	struct spu_context *ctx;
-	struct spu *spu;
+	struct spu *worst, *spu;
 
-	/* Future: implement real preemption.  For now just
-	 * boot a lower priority ctx that is in "detached"
-	 * state, i.e. on a processor but not currently in
-	 * spu_run().
-	 */
+	worst = list_entry(rq->active_list.next, struct spu, sched_list);
 	list_for_each(p, &rq->active_list) {
 		spu = list_entry(p, struct spu, sched_list);
-		if (current->prio < spu->prio) {
-			ctx = spu->ctx;
-			if (down_write_trylock(&ctx->state_sema)) {
-				if (ctx->state != SPU_STATE_RUNNABLE) {
-					up_write(&ctx->state_sema);
-					continue;
-				}
-				pr_debug("%s: booting pid=%d from SPU %d\n",
-					 __FUNCTION__, spu->pid, spu->number);
-				del_active(rq, spu);
-				up(&rq->sem);
-				unbind_context(spu, ctx);
-				up_write(&ctx->state_sema);
-				return spu;
-			}
+		if (spu->prio > worst->prio) {
+			worst = spu;
+		}
+	}
+	if (current->prio < worst->prio) {
+		struct spu_context *ctx = worst->ctx;
+
+		spu = worst;
+		if (down_write_trylock(&ctx->state_sema)) {
+			pr_debug("%s: booting pid=%d from SPU %d\n",
+				 __FUNCTION__, spu->pid, spu->number);
+			del_active(rq, spu);
+			up(&rq->sem);
+			wake_up_all(&ctx->stop_wq);
+			ctx->ops->runcntl_stop(ctx);
+			unbind_context(spu, ctx);
+			up_write(&ctx->state_sema);
+			return spu;
 		}
 	}
 	return NULL;
 }
 
-static struct spu *get_idle_spu(u64 flags)
+static struct spu *get_idle_spu(struct spu_context *ctx, u64 flags)
 {
 	struct spu_runqueue *rq;
 	struct spu *spu = NULL;
@@ -255,7 +260,7 @@
 				if ((spu = preempt_active(rq)) != NULL)
 					return spu;
 			}
-			prio_wait(rq, flags);
+			prio_wait(rq, ctx, flags);
 			if (signal_pending(current)) {
 				prio_wakeup(rq);
 				spu = NULL;
@@ -322,7 +327,7 @@
 
 	if (ctx->spu)
 		return 0;
-	spu = get_idle_spu(flags);
+	spu = get_idle_spu(ctx, flags);
 	if (!spu)
 		return (signal_pending(current)) ? -ERESTARTSYS : -EAGAIN;
 	bind_context(spu, ctx);
@@ -347,17 +352,19 @@
 void spu_yield(struct spu_context *ctx)
 {
 	struct spu *spu;
+	int need_yield = 0;
 
-	if (!down_write_trylock(&ctx->state_sema))
-		return;
+	down_write(&ctx->state_sema);
 	spu = ctx->spu;
-	if ((ctx->state == SPU_STATE_RUNNABLE) &&
-	    (sched_find_first_bit(spu->rq->prio.bitmap) <= current->prio)) {
+	if (spu && (sched_find_first_bit(spu->rq->prio.bitmap) < MAX_PRIO)) {
 		pr_debug("%s: yielding SPU %d\n", __FUNCTION__, spu->number);
 		spu_deactivate(ctx);
 		ctx->state = SPU_STATE_SAVED;
+		need_yield = 1;
 	}
 	up_write(&ctx->state_sema);
+	if (unlikely(need_yield))
+		yield();
 }
 
 int __init spu_sched_init(void)
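
Two scheduler details are worth calling out.  First, prio_wait() now
drops both locks around schedule(), so a task waiting for an SPU no
longer pins its context's state_sema while asleep; the locks are
released in reverse acquisition order and retaken in the usual order
(state_sema before rq->sem):

	up(&rq->sem);
	up_write(&ctx->state_sema);
	schedule();
	down_write(&ctx->state_sema);
	down(&rq->sem);

Second, preempt_active() no longer boots the first lower-priority
context it finds; it scans the whole active list for the worst
(numerically highest) priority victim, wakes ctx->stop_wq so the
victim's run loop notices the preemption, stops the SPU via
runcntl_stop, and only then unbinds the context.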
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 93c6a05..20f4e51 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -51,6 +51,7 @@
 	struct kref kref;
 	wait_queue_head_t ibox_wq;
 	wait_queue_head_t wbox_wq;
+	wait_queue_head_t stop_wq;
 	struct fasync_struct *ibox_fasync;
 	struct fasync_struct *wbox_fasync;
 	struct spu_context_ops *ops;
@@ -74,6 +75,8 @@
 	void (*npc_write) (struct spu_context * ctx, u32 data);
 	 u32(*status_read) (struct spu_context * ctx);
 	char*(*get_ls) (struct spu_context * ctx);
+	void (*runcntl_write) (struct spu_context * ctx, u32 data);
+	void (*runcntl_stop) (struct spu_context * ctx);
 };
 
 extern struct spu_context_ops spu_hw_ops;
@@ -99,6 +102,7 @@
 void destroy_spu_context(struct kref *kref);
 struct spu_context * get_spu_context(struct spu_context *ctx);
 int put_spu_context(struct spu_context *ctx);
+void spu_unmap_mappings(struct spu_context *ctx);
 
 void spu_forget(struct spu_context *ctx);
 void spu_acquire(struct spu_context *ctx);
@@ -118,5 +122,6 @@
 /* irq callback funcs. */
 void spufs_ibox_callback(struct spu *spu);
 void spufs_wbox_callback(struct spu *spu);
+void spufs_stop_callback(struct spu *spu);
 
 #endif