[PATCH] spufs: cooperative scheduler support
This adds a scheduler for SPUs to make it possible to use
more logical SPUs than physical ones are present in the
system.
Currently, there is no support for preempting a running
SPU thread, they have to leave the SPU by either triggering
an event on the SPU that causes it to return to the
owning thread or by sending a signal to it.
This patch also adds operations that enable accessing an SPU
in either runnable or saved state. We use an RW semaphore
to protect the state of the SPU from changing underneath
us, while we are holding it readable. In order to change
the state, it is acquired writeable and a context save
or restore is executed before downgrading the semaphore
to read-only.
From: Mark Nutter <mnutter@us.ibm.com>,
Uli Weigand <Ulrich.Weigand@de.ibm.com>
Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
new file mode 100644
index 0000000..c0d9d83
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -0,0 +1,419 @@
+/* sched.c - SPU scheduler.
+ *
+ * Copyright (C) IBM 2005
+ * Author: Mark Nutter <mnutter@us.ibm.com>
+ *
+ * SPU scheduler, based on Linux thread priority. For now use
+ * a simple "cooperative" yield model with no preemption. SPU
+ * scheduling will eventually be preemptive: When a thread with
+ * a higher static priority gets ready to run, then an active SPU
+ * context will be preempted and returned to the waitq.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG 1
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/completion.h>
+#include <linux/vmalloc.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/spu.h>
+#include <asm/spu_csa.h>
+#include "spufs.h"
+
+#define SPU_BITMAP_SIZE (((MAX_PRIO+BITS_PER_LONG)/BITS_PER_LONG)+1)
+struct spu_prio_array {
+ atomic_t nr_blocked;
+ unsigned long bitmap[SPU_BITMAP_SIZE];
+ wait_queue_head_t waitq[MAX_PRIO];
+};
+
+/* spu_runqueue - This is the main runqueue data structure for SPUs. */
+struct spu_runqueue {
+ struct semaphore sem;
+ unsigned long nr_active;
+ unsigned long nr_idle;
+ unsigned long nr_switches;
+ struct list_head active_list;
+ struct list_head idle_list;
+ struct spu_prio_array prio;
+};
+
+static struct spu_runqueue *spu_runqueues = NULL;
+
+static inline struct spu_runqueue *spu_rq(void)
+{
+ /* Future: make this a per-NODE array,
+ * and use cpu_to_node(smp_processor_id())
+ */
+ return spu_runqueues;
+}
+
+static inline struct spu *del_idle(struct spu_runqueue *rq)
+{
+ struct spu *spu;
+
+ BUG_ON(rq->nr_idle <= 0);
+ BUG_ON(list_empty(&rq->idle_list));
+ /* Future: Move SPU out of low-power SRI state. */
+ spu = list_entry(rq->idle_list.next, struct spu, sched_list);
+ list_del_init(&spu->sched_list);
+ rq->nr_idle--;
+ return spu;
+}
+
+static inline void del_active(struct spu_runqueue *rq, struct spu *spu)
+{
+ BUG_ON(rq->nr_active <= 0);
+ BUG_ON(list_empty(&rq->active_list));
+ list_del_init(&spu->sched_list);
+ rq->nr_active--;
+}
+
+static inline void add_idle(struct spu_runqueue *rq, struct spu *spu)
+{
+ /* Future: Put SPU into low-power SRI state. */
+ list_add_tail(&spu->sched_list, &rq->idle_list);
+ rq->nr_idle++;
+}
+
+static inline void add_active(struct spu_runqueue *rq, struct spu *spu)
+{
+ rq->nr_active++;
+ rq->nr_switches++;
+ list_add_tail(&spu->sched_list, &rq->active_list);
+}
+
+static void prio_wakeup(struct spu_runqueue *rq)
+{
+ if (atomic_read(&rq->prio.nr_blocked) && rq->nr_idle) {
+ int best = sched_find_first_bit(rq->prio.bitmap);
+ if (best < MAX_PRIO) {
+ wait_queue_head_t *wq = &rq->prio.waitq[best];
+ wake_up_interruptible_nr(wq, 1);
+ }
+ }
+}
+
+static void prio_wait(struct spu_runqueue *rq, u64 flags)
+{
+ int prio = current->prio;
+ wait_queue_head_t *wq = &rq->prio.waitq[prio];
+ DEFINE_WAIT(wait);
+
+ __set_bit(prio, rq->prio.bitmap);
+ atomic_inc(&rq->prio.nr_blocked);
+ prepare_to_wait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);
+ if (!signal_pending(current)) {
+ up(&rq->sem);
+ pr_debug("%s: pid=%d prio=%d\n", __FUNCTION__,
+ current->pid, current->prio);
+ schedule();
+ down(&rq->sem);
+ }
+ finish_wait(wq, &wait);
+ atomic_dec(&rq->prio.nr_blocked);
+ if (!waitqueue_active(wq))
+ __clear_bit(prio, rq->prio.bitmap);
+}
+
+static inline int is_best_prio(struct spu_runqueue *rq)
+{
+ int best_prio;
+
+ best_prio = sched_find_first_bit(rq->prio.bitmap);
+ return (current->prio < best_prio) ? 1 : 0;
+}
+
+static inline void mm_needs_global_tlbie(struct mm_struct *mm)
+{
+ /* Global TLBIE broadcast required with SPEs. */
+#if (NR_CPUS > 1)
+ __cpus_setall(&mm->cpu_vm_mask, NR_CPUS);
+#else
+ __cpus_setall(&mm->cpu_vm_mask, NR_CPUS+1); /* is this ok? */
+#endif
+}
+
+static inline void bind_context(struct spu *spu, struct spu_context *ctx)
+{
+ pr_debug("%s: pid=%d SPU=%d\n", __FUNCTION__, current->pid,
+ spu->number);
+ spu->ctx = ctx;
+ spu->flags = 0;
+ ctx->spu = spu;
+ ctx->ops = &spu_hw_ops;
+ spu->pid = current->pid;
+ spu->prio = current->prio;
+ spu->mm = ctx->owner;
+ mm_needs_global_tlbie(spu->mm);
+ spu->ibox_callback = spufs_ibox_callback;
+ spu->wbox_callback = spufs_wbox_callback;
+ mb();
+ spu_restore(&ctx->csa, spu);
+}
+
+static inline void unbind_context(struct spu *spu, struct spu_context *ctx)
+{
+ pr_debug("%s: unbind pid=%d SPU=%d\n", __FUNCTION__,
+ spu->pid, spu->number);
+ spu_save(&ctx->csa, spu);
+ ctx->state = SPU_STATE_SAVED;
+ spu->ibox_callback = NULL;
+ spu->wbox_callback = NULL;
+ spu->mm = NULL;
+ spu->pid = 0;
+ spu->prio = MAX_PRIO;
+ ctx->ops = &spu_backing_ops;
+ ctx->spu = NULL;
+ spu->ctx = NULL;
+}
+
+static struct spu *preempt_active(struct spu_runqueue *rq)
+{
+ struct list_head *p;
+ struct spu_context *ctx;
+ struct spu *spu;
+
+ /* Future: implement real preemption. For now just
+ * boot a lower priority ctx that is in "detached"
+ * state, i.e. on a processor but not currently in
+ * spu_run().
+ */
+ list_for_each(p, &rq->active_list) {
+ spu = list_entry(p, struct spu, sched_list);
+ if (current->prio < spu->prio) {
+ ctx = spu->ctx;
+ if (down_write_trylock(&ctx->state_sema)) {
+ if (ctx->state != SPU_STATE_RUNNABLE) {
+ up_write(&ctx->state_sema);
+ continue;
+ }
+ pr_debug("%s: booting pid=%d from SPU %d\n",
+ __FUNCTION__, spu->pid, spu->number);
+ del_active(rq, spu);
+ up(&rq->sem);
+ unbind_context(spu, ctx);
+ up_write(&ctx->state_sema);
+ return spu;
+ }
+ }
+ }
+ return NULL;
+}
+
+static struct spu *get_idle_spu(u64 flags)
+{
+ struct spu_runqueue *rq;
+ struct spu *spu = NULL;
+
+ rq = spu_rq();
+ down(&rq->sem);
+ for (;;) {
+ if (rq->nr_idle > 0) {
+ if (is_best_prio(rq)) {
+ /* Fall through. */
+ spu = del_idle(rq);
+ break;
+ } else {
+ prio_wakeup(rq);
+ up(&rq->sem);
+ yield();
+ if (signal_pending(current)) {
+ return NULL;
+ }
+ rq = spu_rq();
+ down(&rq->sem);
+ continue;
+ }
+ } else {
+ if (is_best_prio(rq)) {
+ if ((spu = preempt_active(rq)) != NULL)
+ return spu;
+ }
+ prio_wait(rq, flags);
+ if (signal_pending(current)) {
+ prio_wakeup(rq);
+ spu = NULL;
+ break;
+ }
+ continue;
+ }
+ }
+ up(&rq->sem);
+ return spu;
+}
+
+static void put_idle_spu(struct spu *spu)
+{
+ struct spu_runqueue *rq = spu->rq;
+
+ down(&rq->sem);
+ add_idle(rq, spu);
+ prio_wakeup(rq);
+ up(&rq->sem);
+}
+
+static int get_active_spu(struct spu *spu)
+{
+ struct spu_runqueue *rq = spu->rq;
+ struct list_head *p;
+ struct spu *tmp;
+ int rc = 0;
+
+ down(&rq->sem);
+ list_for_each(p, &rq->active_list) {
+ tmp = list_entry(p, struct spu, sched_list);
+ if (tmp == spu) {
+ del_active(rq, spu);
+ rc = 1;
+ break;
+ }
+ }
+ up(&rq->sem);
+ return rc;
+}
+
+static void put_active_spu(struct spu *spu)
+{
+ struct spu_runqueue *rq = spu->rq;
+
+ down(&rq->sem);
+ add_active(rq, spu);
+ up(&rq->sem);
+}
+
+/* Lock order:
+ * spu_activate() & spu_deactivate() require the
+ * caller to have down_write(&ctx->state_sema).
+ *
+ * The rq->sem is breifly held (inside or outside a
+ * given ctx lock) for list management, but is never
+ * held during save/restore.
+ */
+
+int spu_activate(struct spu_context *ctx, u64 flags)
+{
+ struct spu *spu;
+
+ if (ctx->spu)
+ return 0;
+ spu = get_idle_spu(flags);
+ if (!spu)
+ return (signal_pending(current)) ? -ERESTARTSYS : -EAGAIN;
+ bind_context(spu, ctx);
+ put_active_spu(spu);
+ return 0;
+}
+
+void spu_deactivate(struct spu_context *ctx)
+{
+ struct spu *spu;
+ int needs_idle;
+
+ spu = ctx->spu;
+ if (!spu)
+ return;
+ needs_idle = get_active_spu(spu);
+ unbind_context(spu, ctx);
+ if (needs_idle)
+ put_idle_spu(spu);
+}
+
+void spu_yield(struct spu_context *ctx)
+{
+ struct spu *spu;
+
+ if (!down_write_trylock(&ctx->state_sema))
+ return;
+ spu = ctx->spu;
+ if ((ctx->state == SPU_STATE_RUNNABLE) &&
+ (sched_find_first_bit(spu->rq->prio.bitmap) <= current->prio)) {
+ pr_debug("%s: yielding SPU %d\n", __FUNCTION__, spu->number);
+ spu_deactivate(ctx);
+ ctx->state = SPU_STATE_SAVED;
+ }
+ up_write(&ctx->state_sema);
+}
+
+int __init spu_sched_init(void)
+{
+ struct spu_runqueue *rq;
+ struct spu *spu;
+ int i;
+
+ rq = spu_runqueues = kmalloc(sizeof(struct spu_runqueue), GFP_KERNEL);
+ if (!rq) {
+ printk(KERN_WARNING "%s: Unable to allocate runqueues.\n",
+ __FUNCTION__);
+ return 1;
+ }
+ memset(rq, 0, sizeof(struct spu_runqueue));
+ init_MUTEX(&rq->sem);
+ INIT_LIST_HEAD(&rq->active_list);
+ INIT_LIST_HEAD(&rq->idle_list);
+ rq->nr_active = 0;
+ rq->nr_idle = 0;
+ rq->nr_switches = 0;
+ atomic_set(&rq->prio.nr_blocked, 0);
+ for (i = 0; i < MAX_PRIO; i++) {
+ init_waitqueue_head(&rq->prio.waitq[i]);
+ __clear_bit(i, rq->prio.bitmap);
+ }
+ __set_bit(MAX_PRIO, rq->prio.bitmap);
+ for (;;) {
+ spu = spu_alloc();
+ if (!spu)
+ break;
+ pr_debug("%s: adding SPU[%d]\n", __FUNCTION__, spu->number);
+ add_idle(rq, spu);
+ spu->rq = rq;
+ }
+ if (!rq->nr_idle) {
+ printk(KERN_WARNING "%s: No available SPUs.\n", __FUNCTION__);
+ kfree(rq);
+ return 1;
+ }
+ return 0;
+}
+
+void __exit spu_sched_exit(void)
+{
+ struct spu_runqueue *rq = spu_rq();
+ struct spu *spu;
+
+ if (!rq) {
+ printk(KERN_WARNING "%s: no runqueues!\n", __FUNCTION__);
+ return;
+ }
+ while (rq->nr_idle > 0) {
+ spu = del_idle(rq);
+ if (!spu)
+ break;
+ spu_free(spu);
+ }
+ kfree(rq);
+}