Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw
* git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw:
GFS2: Remove lock_kernel from gfs2_put_super()
GFS2: Add tracepoints
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt
index 0c78f4b..e77bebf 100644
--- a/Documentation/ide/ide.txt
+++ b/Documentation/ide/ide.txt
@@ -216,6 +216,8 @@
* "noflush=[interface_number.device_number]" to disable flush requests
+* "nohpa=[interface_number.device_number]" to disable Host Protected Area
+
* "noprobe=[interface_number.device_number]" to skip probing
* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7bcdebf..0bf8a88 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -887,11 +887,8 @@
ide-core.nodma= [HW] (E)IDE subsystem
Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
- .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom
- .chs .ignore_cable are additional options
- See Documentation/ide/ide.txt.
-
- idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
+ .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
+ .cdrom .chs .ignore_cable are additional options
See Documentation/ide/ide.txt.
ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e8..28c8cdf 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,6 +1,5 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
-LDLIBS:=-lz
+CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
all: lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0..9ebcd6e 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
+#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
@@ -59,7 +60,6 @@
/*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
-#define NET_PEERNUM 1
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -76,19 +76,12 @@
do { if (verbose) printf(args); } while(0)
/*:*/
-/* File descriptors for the Waker. */
-struct {
- int pipe[2];
- int lguest_fd;
-} waker_fds;
-
/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
-/* The pipe for signal hander to write to. */
-static int timeoutpipe[2];
-static unsigned int timeout_usec = 500;
+/* The /dev/lguest file descriptor. */
+static int lguest_fd;
/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;
@@ -96,11 +89,6 @@
/* This is our list of devices. */
struct device_list
{
- /* Summary information about the devices in our list: ready to pass to
- * select() to ask which need servicing.*/
- fd_set infds;
- int max_infd;
-
/* Counter to assign interrupt numbers. */
unsigned int next_irq;
@@ -126,22 +114,21 @@
/* The linked-list pointer. */
struct device *next;
- /* The this device's descriptor, as mapped into the Guest. */
+ /* The device's descriptor, as mapped into the Guest. */
struct lguest_device_desc *desc;
+ /* We can't trust desc values once Guest has booted: we use these. */
+ unsigned int feature_len;
+ unsigned int num_vq;
+
/* The name of this device, for --verbose. */
const char *name;
- /* If handle_input is set, it wants to be called when this file
- * descriptor is ready. */
- int fd;
- bool (*handle_input)(int fd, struct device *me);
-
/* Any queues attached to this device */
struct virtqueue *vq;
- /* Handle status being finalized (ie. feature bits stable). */
- void (*ready)(struct device *me);
+ /* Is it operational */
+ bool running;
/* Device-specific data. */
void *priv;
@@ -164,22 +151,28 @@
/* Last available index we saw. */
u16 last_avail_idx;
- /* The routine to call when the Guest pings us, or timeout. */
- void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
+ /* How many are used since we sent last irq? */
+ unsigned int pending_used;
- /* Outstanding buffers */
- unsigned int inflight;
+ /* Eventfd where Guest notifications arrive. */
+ int eventfd;
- /* Is this blocked awaiting a timer? */
- bool blocked;
+ /* Function for the thread which is servicing this virtqueue. */
+ void (*service)(struct virtqueue *vq);
+ pid_t thread;
};
/* Remember the arguments to the program so we can "reboot" */
static char **main_args;
-/* Since guest is UP and we don't run at the same time, we don't need barriers.
- * But I include them in the code in case others copy it. */
-#define wmb()
+/* The original tty settings to restore on exit. */
+static struct termios orig_term;
+
+/* We have to be careful with barriers: our devices are all run in separate
+ * threads and so we need to make sure that changes visible to the Guest happen
+ * in precise order. */
+#define wmb() __asm__ __volatile__("" : : : "memory")
+#define mb() __asm__ __volatile__("" : : : "memory")
/* Convert an iovec element to the given type.
*
@@ -245,7 +238,7 @@
static u8 *get_feature_bits(struct device *dev)
{
return (u8 *)(dev->desc + 1)
- + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+ + dev->num_vq * sizeof(struct lguest_vqconfig);
}
/*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -505,99 +498,19 @@
* saw the arguments it expects when we looked at initialize() in lguest_user.c:
* the base of Guest "physical" memory, the top physical page to allow and the
* entry point for the Guest. */
-static int tell_kernel(unsigned long start)
+static void tell_kernel(unsigned long start)
{
unsigned long args[] = { LHREQ_INITIALIZE,
(unsigned long)guest_base,
guest_limit / getpagesize(), start };
- int fd;
-
verbose("Guest: %p - %p (%#lx)\n",
guest_base, guest_base + guest_limit, guest_limit);
- fd = open_or_die("/dev/lguest", O_RDWR);
- if (write(fd, args, sizeof(args)) < 0)
+ lguest_fd = open_or_die("/dev/lguest", O_RDWR);
+ if (write(lguest_fd, args, sizeof(args)) < 0)
err(1, "Writing to /dev/lguest");
-
- /* We return the /dev/lguest file descriptor to control this Guest */
- return fd;
}
/*:*/
-static void add_device_fd(int fd)
-{
- FD_SET(fd, &devices.infds);
- if (fd > devices.max_infd)
- devices.max_infd = fd;
-}
-
-/*L:200
- * The Waker.
- *
- * With console, block and network devices, we can have lots of input which we
- * need to process. We could try to tell the kernel what file descriptors to
- * watch, but handing a file descriptor mask through to the kernel is fairly
- * icky.
- *
- * Instead, we clone off a thread which watches the file descriptors and writes
- * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
- * stop running the Guest. This causes the Launcher to return from the
- * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
- * the LHREQ_BREAK and wake us up again.
- *
- * This, of course, is merely a different *kind* of icky.
- *
- * Given my well-known antipathy to threads, I'd prefer to use processes. But
- * it's easier to share Guest memory with threads, and trivial to share the
- * devices.infds as the Launcher changes it.
- */
-static int waker(void *unused)
-{
- /* Close the write end of the pipe: only the Launcher has it open. */
- close(waker_fds.pipe[1]);
-
- for (;;) {
- fd_set rfds = devices.infds;
- unsigned long args[] = { LHREQ_BREAK, 1 };
- unsigned int maxfd = devices.max_infd;
-
- /* We also listen to the pipe from the Launcher. */
- FD_SET(waker_fds.pipe[0], &rfds);
- if (waker_fds.pipe[0] > maxfd)
- maxfd = waker_fds.pipe[0];
-
- /* Wait until input is ready from one of the devices. */
- select(maxfd+1, &rfds, NULL, NULL, NULL);
-
- /* Message from Launcher? */
- if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
- char c;
- /* If this fails, then assume Launcher has exited.
- * Don't do anything on exit: we're just a thread! */
- if (read(waker_fds.pipe[0], &c, 1) != 1)
- _exit(0);
- continue;
- }
-
- /* Send LHREQ_BREAK command to snap the Launcher out of it. */
- pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
- }
- return 0;
-}
-
-/* This routine just sets up a pipe to the Waker process. */
-static void setup_waker(int lguest_fd)
-{
- /* This pipe is closed when Launcher dies, telling Waker. */
- if (pipe(waker_fds.pipe) != 0)
- err(1, "Creating pipe for Waker");
-
- /* Waker also needs to know the lguest fd */
- waker_fds.lguest_fd = lguest_fd;
-
- if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
- err(1, "Creating Waker");
-}
-
/*
* Device Handling.
*
@@ -623,49 +536,90 @@
/* Each buffer in the virtqueues is actually a chain of descriptors. This
* function returns the next descriptor in the chain, or vq->vring.num if we're
* at the end. */
-static unsigned next_desc(struct virtqueue *vq, unsigned int i)
+static unsigned next_desc(struct vring_desc *desc,
+ unsigned int i, unsigned int max)
{
unsigned int next;
/* If this descriptor says it doesn't chain, we're done. */
- if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
- return vq->vring.num;
+ if (!(desc[i].flags & VRING_DESC_F_NEXT))
+ return max;
/* Check they're not leading us off end of descriptors. */
- next = vq->vring.desc[i].next;
+ next = desc[i].next;
/* Make sure compiler knows to grab that: we don't want it changing! */
wmb();
- if (next >= vq->vring.num)
+ if (next >= max)
errx(1, "Desc next is %u", next);
return next;
}
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(struct virtqueue *vq)
+{
+ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+ /* Don't inform them if nothing used. */
+ if (!vq->pending_used)
+ return;
+ vq->pending_used = 0;
+
+ /* If they don't want an interrupt, don't send one, unless empty. */
+ if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && lg_last_avail(vq) != vq->vring.avail->idx)
+ return;
+
+ /* Send the Guest an interrupt tell them we used something up. */
+ if (write(lguest_fd, buf, sizeof(buf)) != 0)
+ err(1, "Triggering irq %i", vq->config.irq);
+}
+
/* This looks in the virtqueue and for the first available buffer, and converts
* it to an iovec for convenient access. Since descriptors consist of some
* number of output then some number of input descriptors, it's actually two
* iovecs, but we pack them into one and note how many of each there were.
*
- * This function returns the descriptor number found, or vq->vring.num (which
- * is never a valid descriptor number) if none was found. */
-static unsigned get_vq_desc(struct virtqueue *vq,
- struct iovec iov[],
- unsigned int *out_num, unsigned int *in_num)
+ * This function returns the descriptor number found. */
+static unsigned wait_for_vq_desc(struct virtqueue *vq,
+ struct iovec iov[],
+ unsigned int *out_num, unsigned int *in_num)
{
- unsigned int i, head;
- u16 last_avail;
+ unsigned int i, head, max;
+ struct vring_desc *desc;
+ u16 last_avail = lg_last_avail(vq);
+
+ while (last_avail == vq->vring.avail->idx) {
+ u64 event;
+
+ /* OK, tell Guest about progress up to now. */
+ trigger_irq(vq);
+
+ /* OK, now we need to know about added descriptors. */
+ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ /* They could have slipped one in as we were doing that: make
+ * sure it's written, then check again. */
+ mb();
+ if (last_avail != vq->vring.avail->idx) {
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ break;
+ }
+
+ /* Nothing new? Wait for eventfd to tell us they refilled. */
+ if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
+ errx(1, "Event read failed?");
+
+ /* We don't need to be notified again. */
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ }
/* Check it isn't doing very strange things with descriptor numbers. */
- last_avail = lg_last_avail(vq);
if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
errx(1, "Guest moved used index from %u to %u",
last_avail, vq->vring.avail->idx);
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->vring.avail->idx == last_avail)
- return vq->vring.num;
-
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -678,15 +632,28 @@
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
+ max = vq->vring.num;
+ desc = vq->vring.desc;
i = head;
+
+ /* If this is an indirect entry, then this buffer contains a descriptor
+ * table which we handle as if it's any normal descriptor chain. */
+ if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+ if (desc[i].len % sizeof(struct vring_desc))
+ errx(1, "Invalid size for indirect buffer table");
+
+ max = desc[i].len / sizeof(struct vring_desc);
+ desc = check_pointer(desc[i].addr, desc[i].len);
+ i = 0;
+ }
+
do {
/* Grab the first descriptor, and check it's OK. */
- iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+ iov[*out_num + *in_num].iov_len = desc[i].len;
iov[*out_num + *in_num].iov_base
- = check_pointer(vq->vring.desc[i].addr,
- vq->vring.desc[i].len);
+ = check_pointer(desc[i].addr, desc[i].len);
/* If this is an input descriptor, increment that count. */
- if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+ if (desc[i].flags & VRING_DESC_F_WRITE)
(*in_num)++;
else {
/* If it's an output descriptor, they're all supposed
@@ -697,11 +664,10 @@
}
/* If we've got too many, that implies a descriptor loop. */
- if (*out_num + *in_num > vq->vring.num)
+ if (*out_num + *in_num > max)
errx(1, "Looped descriptor");
- } while ((i = next_desc(vq, i)) != vq->vring.num);
+ } while ((i = next_desc(desc, i, max)) != max);
- vq->inflight++;
return head;
}
@@ -719,44 +685,20 @@
/* Make sure buffer is written before we update index. */
wmb();
vq->vring.used->idx++;
- vq->inflight--;
-}
-
-/* This actually sends the interrupt for this virtqueue */
-static void trigger_irq(int fd, struct virtqueue *vq)
-{
- unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
-
- /* If they don't want an interrupt, don't send one, unless empty. */
- if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && vq->inflight)
- return;
-
- /* Send the Guest an interrupt tell them we used something up. */
- if (write(fd, buf, sizeof(buf)) != 0)
- err(1, "Triggering irq %i", vq->config.irq);
+ vq->pending_used++;
}
/* And here's the combo meal deal. Supersize me! */
-static void add_used_and_trigger(int fd, struct virtqueue *vq,
- unsigned int head, int len)
+static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
add_used(vq, head, len);
- trigger_irq(fd, vq);
+ trigger_irq(vq);
}
/*
* The Console
*
- * Here is the input terminal setting we save, and the routine to restore them
- * on exit so the user gets their terminal back. */
-static struct termios orig_term;
-static void restore_term(void)
-{
- tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/* We associate some data with the console for our exit hack. */
+ * We associate some data with the console for our exit hack. */
struct console_abort
{
/* How many times have they hit ^C? */
@@ -766,276 +708,275 @@
};
/* This is the routine which handles console input (ie. stdin). */
-static bool handle_console_input(int fd, struct device *dev)
+static void console_input(struct virtqueue *vq)
{
int len;
unsigned int head, in_num, out_num;
- struct iovec iov[dev->vq->vring.num];
- struct console_abort *abort = dev->priv;
+ struct console_abort *abort = vq->dev->priv;
+ struct iovec iov[vq->vring.num];
- /* First we need a console buffer from the Guests's input virtqueue. */
- head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-
- /* If they're not ready for input, stop listening to this file
- * descriptor. We'll start again once they add an input buffer. */
- if (head == dev->vq->vring.num)
- return false;
-
+ /* Make sure there's a descriptor waiting. */
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
if (out_num)
errx(1, "Output buffers in console in queue?");
- /* This is why we convert to iovecs: the readv() call uses them, and so
- * it reads straight into the Guest's buffer. */
- len = readv(dev->fd, iov, in_num);
+ /* Read it in. */
+ len = readv(STDIN_FILENO, iov, in_num);
if (len <= 0) {
- /* This implies that the console is closed, is /dev/null, or
- * something went terribly wrong. */
+ /* Ran out of input? */
warnx("Failed to get console input, ignoring console.");
- /* Put the input terminal back. */
- restore_term();
- /* Remove callback from input vq, so it doesn't restart us. */
- dev->vq->handle_output = NULL;
- /* Stop listening to this fd: don't call us again. */
- return false;
+ /* For simplicity, dying threads kill the whole Launcher. So
+ * just nap here. */
+ for (;;)
+ pause();
}
- /* Tell the Guest about the new input. */
- add_used_and_trigger(fd, dev->vq, head, len);
+ add_used_and_trigger(vq, head, len);
/* Three ^C within one second? Exit.
*
- * This is such a hack, but works surprisingly well. Each ^C has to be
- * in a buffer by itself, so they can't be too fast. But we check that
- * we get three within about a second, so they can't be too slow. */
- if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
- if (!abort->count++)
- gettimeofday(&abort->start, NULL);
- else if (abort->count == 3) {
- struct timeval now;
- gettimeofday(&now, NULL);
- if (now.tv_sec <= abort->start.tv_sec+1) {
- unsigned long args[] = { LHREQ_BREAK, 0 };
- /* Close the fd so Waker will know it has to
- * exit. */
- close(waker_fds.pipe[1]);
- /* Just in case Waker is blocked in BREAK, send
- * unbreak now. */
- write(fd, args, sizeof(args));
- exit(2);
- }
- abort->count = 0;
- }
- } else
- /* Any other key resets the abort counter. */
+ * This is such a hack, but works surprisingly well. Each ^C has to
+ * be in a buffer by itself, so they can't be too fast. But we check
+ * that we get three within about a second, so they can't be too
+ * slow. */
+ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
abort->count = 0;
+ return;
+ }
- /* Everything went OK! */
- return true;
-}
-
-/* Handling output for console is simple: we just get all the output buffers
- * and write them to stdout. */
-static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
-{
- unsigned int head, out, in;
- int len;
- struct iovec iov[vq->vring.num];
-
- /* Keep getting output buffers from the Guest until we run out. */
- while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
- if (in)
- errx(1, "Input buffers in output queue?");
- len = writev(STDOUT_FILENO, iov, out);
- add_used_and_trigger(fd, vq, head, len);
+ abort->count++;
+ if (abort->count == 1)
+ gettimeofday(&abort->start, NULL);
+ else if (abort->count == 3) {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ /* Kill all Launcher processes with SIGINT, like normal ^C */
+ if (now.tv_sec <= abort->start.tv_sec+1)
+ kill(0, SIGINT);
+ abort->count = 0;
}
}
-/* This is called when we no longer want to hear about Guest changes to a
- * virtqueue. This is more efficient in high-traffic cases, but it means we
- * have to set a timer to check if any more changes have occurred. */
-static void block_vq(struct virtqueue *vq)
+/* This is the routine which handles console output (ie. stdout). */
+static void console_output(struct virtqueue *vq)
{
- struct itimerval itm;
+ unsigned int head, out, in;
+ struct iovec iov[vq->vring.num];
- vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
- vq->blocked = true;
-
- itm.it_interval.tv_sec = 0;
- itm.it_interval.tv_usec = 0;
- itm.it_value.tv_sec = 0;
- itm.it_value.tv_usec = timeout_usec;
-
- setitimer(ITIMER_REAL, &itm, NULL);
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (in)
+ errx(1, "Input buffers in console output queue?");
+ while (!iov_empty(iov, out)) {
+ int len = writev(STDOUT_FILENO, iov, out);
+ if (len <= 0)
+ err(1, "Write to stdout gave %i", len);
+ iov_consume(iov, out, len);
+ }
+ add_used(vq, head, 0);
}
/*
* The Network
*
* Handling output for network is also simple: we get all the output buffers
- * and write them (ignoring the first element) to this device's file descriptor
- * (/dev/net/tun).
+ * and write them to /dev/net/tun.
*/
-static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
+struct net_info {
+ int tunfd;
+};
+
+static void net_output(struct virtqueue *vq)
{
- unsigned int head, out, in, num = 0;
- int len;
+ struct net_info *net_info = vq->dev->priv;
+ unsigned int head, out, in;
struct iovec iov[vq->vring.num];
- static int last_timeout_num;
- /* Keep getting output buffers from the Guest until we run out. */
- while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
- if (in)
- errx(1, "Input buffers in output queue?");
- len = writev(vq->dev->fd, iov, out);
- if (len < 0)
- err(1, "Writing network packet to tun");
- add_used_and_trigger(fd, vq, head, len);
- num++;
- }
-
- /* Block further kicks and set up a timer if we saw anything. */
- if (!timeout && num)
- block_vq(vq);
-
- /* We never quite know how long should we wait before we check the
- * queue again for more packets. We start at 500 microseconds, and if
- * we get fewer packets than last time, we assume we made the timeout
- * too small and increase it by 10 microseconds. Otherwise, we drop it
- * by one microsecond every time. It seems to work well enough. */
- if (timeout) {
- if (num < last_timeout_num)
- timeout_usec += 10;
- else if (timeout_usec > 1)
- timeout_usec--;
- last_timeout_num = num;
- }
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (in)
+ errx(1, "Input buffers in net output queue?");
+ if (writev(net_info->tunfd, iov, out) < 0)
+ errx(1, "Write to tun failed?");
+ add_used(vq, head, 0);
}
-/* This is where we handle a packet coming in from the tun device to our
+/* Will reading from this file descriptor block? */
+static bool will_block(int fd)
+{
+ fd_set fdset;
+ struct timeval zero = { 0, 0 };
+ FD_ZERO(&fdset);
+ FD_SET(fd, &fdset);
+ return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
+}
+
+/* This is where we handle packets coming in from the tun device to our
* Guest. */
-static bool handle_tun_input(int fd, struct device *dev)
+static void net_input(struct virtqueue *vq)
{
- unsigned int head, in_num, out_num;
int len;
- struct iovec iov[dev->vq->vring.num];
+ unsigned int head, out, in;
+ struct iovec iov[vq->vring.num];
+ struct net_info *net_info = vq->dev->priv;
- /* First we need a network buffer from the Guests's recv virtqueue. */
- head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
- if (head == dev->vq->vring.num) {
- /* Now, it's expected that if we try to send a packet too
- * early, the Guest won't be ready yet. Wait until the device
- * status says it's ready. */
- /* FIXME: Actually want DRIVER_ACTIVE here. */
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (out)
+ errx(1, "Output buffers in net input queue?");
- /* Now tell it we want to know if new things appear. */
- dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
- wmb();
+ /* Deliver interrupt now, since we're about to sleep. */
+ if (vq->pending_used && will_block(net_info->tunfd))
+ trigger_irq(vq);
- /* We'll turn this back on if input buffers are registered. */
- return false;
- } else if (out_num)
- errx(1, "Output buffers in network recv queue?");
-
- /* Read the packet from the device directly into the Guest's buffer. */
- len = readv(dev->fd, iov, in_num);
+ len = readv(net_info->tunfd, iov, in);
if (len <= 0)
- err(1, "reading network");
-
- /* Tell the Guest about the new packet. */
- add_used_and_trigger(fd, dev->vq, head, len);
-
- verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
- ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
- head != dev->vq->vring.num ? "sent" : "discarded");
-
- /* All good. */
- return true;
+ err(1, "Failed to read from tun.");
+ add_used(vq, head, len);
}
-/*L:215 This is the callback attached to the network and console input
- * virtqueues: it ensures we try again, in case we stopped console or net
- * delivery because Guest didn't have any buffers. */
-static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
+/* This is the helper to create threads. */
+static int do_thread(void *_vq)
{
- add_device_fd(vq->dev->fd);
- /* Snap the Waker out of its select loop. */
- write(waker_fds.pipe[1], "", 1);
+ struct virtqueue *vq = _vq;
+
+ for (;;)
+ vq->service(vq);
+ return 0;
}
-static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
+/* When a child dies, we kill our entire process group with SIGTERM. This
+ * also has the side effect that the shell restores the console for us! */
+static void kill_launcher(int signal)
{
- /* We don't need to know again when Guest refills receive buffer. */
- vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
- enable_fd(fd, vq, timeout);
+ kill(0, SIGTERM);
+}
+
+static void reset_device(struct device *dev)
+{
+ struct virtqueue *vq;
+
+ verbose("Resetting device %s\n", dev->name);
+
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
+
+ /* We're going to be explicitly killing threads, so ignore them. */
+ signal(SIGCHLD, SIG_IGN);
+
+ /* Zero out the virtqueues, get rid of their threads */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ if (vq->thread != (pid_t)-1) {
+ kill(vq->thread, SIGTERM);
+ waitpid(vq->thread, NULL, 0);
+ vq->thread = (pid_t)-1;
+ }
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, LGUEST_VRING_ALIGN));
+ lg_last_avail(vq) = 0;
+ }
+ dev->running = false;
+
+ /* Now we care if threads die. */
+ signal(SIGCHLD, (void *)kill_launcher);
+}
+
+static void create_thread(struct virtqueue *vq)
+{
+ /* Create stack for thread and run it. Since stack grows
+ * upwards, we point the stack pointer to the end of this
+ * region. */
+ char *stack = malloc(32768);
+ unsigned long args[] = { LHREQ_EVENTFD,
+ vq->config.pfn*getpagesize(), 0 };
+
+ /* Create a zero-initialized eventfd. */
+ vq->eventfd = eventfd(0, 0);
+ if (vq->eventfd < 0)
+ err(1, "Creating eventfd");
+ args[2] = vq->eventfd;
+
+ /* Attach an eventfd to this virtqueue: it will go off
+ * when the Guest does an LHCALL_NOTIFY for this vq. */
+ if (write(lguest_fd, &args, sizeof(args)) != 0)
+ err(1, "Attaching eventfd");
+
+ /* CLONE_VM: because it has to access the Guest memory, and
+ * SIGCHLD so we get a signal if it dies. */
+ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
+ if (vq->thread == (pid_t)-1)
+ err(1, "Creating clone");
+ /* We close our local copy, now the child has it. */
+ close(vq->eventfd);
+}
+
+static void start_device(struct device *dev)
+{
+ unsigned int i;
+ struct virtqueue *vq;
+
+ verbose("Device %s OK: offered", dev->name);
+ for (i = 0; i < dev->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)[i]);
+ verbose(", accepted");
+ for (i = 0; i < dev->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)
+ [dev->feature_len+i]);
+
+ for (vq = dev->vq; vq; vq = vq->next) {
+ if (vq->service)
+ create_thread(vq);
+ }
+ dev->running = true;
+}
+
+static void cleanup_devices(void)
+{
+ struct device *dev;
+
+ for (dev = devices.dev; dev; dev = dev->next)
+ reset_device(dev);
+
+ /* If we saved off the original terminal settings, restore them now. */
+ if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
+ tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
/* When the Guest tells us they updated the status field, we handle it. */
static void update_device_status(struct device *dev)
{
- struct virtqueue *vq;
-
- /* This is a reset. */
- if (dev->desc->status == 0) {
- verbose("Resetting device %s\n", dev->name);
-
- /* Clear any features they've acked. */
- memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
- dev->desc->feature_len);
-
- /* Zero out the virtqueues. */
- for (vq = dev->vq; vq; vq = vq->next) {
- memset(vq->vring.desc, 0,
- vring_size(vq->config.num, LGUEST_VRING_ALIGN));
- lg_last_avail(vq) = 0;
- }
- } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+ /* A zero status is a reset, otherwise it's a set of flags. */
+ if (dev->desc->status == 0)
+ reset_device(dev);
+ else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
warnx("Device %s configuration FAILED", dev->name);
+ if (dev->running)
+ reset_device(dev);
} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
- unsigned int i;
-
- verbose("Device %s OK: offered", dev->name);
- for (i = 0; i < dev->desc->feature_len; i++)
- verbose(" %02x", get_feature_bits(dev)[i]);
- verbose(", accepted");
- for (i = 0; i < dev->desc->feature_len; i++)
- verbose(" %02x", get_feature_bits(dev)
- [dev->desc->feature_len+i]);
-
- if (dev->ready)
- dev->ready(dev);
+ if (!dev->running)
+ start_device(dev);
}
}
/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
-static void handle_output(int fd, unsigned long addr)
+static void handle_output(unsigned long addr)
{
struct device *i;
- struct virtqueue *vq;
- /* Check each device and virtqueue. */
+ /* Check each device. */
for (i = devices.dev; i; i = i->next) {
+ struct virtqueue *vq;
+
/* Notifications to device descriptors update device status. */
if (from_guest_phys(addr) == i->desc) {
update_device_status(i);
return;
}
- /* Notifications to virtqueues mean output has occurred. */
+ /* Devices *can* be used before status is set to DRIVER_OK. */
for (vq = i->vq; vq; vq = vq->next) {
- if (vq->config.pfn != addr/getpagesize())
+ if (addr != vq->config.pfn*getpagesize())
continue;
-
- /* Guest should acknowledge (and set features!) before
- * using the device. */
- if (i->desc->status == 0) {
- warnx("%s gave early output", i->name);
- return;
- }
-
- if (strcmp(vq->dev->name, "console") != 0)
- verbose("Output to %s\n", vq->dev->name);
- if (vq->handle_output)
- vq->handle_output(fd, vq, false);
+ if (i->running)
+ errx(1, "Notification on running %s", i->name);
+ start_device(i);
return;
}
}
@@ -1049,71 +990,6 @@
strnlen(from_guest_phys(addr), guest_limit - addr));
}
-static void handle_timeout(int fd)
-{
- char buf[32];
- struct device *i;
- struct virtqueue *vq;
-
- /* Clear the pipe */
- read(timeoutpipe[0], buf, sizeof(buf));
-
- /* Check each device and virtqueue: flush blocked ones. */
- for (i = devices.dev; i; i = i->next) {
- for (vq = i->vq; vq; vq = vq->next) {
- if (!vq->blocked)
- continue;
-
- vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
- vq->blocked = false;
- if (vq->handle_output)
- vq->handle_output(fd, vq, true);
- }
- }
-}
-
-/* This is called when the Waker wakes us up: check for incoming file
- * descriptors. */
-static void handle_input(int fd)
-{
- /* select() wants a zeroed timeval to mean "don't wait". */
- struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
-
- for (;;) {
- struct device *i;
- fd_set fds = devices.infds;
- int num;
-
- num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
- /* Could get interrupted */
- if (num < 0)
- continue;
- /* If nothing is ready, we're done. */
- if (num == 0)
- break;
-
- /* Otherwise, call the device(s) which have readable file
- * descriptors and a method of handling them. */
- for (i = devices.dev; i; i = i->next) {
- if (i->handle_input && FD_ISSET(i->fd, &fds)) {
- if (i->handle_input(fd, i))
- continue;
-
- /* If handle_input() returns false, it means we
- * should no longer service it. Networking and
- * console do this when there's no input
- * buffers to deliver into. Console also uses
- * it when it discovers that stdin is closed. */
- FD_CLR(i->fd, &devices.infds);
- }
- }
-
- /* Is this the timeout fd? */
- if (FD_ISSET(timeoutpipe[0], &fds))
- handle_timeout(fd);
- }
-}
-
/*L:190
* Device Setup
*
@@ -1129,8 +1005,8 @@
static u8 *device_config(const struct device *dev)
{
return (void *)(dev->desc + 1)
- + dev->desc->num_vq * sizeof(struct lguest_vqconfig)
- + dev->desc->feature_len * 2;
+ + dev->num_vq * sizeof(struct lguest_vqconfig)
+ + dev->feature_len * 2;
}
/* This routine allocates a new "struct lguest_device_desc" from descriptor
@@ -1159,7 +1035,7 @@
/* Each device descriptor is followed by the description of its virtqueues. We
* specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs,
- void (*handle_output)(int, struct virtqueue *, bool))
+ void (*service)(struct virtqueue *))
{
unsigned int pages;
struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1174,8 +1050,8 @@
vq->next = NULL;
vq->last_avail_idx = 0;
vq->dev = dev;
- vq->inflight = 0;
- vq->blocked = false;
+ vq->service = service;
+ vq->thread = (pid_t)-1;
/* Initialize the configuration. */
vq->config.num = num_descs;
@@ -1191,6 +1067,7 @@
* yet, otherwise we'd be overwriting them. */
assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+ dev->num_vq++;
dev->desc->num_vq++;
verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1199,15 +1076,6 @@
* second. */
for (i = &dev->vq; *i; i = &(*i)->next);
*i = vq;
-
- /* Set the routine to call when the Guest does something to this
- * virtqueue. */
- vq->handle_output = handle_output;
-
- /* As an optimization, set the advisory "Don't Notify Me" flag if we
- * don't have a handler */
- if (!handle_output)
- vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
}
/* The first half of the feature bitmask is for us to advertise features. The
@@ -1219,7 +1087,7 @@
/* We can't extend the feature bits once we've added config bytes */
if (dev->desc->feature_len <= bit / CHAR_BIT) {
assert(dev->desc->config_len == 0);
- dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+ dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
}
features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1243,22 +1111,17 @@
* calling new_dev_desc() to allocate the descriptor and device memory.
*
* See what I mean about userspace being boring? */
-static struct device *new_device(const char *name, u16 type, int fd,
- bool (*handle_input)(int, struct device *))
+static struct device *new_device(const char *name, u16 type)
{
struct device *dev = malloc(sizeof(*dev));
/* Now we populate the fields one at a time. */
- dev->fd = fd;
- /* If we have an input handler for this file descriptor, then we add it
- * to the device_list's fdset and maxfd. */
- if (handle_input)
- add_device_fd(dev->fd);
dev->desc = new_dev_desc(type);
- dev->handle_input = handle_input;
dev->name = name;
dev->vq = NULL;
- dev->ready = NULL;
+ dev->feature_len = 0;
+ dev->num_vq = 0;
+ dev->running = false;
/* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus
@@ -1286,13 +1149,10 @@
* raw input stream to the Guest. */
term.c_lflag &= ~(ISIG|ICANON|ECHO);
tcsetattr(STDIN_FILENO, TCSANOW, &term);
- /* If we exit gracefully, the original settings will be
- * restored so the user can see what they're typing. */
- atexit(restore_term);
}
- dev = new_device("console", VIRTIO_ID_CONSOLE,
- STDIN_FILENO, handle_console_input);
+ dev = new_device("console", VIRTIO_ID_CONSOLE);
+
/* We store the console state in dev->priv, and initialize it. */
dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0;
@@ -1301,31 +1161,13 @@
* they put something the input queue, we make sure we're listening to
* stdin. When they put something in the output queue, we write it to
* stdout. */
- add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
- add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+ add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
+ add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
- verbose("device %u: console\n", devices.device_num++);
+ verbose("device %u: console\n", ++devices.device_num);
}
/*:*/
-static void timeout_alarm(int sig)
-{
- write(timeoutpipe[1], "", 1);
-}
-
-static void setup_timeout(void)
-{
- if (pipe(timeoutpipe) != 0)
- err(1, "Creating timeout pipe");
-
- if (fcntl(timeoutpipe[1], F_SETFL,
- fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
- err(1, "Making timeout pipe nonblocking");
-
- add_device_fd(timeoutpipe[0]);
- signal(SIGALRM, timeout_alarm);
-}
-
/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
* --sharenet=<name> option which opens or creates a named pipe. This can be
* used to send packets to another guest in a 1:1 manner.
@@ -1447,21 +1289,23 @@
static void setup_tun_net(char *arg)
{
struct device *dev;
- int netfd, ipfd;
+ struct net_info *net_info = malloc(sizeof(*net_info));
+ int ipfd;
u32 ip = INADDR_ANY;
bool bridging = false;
char tapif[IFNAMSIZ], *p;
struct virtio_net_config conf;
- netfd = get_tun_device(tapif);
+ net_info->tunfd = get_tun_device(tapif);
/* First we create a new network device. */
- dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+ dev = new_device("net", VIRTIO_ID_NET);
+ dev->priv = net_info;
/* Network devices need a receive and a send queue, just like
* console. */
- add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
- add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
/* We need a socket to perform the magic network ioctls to bring up the
* tap interface, connect to the bridge etc. Any socket will do! */
@@ -1502,6 +1346,8 @@
add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+ /* We handle indirect ring entries */
+ add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
set_config(dev, sizeof(conf), &conf);
/* We don't need the socket any more; setup is done. */
@@ -1550,20 +1396,18 @@
* Remember that the block device is handled by a separate I/O thread. We head
* straight into the core of that thread here:
*/
-static bool service_io(struct device *dev)
+static void blk_request(struct virtqueue *vq)
{
- struct vblk_info *vblk = dev->priv;
+ struct vblk_info *vblk = vq->dev->priv;
unsigned int head, out_num, in_num, wlen;
int ret;
u8 *in;
struct virtio_blk_outhdr *out;
- struct iovec iov[dev->vq->vring.num];
+ struct iovec iov[vq->vring.num];
off64_t off;
- /* See if there's a request waiting. If not, nothing to do. */
- head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
- if (head == dev->vq->vring.num)
- return false;
+ /* Get the next request. */
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
/* Every block request should contain at least one output buffer
* (detailing the location on disk and the type of request) and one
@@ -1637,83 +1481,21 @@
if (out->type & VIRTIO_BLK_T_BARRIER)
fdatasync(vblk->fd);
- /* We can't trigger an IRQ, because we're not the Launcher. It does
- * that when we tell it we're done. */
- add_used(dev->vq, head, wlen);
- return true;
-}
-
-/* This is the thread which actually services the I/O. */
-static int io_thread(void *_dev)
-{
- struct device *dev = _dev;
- struct vblk_info *vblk = dev->priv;
- char c;
-
- /* Close other side of workpipe so we get 0 read when main dies. */
- close(vblk->workpipe[1]);
- /* Close the other side of the done_fd pipe. */
- close(dev->fd);
-
- /* When this read fails, it means Launcher died, so we follow. */
- while (read(vblk->workpipe[0], &c, 1) == 1) {
- /* We acknowledge each request immediately to reduce latency,
- * rather than waiting until we've done them all. I haven't
- * measured to see if it makes any difference.
- *
- * That would be an interesting test, wouldn't it? You could
- * also try having more than one I/O thread. */
- while (service_io(dev))
- write(vblk->done_fd, &c, 1);
- }
- return 0;
-}
-
-/* Now we've seen the I/O thread, we return to the Launcher to see what happens
- * when that thread tells us it's completed some I/O. */
-static bool handle_io_finish(int fd, struct device *dev)
-{
- char c;
-
- /* If the I/O thread died, presumably it printed the error, so we
- * simply exit. */
- if (read(dev->fd, &c, 1) != 1)
- exit(1);
-
- /* It did some work, so trigger the irq. */
- trigger_irq(fd, dev->vq);
- return true;
-}
-
-/* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
-{
- struct vblk_info *vblk = vq->dev->priv;
- char c = 0;
-
- /* Wake up I/O thread and tell it to go to work! */
- if (write(vblk->workpipe[1], &c, 1) != 1)
- /* Presumably it indicated why it died. */
- exit(1);
+ add_used(vq, head, wlen);
}
/*L:198 This actually sets up a virtual block device. */
static void setup_block_file(const char *filename)
{
- int p[2];
struct device *dev;
struct vblk_info *vblk;
- void *stack;
struct virtio_blk_config conf;
- /* This is the pipe the I/O thread will use to tell us I/O is done. */
- pipe(p);
-
/* The device responds to return from I/O thread. */
- dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+ dev = new_device("block", VIRTIO_ID_BLOCK);
/* The device has one virtqueue, where the Guest places requests. */
- add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+ add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
/* Allocate the room for our own bookkeeping */
vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1735,49 +1517,29 @@
set_config(dev, sizeof(conf), &conf);
- /* The I/O thread writes to this end of the pipe when done. */
- vblk->done_fd = p[1];
-
- /* This is the second pipe, which is how we tell the I/O thread about
- * more work. */
- pipe(vblk->workpipe);
-
- /* Create stack for thread and run it. Since stack grows upwards, we
- * point the stack pointer to the end of this region. */
- stack = malloc(32768);
- /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
- * becoming a zombie. */
- if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
- err(1, "Creating clone");
-
- /* We don't need to keep the I/O thread's end of the pipes open. */
- close(vblk->done_fd);
- close(vblk->workpipe[0]);
-
verbose("device %u: virtblock %llu sectors\n",
- devices.device_num, le64_to_cpu(conf.capacity));
+ ++devices.device_num, le64_to_cpu(conf.capacity));
}
+struct rng_info {
+ int rfd;
+};
+
/* Our random number generator device reads from /dev/random into the Guest's
* input buffers. The usual case is that the Guest doesn't want random numbers
* and so has no buffers although /dev/random is still readable, whereas
* console is the reverse.
*
* The same logic applies, however. */
-static bool handle_rng_input(int fd, struct device *dev)
+static void rng_input(struct virtqueue *vq)
{
int len;
unsigned int head, in_num, out_num, totlen = 0;
- struct iovec iov[dev->vq->vring.num];
+ struct rng_info *rng_info = vq->dev->priv;
+ struct iovec iov[vq->vring.num];
/* First we need a buffer from the Guests's virtqueue. */
- head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-
- /* If they're not ready for input, stop listening to this file
- * descriptor. We'll start again once they add an input buffer. */
- if (head == dev->vq->vring.num)
- return false;
-
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
if (out_num)
errx(1, "Output buffers in rng?");
@@ -1785,7 +1547,7 @@
* it reads straight into the Guest's buffer. We loop to make sure we
* fill it. */
while (!iov_empty(iov, in_num)) {
- len = readv(dev->fd, iov, in_num);
+ len = readv(rng_info->rfd, iov, in_num);
if (len <= 0)
err(1, "Read from /dev/random gave %i", len);
iov_consume(iov, in_num, len);
@@ -1793,25 +1555,23 @@
}
/* Tell the Guest about the new input. */
- add_used_and_trigger(fd, dev->vq, head, totlen);
-
- /* Everything went OK! */
- return true;
+ add_used(vq, head, totlen);
}
/* And this creates a "hardware" random number device for the Guest. */
static void setup_rng(void)
{
struct device *dev;
- int fd;
+ struct rng_info *rng_info = malloc(sizeof(*rng_info));
- fd = open_or_die("/dev/random", O_RDONLY);
+ rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
/* The device responds to return from I/O thread. */
- dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input);
+ dev = new_device("rng", VIRTIO_ID_RNG);
+ dev->priv = rng_info;
/* The device has one virtqueue, where the Guest places inbufs. */
- add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
verbose("device %u: rng\n", devices.device_num++);
}
@@ -1827,17 +1587,18 @@
for (i = 3; i < FD_SETSIZE; i++)
close(i);
- /* The exec automatically gets rid of the I/O and Waker threads. */
+ /* Reset all the devices (kills all threads). */
+ cleanup_devices();
+
execv(main_args[0], main_args);
err(1, "Could not exec %s", main_args[0]);
}
/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
-static void __attribute__((noreturn)) run_guest(int lguest_fd)
+static void __attribute__((noreturn)) run_guest(void)
{
for (;;) {
- unsigned long args[] = { LHREQ_BREAK, 0 };
unsigned long notify_addr;
int readval;
@@ -1848,8 +1609,7 @@
/* One unsigned long means the Guest did HCALL_NOTIFY */
if (readval == sizeof(notify_addr)) {
verbose("Notify on address %#lx\n", notify_addr);
- handle_output(lguest_fd, notify_addr);
- continue;
+ handle_output(notify_addr);
/* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) {
char reason[1024] = { 0 };
@@ -1858,19 +1618,9 @@
/* ERESTART means that we need to reboot the guest */
} else if (errno == ERESTART) {
restart_guest();
- /* EAGAIN means a signal (timeout).
- * Anything else means a bug or incompatible change. */
- } else if (errno != EAGAIN)
+ /* Anything else means a bug or incompatible change. */
+ } else
err(1, "Running guest failed");
-
- /* Only service input on thread for CPU 0. */
- if (cpu_id != 0)
- continue;
-
- /* Service input, then unset the BREAK to release the Waker. */
- handle_input(lguest_fd);
- if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
- err(1, "Resetting break");
}
}
/*L:240
@@ -1904,8 +1654,8 @@
/* Memory, top-level pagetable, code startpoint and size of the
* (optional) initrd. */
unsigned long mem = 0, start, initrd_size = 0;
- /* Two temporaries and the /dev/lguest file descriptor. */
- int i, c, lguest_fd;
+ /* Two temporaries. */
+ int i, c;
/* The boot information for the Guest. */
struct boot_params *boot;
/* If they specify an initrd file to load. */
@@ -1913,18 +1663,10 @@
/* Save the args: we "reboot" by execing ourselves again. */
main_args = argv;
- /* We don't "wait" for the children, so prevent them from becoming
- * zombies. */
- signal(SIGCHLD, SIG_IGN);
- /* First we initialize the device list. Since console and network
- * device receive input from a file descriptor, we keep an fdset
- * (infds) and the maximum fd number (max_infd) with the head of the
- * list. We also keep a pointer to the last device. Finally, we keep
- * the next interrupt number to use for devices (1: remember that 0 is
- * used by the timer). */
- FD_ZERO(&devices.infds);
- devices.max_infd = -1;
+ /* First we initialize the device list. We keep a pointer to the last
+ * device, and the next interrupt number to use for devices (1:
+ * remember that 0 is used by the timer). */
devices.lastdev = NULL;
devices.next_irq = 1;
@@ -1982,9 +1724,6 @@
/* We always have a console device */
setup_console();
- /* We can timeout waiting for Guest network transmit. */
- setup_timeout();
-
/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
@@ -2023,15 +1762,16 @@
/* We tell the kernel to initialize the Guest: this returns the open
* /dev/lguest file descriptor. */
- lguest_fd = tell_kernel(start);
+ tell_kernel(start);
- /* We clone off a thread, which wakes the Launcher whenever one of the
- * input file descriptors needs attention. We call this the Waker, and
- * we'll cover it in a moment. */
- setup_waker(lguest_fd);
+ /* Ensure that we terminate if a child dies. */
+ signal(SIGCHLD, kill_launcher);
+
+ /* If we exit via err(), this kills all the threads, restores tty. */
+ atexit(cleanup_devices);
/* Finally, run the Guest. This doesn't return. */
- run_guest(lguest_fd);
+ run_guest();
}
/*:*/
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c7473..efb3a6a 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@
"Paravirtualized guest support" = Y
"Lguest guest support" = Y
"High Memory Support" = off/4GB
- "PAE (Physical Address Extension) Support" = N
"Alignment value to which kernel should be aligned" = 0x100000
(CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/arch/alpha/mm/extable.c b/arch/alpha/mm/extable.c
index 62dc379..813c9b6 100644
--- a/arch/alpha/mm/extable.c
+++ b/arch/alpha/mm/extable.c
@@ -48,6 +48,27 @@
cmp_ex, swap_ex);
}
+#ifdef CONFIG_MODULES
+/*
+ * Any entry referring to the module init will be at the beginning or
+ * the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
+ m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+
const struct exception_table_entry *
search_extable(const struct exception_table_entry *first,
const struct exception_table_entry *last,
diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c
index 1167fe9c..98f94d0 100644
--- a/arch/avr32/kernel/module.c
+++ b/arch/avr32/kernel/module.c
@@ -32,8 +32,6 @@
mod->arch.syminfo = NULL;
vfree(module_region);
- /* FIXME: if module_region == mod->init_region, trim exception
- * table entries. */
}
static inline int check_rela(Elf32_Rela *rela, struct module *module,
diff --git a/arch/cris/kernel/module.c b/arch/cris/kernel/module.c
index a187833..abc13e3 100644
--- a/arch/cris/kernel/module.c
+++ b/arch/cris/kernel/module.c
@@ -48,8 +48,6 @@
void module_free(struct module *mod, void *module_region)
{
FREE_MODULE(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/frv/kernel/module.c b/arch/frv/kernel/module.c
index 850d168..711763c 100644
--- a/arch/frv/kernel/module.c
+++ b/arch/frv/kernel/module.c
@@ -35,8 +35,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/h8300/kernel/module.c b/arch/h8300/kernel/module.c
index cfc9127..0865e29 100644
--- a/arch/h8300/kernel/module.c
+++ b/arch/h8300/kernel/module.c
@@ -23,8 +23,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
index 71c50dd..e95d5ad 100644
--- a/arch/ia64/mm/extable.c
+++ b/arch/ia64/mm/extable.c
@@ -53,6 +53,32 @@
cmp_ex, swap_ex);
}
+static inline unsigned long ex_to_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->insn + x->insn;
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Any entry referring to the module init will be at the beginning or
+ * the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
+ m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+
const struct exception_table_entry *
search_extable (const struct exception_table_entry *first,
const struct exception_table_entry *last,
diff --git a/arch/m32r/kernel/module.c b/arch/m32r/kernel/module.c
index 8d42057..cb5f37d 100644
--- a/arch/m32r/kernel/module.c
+++ b/arch/m32r/kernel/module.c
@@ -44,8 +44,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/m68k/kernel/module.c b/arch/m68k/kernel/module.c
index 774862b..cd6bcb1 100644
--- a/arch/m68k/kernel/module.c
+++ b/arch/m68k/kernel/module.c
@@ -31,8 +31,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/m68knommu/kernel/module.c b/arch/m68knommu/kernel/module.c
index 3b1a2ff..d11ffae 100644
--- a/arch/m68knommu/kernel/module.c
+++ b/arch/m68knommu/kernel/module.c
@@ -23,8 +23,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 1f60e27..3e9100d 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -68,8 +68,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
diff --git a/arch/mn10300/kernel/module.c b/arch/mn10300/kernel/module.c
index 6b287f2..4fa0e36 100644
--- a/arch/mn10300/kernel/module.c
+++ b/arch/mn10300/kernel/module.c
@@ -48,8 +48,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- * table entries. */
}
/*
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index ecd1c50..ef5caf2 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -267,8 +267,6 @@
mod->arch.section = NULL;
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* Additional bytes needed in front of individual sections */
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 43e7e3a..477c663 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -43,8 +43,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index eed4a00..ab2e3ed 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -56,8 +56,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
static void
diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c
index c19b0f7..c2efdcd 100644
--- a/arch/sh/kernel/module.c
+++ b/arch/sh/kernel/module.c
@@ -46,8 +46,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* We don't need anything special. */
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index 47d5619..8303ac4 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -17,6 +17,9 @@
#ifndef __ASSEMBLY__
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
+
/* Sparc is not segmented, however we need to be able to fool access_ok()
* when doing system calls from kernel mode legitimately.
*
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index 9027376..0ee642f 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -75,8 +75,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
/* Make generic code ignore STT_REGISTER dummy undefined symbols. */
diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c
index 16cc289..a61c349 100644
--- a/arch/sparc/mm/extable.c
+++ b/arch/sparc/mm/extable.c
@@ -28,6 +28,10 @@
* word 3: last insn address + 4 bytes
* word 4: fixup code address
*
+ * Deleted entries are encoded as:
+ * word 1: unused
+ * word 2: -1
+ *
* See asm/uaccess.h for more details.
*/
@@ -39,6 +43,10 @@
continue;
}
+ /* A deleted entry; see trim_init_extable */
+ if (walk->fixup == -1)
+ continue;
+
if (walk->insn == value)
return walk;
}
@@ -57,6 +65,27 @@
return NULL;
}
+#ifdef CONFIG_MODULES
+/* We could memmove them around; easier to mark the trimmed ones. */
+void trim_init_extable(struct module *m)
+{
+ unsigned int i;
+ bool range;
+
+ for (i = 0; i < m->num_exentries; i += range ? 2 : 1) {
+ range = m->extable[i].fixup == 0;
+
+ if (within_module_init(m->extable[i].insn, m)) {
+ m->extable[i].fixup = -1;
+ if (range)
+ m->extable[i+1].fixup = -1;
+ }
+ if (range)
+ i++;
+ }
+}
+#endif /* CONFIG_MODULES */
+
/* Special extable search, which handles ranges. Returns fixup */
unsigned long search_extables_range(unsigned long addr, unsigned long *g2)
{
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 58da248..9ce3f16 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -53,16 +53,21 @@
#else
# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
#endif
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
-
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
/*
* The i386 can't do page protection for execute, and considers that the same
diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile
index 598b5c1..1b549bc 100644
--- a/arch/um/sys-i386/Makefile
+++ b/arch/um/sys-i386/Makefile
@@ -8,7 +8,7 @@
subarch-obj-y = lib/semaphore_32.o lib/string_32.o
subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem_32.o
-subarch-obj-$(CONFIG_MODULES) += kernel/module_32.o
+subarch-obj-$(CONFIG_MODULES) += kernel/module.o
USER_OBJS := bugs.o ptrace_user.o fault.o
diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile
index c8b4cce..2201e9c 100644
--- a/arch/um/sys-x86_64/Makefile
+++ b/arch/um/sys-x86_64/Makefile
@@ -8,10 +8,8 @@
setjmp.o signal.o stub.o stub_segv.o syscalls.o syscall_table.o \
sysrq.o ksyms.o tls.o
-obj-$(CONFIG_MODULES) += um_module.o
-
subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o
-subarch-obj-$(CONFIG_MODULES) += kernel/module_64.o
+subarch-obj-$(CONFIG_MODULES) += kernel/module.o
ldt-y = ../sys-i386/ldt.o
diff --git a/arch/um/sys-x86_64/um_module.c b/arch/um/sys-x86_64/um_module.c
deleted file mode 100644
index 3dead39..0000000
--- a/arch/um/sys-x86_64/um_module.c
+++ /dev/null
@@ -1,21 +0,0 @@
-#include <linux/vmalloc.h>
-#include <linux/moduleloader.h>
-
-/* Copied from i386 arch/i386/kernel/module.c */
-void *module_alloc(unsigned long size)
-{
- if (size == 0)
- return NULL;
- return vmalloc_exec(size);
-}
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /*
- * FIXME: If module_region == mod->init_region, trim exception
- * table entries.
- */
-}
-
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf576..313389c 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
/* Pages for switcher itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
#define SWITCHER_ADDR 0xFFC00000
+#endif
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae199..d31c4a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
#define LHCALL_TS 8
#define LHCALL_SET_CLOCKEVENT 9
#define LHCALL_HALT 10
+#define LHCALL_SET_PMD 13
#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PMD 15
+#define LHCALL_SET_PGD 15
#define LHCALL_LOAD_TLS 16
#define LHCALL_NOTIFY 17
#define LHCALL_LOAD_GDT_ENTRY 18
+#define LHCALL_SEND_INTERRUPTS 19
#define LGUEST_TRAP_ENTRY 0x1F
@@ -32,10 +34,10 @@
* operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself.
*
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
* available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx. If a return
- * value makes sense, it's returned in %eax.
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
*
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
#define LHCALL_RING_SIZE 64
struct hcall_args {
- /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3;
+ /* These map directly onto eax, ebx, ecx, edx and esi
+ * in struct lguest_regs */
+ unsigned long arg0, arg1, arg2, arg3, arg4;
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 2733fad..5e67c15 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -46,6 +46,10 @@
# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
#endif
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
+
#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4f78bd6..f3477bb 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -73,7 +73,7 @@
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_MODULES) += module_$(BITS).o
+obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cb..dfdbf64 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
similarity index 74%
rename from arch/x86/kernel/module_64.c
rename to arch/x86/kernel/module.c
index c23880b..89f386f 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
-/* Kernel module help for x86-64
+/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
- Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
#include <linux/bug.h>
+#include <linux/mm.h>
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#if 0
+#define DEBUGP printk
+#else
#define DEBUGP(fmt...)
-
-#ifndef CONFIG_UML
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
+#endif
void *module_alloc(unsigned long size)
{
@@ -54,9 +48,15 @@
if (!area)
return NULL;
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+ return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
+ PAGE_KERNEL_EXEC);
}
-#endif
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
+}
/* We don't need anything special. */
int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@
return 0;
}
+#ifdef CONFIG_X86_32
+int apply_relocate(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf32_Sym *sym;
+ uint32_t *location;
+
+ DEBUGP("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ + ELF32_R_SYM(rel[i].r_info);
+
+ switch (ELF32_R_TYPE(rel[i].r_info)) {
+ case R_386_32:
+ /* We add the value into the location given */
+ *location += sym->st_value;
+ break;
+ case R_386_PC32:
+ /* Add the value, subtract its postition */
+ *location += sym->st_value - (uint32_t)location;
+ break;
+ default:
+ printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ me->name, ELF32_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
+ me->name);
+ return -ENOEXEC;
+}
+#else /*X86_64*/
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
@@ -147,6 +199,8 @@
return -ENOSYS;
}
+#endif
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819..0000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Kernel module help for i386.
- Copyright (C) 2001 Rusty Russell.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-#include <linux/moduleloader.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt...)
-#endif
-
-void *module_alloc(unsigned long size)
-{
- if (size == 0)
- return NULL;
- return vmalloc_exec(size);
-}
-
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-int apply_relocate(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- unsigned int i;
- Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
- Elf32_Sym *sym;
- uint32_t *location;
-
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
- /* This is where to make the change */
- location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
- + rel[i].r_offset;
- /* This is the symbol it is referring to. Note that all
- undefined symbols have been resolved. */
- sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
- + ELF32_R_SYM(rel[i].r_info);
-
- switch (ELF32_R_TYPE(rel[i].r_info)) {
- case R_386_32:
- /* We add the value into the location given */
- *location += sym->st_value;
- break;
- case R_386_PC32:
- /* Add the value, subtract its postition */
- *location += sym->st_value - (uint32_t)location;
- break;
- default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
- me->name, ELF32_R_TYPE(rel[i].r_info));
- return -ENOEXEC;
- }
- }
- return 0;
-}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
-
-int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
- char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
- if (!strcmp(".altinstructions", secstrings + s->sh_name))
- alt = s;
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
- }
-
- if (alt) {
- /* patch .altinstructions */
- void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size);
- }
- if (locks && text) {
- void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
- alternatives_smp_module_add(me, me->name,
- lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
- }
-
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
-
- return module_bug_finalize(hdr, sechdrs, me);
-}
-
-void module_arch_cleanup(struct module *mod)
-{
- alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
-}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d1c636b..be5ae80 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -301,15 +301,13 @@
#ifdef CONFIG_BLK_DEV_INITRD
-#ifdef CONFIG_X86_32
-
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
@@ -365,14 +363,13 @@
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
-#endif
static void __init reserve_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = ramdisk_image + ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -402,14 +399,8 @@
return;
}
-#ifdef CONFIG_X86_32
relocate_initrd();
-#else
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
- ramdisk_end, end_of_lowmem);
- initrd_start = 0;
-#endif
+
free_early(ramdisk_image, ramdisk_end);
}
#else
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4c85b2e..367e878 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -108,6 +108,8 @@
/* Data */
. = ALIGN(PAGE_SIZE);
.data : AT(ADDR(.data) - LOAD_OFFSET) {
+ /* Start of data section */
+ _sdata = .;
DATA_DATA
CONSTRUCTORS
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7..3871804 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
- depends on !X86_PAE
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c265..7bc65f0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@
/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
* ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
* arguments, and a "hcall_status" word which is 0 if the call is ready to go,
* and 255 once the Host has finished with it.
*
@@ -96,7 +96,8 @@
* effect of causing the Host to run all the stored calls in the ring buffer
* which empties it for next time! */
static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3)
+ unsigned long arg2, unsigned long arg3,
+ unsigned long arg4)
{
/* Note: This code assumes we're uniprocessor. */
static unsigned int next_call;
@@ -108,12 +109,13 @@
local_irq_save(flags);
if (lguest_data.hcall_status[next_call] != 0xFF) {
/* Table full, so do normal hcall which will flush table. */
- kvm_hypercall3(call, arg1, arg2, arg3);
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
} else {
lguest_data.hcalls[next_call].arg0 = call;
lguest_data.hcalls[next_call].arg1 = arg1;
lguest_data.hcalls[next_call].arg2 = arg2;
lguest_data.hcalls[next_call].arg3 = arg3;
+ lguest_data.hcalls[next_call].arg4 = arg4;
/* Arguments must all be written before we mark it to go */
wmb();
lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall1(call, arg1);
else
- async_hcall(call, arg1, 0, 0);
+ async_hcall(call, arg1, 0, 0, 0);
}
static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall2(call, arg1, arg2);
else
- async_hcall(call, arg1, arg2, 0);
+ async_hcall(call, arg1, arg2, 0, 0);
}
static void lazy_hcall3(unsigned long call,
@@ -162,9 +164,23 @@
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall3(call, arg1, arg2, arg3);
else
- async_hcall(call, arg1, arg2, arg3);
+ async_hcall(call, arg1, arg2, arg3, 0);
}
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4)
+{
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+ else
+ async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
* issue the do-nothing hypercall to flush any stored calls. */
static void lguest_leave_lazy_mmu_mode(void)
@@ -179,7 +195,7 @@
paravirt_end_context_switch(next);
}
-/*G:033
+/*G:032
* After that diversion we return to our first native-instruction
* replacements: four functions for interrupt control.
*
@@ -199,30 +215,28 @@
{
return lguest_data.irq_enabled;
}
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
- lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
/* Interrupts go off... */
static void irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
+
+/* Let's pause a moment. Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules. In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register. To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
-
-/* Interrupts go on... */
-static void irq_enable(void)
-{
- lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
-
/*:*/
+
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
+
/*M:003 Note that we don't check for outstanding interrupts when we re-enable
* them (or when we unmask an interrupt). This seems to work for the moment,
* since interrupts are rare and we'll just get the interrupt on the next timer
@@ -368,8 +382,8 @@
case 1: /* Basic feature request. */
/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
*cx &= 0x00002201;
- /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
- *dx &= 0x07808111;
+ /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+ *dx &= 0x07808151;
/* The Host can do a nice optimization if it knows that the
* kernel mappings (addresses above 0xC0000000 or whatever
* PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -388,6 +402,11 @@
if (*ax > 0x80000008)
*ax = 0x80000008;
break;
+ case 0x80000001:
+ /* Here we should fix nx cap depending on host. */
+ /* For this version of PAE, we just clear NX bit. */
+ *dx &= ~(1 << 20);
+ break;
}
}
@@ -521,25 +540,52 @@
static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
+#ifdef CONFIG_X86_PAE
+ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+ ptep->pte_low, ptep->pte_high);
+#else
lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
}
static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
lguest_pte_update(mm, addr, ptep);
}
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+ native_set_pud(pudp, pudval);
+
+ /* 32 bytes aligned pdpt address and the index. */
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+ (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
- *pmdp = pmdval;
+ native_set_pmd(pmdp, pmdval);
lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ native_set_pmd(pmdp, pmdval);
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
/* There are a couple of legacy places where the kernel sets a PTE, but we
* don't know the top level any more. This is useless for us, since we don't
@@ -552,11 +598,31 @@
* which brings boot back to 0.25 seconds. */
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
if (cr3_changed)
lazy_hcall1(LHCALL_FLUSH_TLB, 1);
}
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte_atomic(ptep, pte);
+ if (cr3_changed)
+ lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ native_pte_clear(mm, addr, ptep);
+ lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+ lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do
@@ -628,13 +694,12 @@
{
unsigned int i;
- for (i = 0; i < LGUEST_IRQS; i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Lguest has
* a straightforward 1 to 1 mapping, so force that here. */
- __get_cpu_var(vector_irq)[vector] = i;
- if (vector != SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
+ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
/* This call is required to set up for 4k stacks, where we have
* separate stacks for hard and soft interrupts. */
@@ -973,10 +1038,10 @@
*
* Our current solution is to allow the paravirt back end to optionally patch
* over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
*
* First we need assembly templates of each of the patchable Guest operations,
* and these are in i386_head.S. */
@@ -987,8 +1052,6 @@
const char *start, *end;
} lguest_insns[] = {
[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
- [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
};
@@ -1026,6 +1089,7 @@
pv_info.name = "lguest";
pv_info.paravirt_enabled = 1;
pv_info.kernel_rpl = 1;
+ pv_info.shared_kernel_pmd = 1;
/* We set up all the lguest overrides for sensitive operations. These
* are detailed with the operations themselves. */
@@ -1033,9 +1097,9 @@
/* interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
/* init-time operations */
@@ -1071,6 +1135,12 @@
pv_mmu_ops.set_pte = lguest_set_pte;
pv_mmu_ops.set_pte_at = lguest_set_pte_at;
pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+ pv_mmu_ops.pte_clear = lguest_pte_clear;
+ pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+ pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f795419..a9c8cfe 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@
.globl lgstart_##name; .globl lgend_##name
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later). If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages. First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+ /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+ * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+ /* But now we need to check if the Host wants to know: there might have
+ * been interrupts waiting to be delivered, in which case it will have
+ * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
+ * jump to send_interrupts, otherwise we're done. */
+ testl $0, lguest_data+LGUEST_DATA_irq_pending
+ jnz send_interrupts
+ /* One cool thing about x86 is that you can do many things without using
+ * a register. In this case, the normal path hasn't needed to save or
+ * restore any registers at all! */
+ ret
+send_interrupts:
+ /* OK, now we need a register: eax is used for the hypercall number,
+ * which is LHCALL_SEND_INTERRUPTS.
+ *
+ * We used not to bother with this pending detection at all, which was
+ * much simpler. Sooner or later the Host would realize it had to
+ * send us an interrupt. But that turns out to make performance 7
+ * times worse on a simple tcp benchmark. So now we do this the hard
+ * way. */
+ pushl %eax
+ movl $LHCALL_SEND_INTERRUPTS, %eax
+ /* This is a vmcall instruction (same thing that KVM uses). Older
+ * assembler versions might not know the "vmcall" instruction, so we
+ * create one manually here. */
+ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ popl %eax
+ ret
+
+/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+ /* This is just "lguest_data.irq_enabled = flags;" */
+ movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+ /* Now, if the %eax value has enabled interrupts and
+ * lguest_data.irq_pending is set, we want to tell the Host so it can
+ * deliver any outstanding interrupts. Fortunately, both values will
+ * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+ * instruction will AND them together for us. If both are set, we
+ * jump to send_interrupts. */
+ testl lguest_data+LGUEST_DATA_irq_pending, %eax
+ jnz send_interrupts
+ /* Again, the normal path has used no extra registers. Clever, huh? */
+ ret
/* These demark the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start
diff --git a/arch/xtensa/kernel/module.c b/arch/xtensa/kernel/module.c
index 3981a46..c1accea 100644
--- a/arch/xtensa/kernel/module.c
+++ b/arch/xtensa/kernel/module.c
@@ -34,8 +34,6 @@
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
}
int module_frob_arch_sections(Elf32_Ehdr *hdr,
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c0facaa..43db3ea 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -254,7 +254,7 @@
return index << PART_BITS;
}
-static int virtblk_probe(struct virtio_device *vdev)
+static int __devinit virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
int err;
@@ -288,7 +288,7 @@
sg_init_table(vblk->sg, vblk->sg_elems);
/* We expect one virtqueue, for output. */
- vblk->vq = vdev->config->find_vq(vdev, 0, blk_done);
+ vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
if (IS_ERR(vblk->vq)) {
err = PTR_ERR(vblk->vq);
goto out_free_vblk;
@@ -388,14 +388,14 @@
out_mempool:
mempool_destroy(vblk->pool);
out_free_vq:
- vdev->config->del_vq(vblk->vq);
+ vdev->config->del_vqs(vdev);
out_free_vblk:
kfree(vblk);
out:
return err;
}
-static void virtblk_remove(struct virtio_device *vdev)
+static void __devexit virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
@@ -409,7 +409,7 @@
blk_cleanup_queue(vblk->disk->queue);
put_disk(vblk->disk);
mempool_destroy(vblk->pool);
- vdev->config->del_vq(vblk->vq);
+ vdev->config->del_vqs(vdev);
kfree(vblk);
}
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 86e83f8..32216b6 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -35,13 +35,13 @@
static void random_recv_done(struct virtqueue *vq)
{
- int len;
+ unsigned int len;
/* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */
if (!vq->vq_ops->get_buf(vq, &len))
return;
- data_left = len / sizeof(random_data[0]);
+ data_left += len;
complete(&have_data);
}
@@ -49,7 +49,7 @@
{
struct scatterlist sg;
- sg_init_one(&sg, random_data, RANDOM_DATA_SIZE);
+ sg_init_one(&sg, random_data+data_left, RANDOM_DATA_SIZE-data_left);
/* There should always be room for one buffer. */
if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) != 0)
BUG();
@@ -59,24 +59,32 @@
/* At least we don't udelay() in a loop like some other drivers. */
static int virtio_data_present(struct hwrng *rng, int wait)
{
- if (data_left)
+ if (data_left >= sizeof(u32))
return 1;
+again:
if (!wait)
return 0;
wait_for_completion(&have_data);
+
+ /* Not enough? Re-register. */
+ if (unlikely(data_left < sizeof(u32))) {
+ register_buffer();
+ goto again;
+ }
+
return 1;
}
/* virtio_data_present() must have succeeded before this is called. */
static int virtio_data_read(struct hwrng *rng, u32 *data)
{
- BUG_ON(!data_left);
+ BUG_ON(data_left < sizeof(u32));
+ data_left -= sizeof(u32);
+ *data = random_data[data_left / 4];
- *data = random_data[--data_left];
-
- if (!data_left) {
+ if (data_left < sizeof(u32)) {
init_completion(&have_data);
register_buffer();
}
@@ -94,13 +102,13 @@
int err;
/* We expect a single virtqueue. */
- vq = vdev->config->find_vq(vdev, 0, random_recv_done);
+ vq = virtio_find_single_vq(vdev, random_recv_done, "input");
if (IS_ERR(vq))
return PTR_ERR(vq);
err = hwrng_register(&virtio_hwrng);
if (err) {
- vdev->config->del_vq(vq);
+ vdev->config->del_vqs(vdev);
return err;
}
@@ -112,7 +120,7 @@
{
vdev->config->reset(vdev);
hwrng_unregister(&virtio_hwrng);
- vdev->config->del_vq(vq);
+ vdev->config->del_vqs(vdev);
}
static struct virtio_device_id id_table[] = {
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ff6f5a4..c74dacf 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -188,6 +188,9 @@
* Finally we put our input buffer in the input queue, ready to receive. */
static int __devinit virtcons_probe(struct virtio_device *dev)
{
+ vq_callback_t *callbacks[] = { hvc_handle_input, NULL};
+ const char *names[] = { "input", "output" };
+ struct virtqueue *vqs[2];
int err;
vdev = dev;
@@ -199,20 +202,15 @@
goto fail;
}
- /* Find the input queue. */
+ /* Find the queues. */
/* FIXME: This is why we want to wean off hvc: we do nothing
* when input comes in. */
- in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input);
- if (IS_ERR(in_vq)) {
- err = PTR_ERR(in_vq);
+ err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names);
+ if (err)
goto free;
- }
- out_vq = vdev->config->find_vq(vdev, 1, NULL);
- if (IS_ERR(out_vq)) {
- err = PTR_ERR(out_vq);
- goto free_in_vq;
- }
+ in_vq = vqs[0];
+ out_vq = vqs[1];
/* Start using the new console output. */
virtio_cons.get_chars = get_chars;
@@ -233,17 +231,15 @@
hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE);
if (IS_ERR(hvc)) {
err = PTR_ERR(hvc);
- goto free_out_vq;
+ goto free_vqs;
}
/* Register the input buffer the first time. */
add_inbuf();
return 0;
-free_out_vq:
- vdev->config->del_vq(out_vq);
-free_in_vq:
- vdev->config->del_vq(in_vq);
+free_vqs:
+ vdev->config->del_vqs(vdev);
free:
kfree(inbuf);
fail:
diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 403d0e4..fc0949a 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -216,6 +216,7 @@
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA | IDE_HFLAG_SINGLE |
IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_UNMASK_IRQS,
.pio_mask = ATA_PIO6,
+ .chipset = ide_generic,
};
/*
@@ -246,8 +247,7 @@
static int __init at91_ide_probe(struct platform_device *pdev)
{
int ret;
- hw_regs_t hw;
- hw_regs_t *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
struct ide_host *host;
struct resource *res;
unsigned long tf_base = 0, ctl_base = 0;
@@ -304,10 +304,9 @@
ide_std_init_ports(&hw, tf_base, ctl_base + 6);
hw.irq = board->irq_pin;
- hw.chipset = ide_generic;
hw.dev = &pdev->dev;
- host = ide_host_alloc(&at91_ide_port_info, hws);
+ host = ide_host_alloc(&at91_ide_port_info, hws, 1);
if (!host) {
perr("failed to allocate ide host\n");
return -ENOMEM;
diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 4601364..58121bd 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -449,7 +449,7 @@
}
#endif
-static void auide_setup_ports(hw_regs_t *hw, _auide_hwif *ahwif)
+static void auide_setup_ports(struct ide_hw *hw, _auide_hwif *ahwif)
{
int i;
unsigned long *ata_regs = hw->io_ports_array;
@@ -499,6 +499,7 @@
#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
.mwdma_mask = ATA_MWDMA2,
#endif
+ .chipset = ide_au1xxx,
};
static int au_ide_probe(struct platform_device *dev)
@@ -507,7 +508,7 @@
struct resource *res;
struct ide_host *host;
int ret = 0;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
#if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA)
char *mode = "MWDMA2";
@@ -548,9 +549,8 @@
auide_setup_ports(&hw, ahwif);
hw.irq = ahwif->irq;
hw.dev = &dev->dev;
- hw.chipset = ide_au1xxx;
- ret = ide_host_add(&au1xxx_port_info, hws, &host);
+ ret = ide_host_add(&au1xxx_port_info, hws, 1, &host);
if (ret)
goto out;
diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c
index d028f88..e3c6a59 100644
--- a/drivers/ide/buddha.c
+++ b/drivers/ide/buddha.c
@@ -121,7 +121,7 @@
return 1;
}
-static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base,
+static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base,
unsigned long ctl, unsigned long irq_port,
ide_ack_intr_t *ack_intr)
{
@@ -139,13 +139,12 @@
hw->irq = IRQ_AMIGA_PORTS;
hw->ack_intr = ack_intr;
-
- hw->chipset = ide_generic;
}
static const struct ide_port_info buddha_port_info = {
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_generic,
};
/*
@@ -161,7 +160,7 @@
while ((z = zorro_find_device(ZORRO_WILDCARD, z))) {
unsigned long board;
- hw_regs_t hw[MAX_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS];
if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_BUDDHA) {
buddha_num_hwifs = BUDDHA_NUM_HWIFS;
@@ -225,7 +224,7 @@
hws[i] = &hw[i];
}
- ide_host_add(&buddha_port_info, hws, NULL);
+ ide_host_add(&buddha_port_info, hws, i, NULL);
}
return 0;
diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c
index 8890276..1683ed5 100644
--- a/drivers/ide/cmd640.c
+++ b/drivers/ide/cmd640.c
@@ -708,7 +708,7 @@
int second_port_cmd640 = 0, rc;
const char *bus_type, *port2;
u8 b, cfr;
- hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[2], *hws[2];
if (cmd640_vlb && probe_for_cmd640_vlb()) {
bus_type = "VLB";
@@ -762,11 +762,9 @@
ide_std_init_ports(&hw[0], 0x1f0, 0x3f6);
hw[0].irq = 14;
- hw[0].chipset = ide_cmd640;
ide_std_init_ports(&hw[1], 0x170, 0x376);
hw[1].irq = 15;
- hw[1].chipset = ide_cmd640;
printk(KERN_INFO "cmd640: buggy cmd640%c interface on %s, config=0x%02x"
"\n", 'a' + cmd640_chip_version - 1, bus_type, cfr);
@@ -824,7 +822,8 @@
cmd640_dump_regs();
#endif
- return ide_host_add(&cmd640_port_info, hws, NULL);
+ return ide_host_add(&cmd640_port_info, hws, second_port_cmd640 ? 2 : 1,
+ NULL);
}
module_param_named(probe_vlb, cmd640_vlb, bool, 0);
diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c
index 87987a7..bd066bb 100644
--- a/drivers/ide/cs5520.c
+++ b/drivers/ide/cs5520.c
@@ -110,7 +110,7 @@
static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id)
{
const struct ide_port_info *d = &cyrix_chipset;
- hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[2], *hws[] = { NULL, NULL };
ide_setup_pci_noise(dev, d);
@@ -136,7 +136,7 @@
ide_pci_setup_ports(dev, d, &hw[0], &hws[0]);
hw[0].irq = 14;
- return ide_host_add(d, hws, NULL);
+ return ide_host_add(d, hws, 2, NULL);
}
static const struct pci_device_id cs5520_pci_tbl[] = {
diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c
index f153b95..1e10eba 100644
--- a/drivers/ide/delkin_cb.c
+++ b/drivers/ide/delkin_cb.c
@@ -68,6 +68,7 @@
IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
.init_chipset = delkin_cb_init_chipset,
+ .chipset = ide_pci,
};
static int __devinit
@@ -76,7 +77,7 @@
struct ide_host *host;
unsigned long base;
int rc;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
rc = pci_enable_device(dev);
if (rc) {
@@ -97,9 +98,8 @@
ide_std_init_ports(&hw, base + 0x10, base + 0x1e);
hw.irq = dev->irq;
hw.dev = &dev->dev;
- hw.chipset = ide_pci; /* this enables IRQ sharing */
- rc = ide_host_add(&delkin_cb_port_info, hws, &host);
+ rc = ide_host_add(&delkin_cb_port_info, hws, 1, &host);
if (rc)
goto out_disable;
diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c
index 0e2df67..22fa273 100644
--- a/drivers/ide/falconide.c
+++ b/drivers/ide/falconide.c
@@ -111,9 +111,10 @@
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE |
IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_generic,
};
-static void __init falconide_setup_ports(hw_regs_t *hw)
+static void __init falconide_setup_ports(struct ide_hw *hw)
{
int i;
@@ -128,8 +129,6 @@
hw->irq = IRQ_MFP_IDE;
hw->ack_intr = NULL;
-
- hw->chipset = ide_generic;
}
/*
@@ -139,7 +138,7 @@
static int __init falconide_init(void)
{
struct ide_host *host;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
int rc;
if (!MACH_IS_ATARI || !ATARIHW_PRESENT(IDE))
@@ -154,7 +153,7 @@
falconide_setup_ports(&hw);
- host = ide_host_alloc(&falconide_port_info, hws);
+ host = ide_host_alloc(&falconide_port_info, hws, 1);
if (host == NULL) {
rc = -ENOMEM;
goto err;
diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c
index c711951..4451a6a 100644
--- a/drivers/ide/gayle.c
+++ b/drivers/ide/gayle.c
@@ -88,7 +88,7 @@
return 1;
}
-static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base,
+static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base,
unsigned long ctl, unsigned long irq_port,
ide_ack_intr_t *ack_intr)
{
@@ -106,14 +106,13 @@
hw->irq = IRQ_AMIGA_PORTS;
hw->ack_intr = ack_intr;
-
- hw->chipset = ide_generic;
}
static const struct ide_port_info gayle_port_info = {
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE |
IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_generic,
};
/*
@@ -126,7 +125,7 @@
unsigned long base, ctrlport, irqport;
ide_ack_intr_t *ack_intr;
int a4000, i, rc;
- hw_regs_t hw[GAYLE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS];
if (!MACH_IS_AMIGA)
return -ENODEV;
@@ -171,7 +170,7 @@
hws[i] = &hw[i];
}
- rc = ide_host_add(&gayle_port_info, hws, NULL);
+ rc = ide_host_add(&gayle_port_info, hws, i, NULL);
if (rc)
release_mem_region(res_start, res_n);
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index 0feb66c..7ce68ef 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -138,14 +138,6 @@
#undef HPT_RESET_STATE_ENGINE
#undef HPT_DELAY_INTERRUPT
-static const char *quirk_drives[] = {
- "QUANTUM FIREBALLlct08 08",
- "QUANTUM FIREBALLP KA6.4",
- "QUANTUM FIREBALLP LM20.4",
- "QUANTUM FIREBALLP LM20.5",
- NULL
-};
-
static const char *bad_ata100_5[] = {
"IBM-DTLA-307075",
"IBM-DTLA-307060",
@@ -729,27 +721,13 @@
hpt3xx_set_mode(drive, XFER_PIO_0 + pio);
}
-static void hpt3xx_quirkproc(ide_drive_t *drive)
-{
- char *m = (char *)&drive->id[ATA_ID_PROD];
- const char **list = quirk_drives;
-
- while (*list)
- if (strstr(m, *list++)) {
- drive->quirk_list = 1;
- return;
- }
-
- drive->quirk_list = 0;
-}
-
static void hpt3xx_maskproc(ide_drive_t *drive, int mask)
{
ide_hwif_t *hwif = drive->hwif;
struct pci_dev *dev = to_pci_dev(hwif->dev);
struct hpt_info *info = hpt3xx_get_info(hwif->dev);
- if (drive->quirk_list == 0)
+ if ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0)
return;
if (info->chip_type >= HPT370) {
@@ -1404,7 +1382,6 @@
static const struct ide_port_ops hpt3xx_port_ops = {
.set_pio_mode = hpt3xx_set_pio_mode,
.set_dma_mode = hpt3xx_set_mode,
- .quirkproc = hpt3xx_quirkproc,
.maskproc = hpt3xx_maskproc,
.mdma_filter = hpt3xx_mdma_filter,
.udma_filter = hpt3xx_udma_filter,
diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c
index 36da913..5af3d0f 100644
--- a/drivers/ide/icside.c
+++ b/drivers/ide/icside.c
@@ -65,8 +65,6 @@
};
struct icside_state {
- unsigned int channel;
- unsigned int enabled;
void __iomem *irq_port;
void __iomem *ioc_base;
unsigned int sel;
@@ -116,18 +114,11 @@
struct icside_state *state = ec->irq_data;
void __iomem *base = state->irq_port;
- state->enabled = 1;
+ writeb(0, base + ICS_ARCIN_V6_INTROFFSET_1);
+ readb(base + ICS_ARCIN_V6_INTROFFSET_2);
- switch (state->channel) {
- case 0:
- writeb(0, base + ICS_ARCIN_V6_INTROFFSET_1);
- readb(base + ICS_ARCIN_V6_INTROFFSET_2);
- break;
- case 1:
- writeb(0, base + ICS_ARCIN_V6_INTROFFSET_2);
- readb(base + ICS_ARCIN_V6_INTROFFSET_1);
- break;
- }
+ writeb(0, base + ICS_ARCIN_V6_INTROFFSET_2);
+ readb(base + ICS_ARCIN_V6_INTROFFSET_1);
}
/* Prototype: icside_irqdisable_arcin_v6 (struct expansion_card *ec, int irqnr)
@@ -137,8 +128,6 @@
{
struct icside_state *state = ec->irq_data;
- state->enabled = 0;
-
readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
}
@@ -160,44 +149,6 @@
.irqpending = icside_irqpending_arcin_v6,
};
-/*
- * Handle routing of interrupts. This is called before
- * we write the command to the drive.
- */
-static void icside_maskproc(ide_drive_t *drive, int mask)
-{
- ide_hwif_t *hwif = drive->hwif;
- struct expansion_card *ec = ECARD_DEV(hwif->dev);
- struct icside_state *state = ecard_get_drvdata(ec);
- unsigned long flags;
-
- local_irq_save(flags);
-
- state->channel = hwif->channel;
-
- if (state->enabled && !mask) {
- switch (hwif->channel) {
- case 0:
- writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
- readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
- break;
- case 1:
- writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
- readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
- break;
- }
- } else {
- readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
- readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
- }
-
- local_irq_restore(flags);
-}
-
-static const struct ide_port_ops icside_v6_no_dma_port_ops = {
- .maskproc = icside_maskproc,
-};
-
#ifdef CONFIG_BLK_DEV_IDEDMA_ICS
/*
* SG-DMA support.
@@ -275,7 +226,6 @@
static const struct ide_port_ops icside_v6_port_ops = {
.set_dma_mode = icside_set_dma_mode,
- .maskproc = icside_maskproc,
};
static void icside_dma_host_set(ide_drive_t *drive, int on)
@@ -320,11 +270,6 @@
BUG_ON(dma_channel_active(ec->dma));
/*
- * Ensure that we have the right interrupt routed.
- */
- icside_maskproc(drive, 0);
-
- /*
* Route the DMA signals to the correct interface.
*/
writeb(state->sel | hwif->channel, state->ioc_base);
@@ -381,7 +326,7 @@
return -EOPNOTSUPP;
}
-static void icside_setup_ports(hw_regs_t *hw, void __iomem *base,
+static void icside_setup_ports(struct ide_hw *hw, void __iomem *base,
struct cardinfo *info, struct expansion_card *ec)
{
unsigned long port = (unsigned long)base + info->dataoffset;
@@ -398,11 +343,11 @@
hw->irq = ec->irq;
hw->dev = &ec->dev;
- hw->chipset = ide_acorn;
}
static const struct ide_port_info icside_v5_port_info = {
.host_flags = IDE_HFLAG_NO_DMA,
+ .chipset = ide_acorn,
};
static int __devinit
@@ -410,7 +355,7 @@
{
void __iomem *base;
struct ide_host *host;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
int ret;
base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0);
@@ -431,7 +376,7 @@
icside_setup_ports(&hw, base, &icside_cardinfo_v5, ec);
- host = ide_host_alloc(&icside_v5_port_info, hws);
+ host = ide_host_alloc(&icside_v5_port_info, hws, 1);
if (host == NULL)
return -ENODEV;
@@ -452,11 +397,11 @@
static const struct ide_port_info icside_v6_port_info __initdata = {
.init_dma = icside_dma_off_init,
- .port_ops = &icside_v6_no_dma_port_ops,
.dma_ops = &icside_v6_dma_ops,
.host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_MMIO,
.mwdma_mask = ATA_MWDMA2,
.swdma_mask = ATA_SWDMA2,
+ .chipset = ide_acorn,
};
static int __devinit
@@ -466,7 +411,7 @@
struct ide_host *host;
unsigned int sel = 0;
int ret;
- hw_regs_t hw[2], *hws[] = { &hw[0], &hw[1], NULL, NULL };
+ struct ide_hw hw[2], *hws[] = { &hw[0], &hw[1] };
struct ide_port_info d = icside_v6_port_info;
ioc_base = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0);
@@ -506,7 +451,7 @@
icside_setup_ports(&hw[0], easi_base, &icside_cardinfo_v6_1, ec);
icside_setup_ports(&hw[1], easi_base, &icside_cardinfo_v6_2, ec);
- host = ide_host_alloc(&d, hws);
+ host = ide_host_alloc(&d, hws, 2);
if (host == NULL)
return -ENODEV;
diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c
index 78aca75..979d342 100644
--- a/drivers/ide/ide-4drives.c
+++ b/drivers/ide/ide-4drives.c
@@ -25,12 +25,13 @@
.port_ops = &ide_4drives_port_ops,
.host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA |
IDE_HFLAG_4DRIVES,
+ .chipset = ide_4drives,
};
static int __init ide_4drives_init(void)
{
unsigned long base = 0x1f0, ctl = 0x3f6;
- hw_regs_t hw, *hws[] = { &hw, &hw, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw, &hw };
if (probe_4drives == 0)
return -ENODEV;
@@ -52,9 +53,8 @@
ide_std_init_ports(&hw, base, ctl);
hw.irq = 14;
- hw.chipset = ide_4drives;
- return ide_host_add(&ide_4drives_port_info, hws, NULL);
+ return ide_host_add(&ide_4drives_port_info, hws, 2, NULL);
}
module_init(ide_4drives_init);
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 757e595..bbdd254 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -259,7 +259,7 @@
pc->req_xfer = blk_rq_bytes(sense_rq);
if (drive->media == ide_tape)
- set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
+ drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
/*
* Push back the failed request and put request sense on top
diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c
index 9e47f35..527908f 100644
--- a/drivers/ide/ide-cs.c
+++ b/drivers/ide/ide-cs.c
@@ -155,6 +155,7 @@
.port_ops = &idecs_port_ops,
.host_flags = IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_pci,
};
static struct ide_host *idecs_register(unsigned long io, unsigned long ctl,
@@ -163,7 +164,7 @@
struct ide_host *host;
ide_hwif_t *hwif;
int i, rc;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
if (!request_region(io, 8, DRV_NAME)) {
printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
@@ -181,10 +182,9 @@
memset(&hw, 0, sizeof(hw));
ide_std_init_ports(&hw, io, ctl);
hw.irq = irq;
- hw.chipset = ide_pci;
hw.dev = &handle->dev;
- rc = ide_host_add(&idecs_port_info, hws, &host);
+ rc = ide_host_add(&idecs_port_info, hws, 1, &host);
if (rc)
goto out_release;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index c6f7fcf..6a1de21 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -302,14 +302,12 @@
{ NULL, NULL }
};
-static void idedisk_check_hpa(ide_drive_t *drive)
+static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48)
{
- unsigned long long capacity, set_max;
- int lba48 = ata_id_lba48_enabled(drive->id);
+ u64 capacity, set_max;
capacity = drive->capacity64;
-
- set_max = idedisk_read_native_max_address(drive, lba48);
+ set_max = idedisk_read_native_max_address(drive, lba48);
if (ide_in_drive_list(drive->id, hpa_list)) {
/*
@@ -320,9 +318,31 @@
set_max--;
}
+ return set_max;
+}
+
+static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48)
+{
+ set_max = idedisk_set_max_address(drive, set_max, lba48);
+ if (set_max)
+ drive->capacity64 = set_max;
+
+ return set_max;
+}
+
+static void idedisk_check_hpa(ide_drive_t *drive)
+{
+ u64 capacity, set_max;
+ int lba48 = ata_id_lba48_enabled(drive->id);
+
+ capacity = drive->capacity64;
+ set_max = ide_disk_hpa_get_native_capacity(drive, lba48);
+
if (set_max <= capacity)
return;
+ drive->probed_capacity = set_max;
+
printk(KERN_INFO "%s: Host Protected Area detected.\n"
"\tcurrent capacity is %llu sectors (%llu MB)\n"
"\tnative capacity is %llu sectors (%llu MB)\n",
@@ -330,13 +350,13 @@
capacity, sectors_to_MB(capacity),
set_max, sectors_to_MB(set_max));
- set_max = idedisk_set_max_address(drive, set_max, lba48);
+ if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0)
+ return;
- if (set_max) {
- drive->capacity64 = set_max;
+ set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48);
+ if (set_max)
printk(KERN_INFO "%s: Host Protected Area disabled.\n",
drive->name);
- }
}
static int ide_disk_get_capacity(ide_drive_t *drive)
@@ -358,6 +378,8 @@
drive->capacity64 = drive->cyl * drive->head * drive->sect;
}
+ drive->probed_capacity = drive->capacity64;
+
if (lba) {
drive->dev_flags |= IDE_DFLAG_LBA;
@@ -376,7 +398,7 @@
"%llu sectors (%llu MB)\n",
drive->name, (unsigned long long)drive->capacity64,
sectors_to_MB(drive->capacity64));
- drive->capacity64 = 1ULL << 28;
+ drive->probed_capacity = drive->capacity64 = 1ULL << 28;
}
if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) &&
@@ -392,6 +414,34 @@
return 0;
}
+static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity)
+{
+ u64 set = min(capacity, drive->probed_capacity);
+ u16 *id = drive->id;
+ int lba48 = ata_id_lba48_enabled(id);
+
+ if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
+ ata_id_hpa_enabled(id) == 0)
+ goto out;
+
+ /*
+ * according to the spec the SET MAX ADDRESS command shall be
+ * immediately preceded by a READ NATIVE MAX ADDRESS command
+ */
+ capacity = ide_disk_hpa_get_native_capacity(drive, lba48);
+ if (capacity == 0)
+ goto out;
+
+ set = ide_disk_hpa_set_capacity(drive, set, lba48);
+ if (set) {
+ /* needed for ->resume to disable HPA */
+ drive->dev_flags |= IDE_DFLAG_NOHPA;
+ return set;
+ }
+out:
+ return drive->capacity64;
+}
+
static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
{
ide_drive_t *drive = q->queuedata;
@@ -428,14 +478,14 @@
if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
return -EINVAL;
- if (drive->special.b.set_multmode)
+ if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
return -EBUSY;
rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
drive->mult_req = arg;
- drive->special.b.set_multmode = 1;
+ drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
error = blk_execute_rq(drive->queue, NULL, rq, 0);
blk_put_request(rq);
@@ -740,6 +790,7 @@
const struct ide_disk_ops ide_ata_disk_ops = {
.check = ide_disk_check,
+ .set_capacity = ide_disk_set_capacity,
.get_capacity = ide_disk_get_capacity,
.setup = ide_disk_setup,
.flush = ide_disk_flush,
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 001f68f..219e6fb 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -347,7 +347,6 @@
return mode;
}
-EXPORT_SYMBOL_GPL(ide_find_dma_mode);
static int ide_tune_dma(ide_drive_t *drive)
{
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 5d5fb96..2b91419 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -52,7 +52,7 @@
}
if ((rq->errors & ERROR_RECAL) == ERROR_RECAL)
- drive->special.b.recalibrate = 1;
+ drive->special_flags |= IDE_SFLAG_RECALIBRATE;
++rq->errors;
@@ -268,9 +268,8 @@
{
int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1;
- drive->special.all = 0;
- drive->special.b.set_geometry = legacy;
- drive->special.b.recalibrate = legacy;
+ drive->special_flags =
+ legacy ? (IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE) : 0;
drive->mult_count = 0;
drive->dev_flags &= ~IDE_DFLAG_PARKED;
@@ -280,7 +279,7 @@
drive->mult_req = 0;
if (drive->mult_req != drive->mult_count)
- drive->special.b.set_multmode = 1;
+ drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
}
static void pre_reset(ide_drive_t *drive)
@@ -408,8 +407,9 @@
/* more than enough time */
udelay(10);
/* clear SRST, leave nIEN (unless device is on the quirk list) */
- tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) |
- ATA_DEVCTL_OBS);
+ tp_ops->write_devctl(hwif,
+ ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) ? 0 : ATA_NIEN) |
+ ATA_DEVCTL_OBS);
/* more than enough time */
udelay(10);
hwif->poll_timeout = jiffies + WAIT_WORSTCASE;
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 4b6b71e..2141190 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -287,6 +287,19 @@
return ret;
}
+static unsigned long long ide_gd_set_capacity(struct gendisk *disk,
+ unsigned long long capacity)
+{
+ struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
+ ide_drive_t *drive = idkp->drive;
+ const struct ide_disk_ops *disk_ops = drive->disk_ops;
+
+ if (disk_ops->set_capacity)
+ return disk_ops->set_capacity(drive, capacity);
+
+ return drive->capacity64;
+}
+
static int ide_gd_revalidate_disk(struct gendisk *disk)
{
struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
@@ -315,6 +328,7 @@
.locked_ioctl = ide_gd_ioctl,
.getgeo = ide_gd_getgeo,
.media_changed = ide_gd_media_changed,
+ .set_capacity = ide_gd_set_capacity,
.revalidate_disk = ide_gd_revalidate_disk
};
diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index 7812ca0..54d7c46 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -29,6 +29,7 @@
static const struct ide_port_info ide_generic_port_info = {
.host_flags = IDE_HFLAG_NO_DMA,
+ .chipset = ide_generic,
};
#ifdef CONFIG_ARM
@@ -85,7 +86,7 @@
static int __init ide_generic_init(void)
{
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
unsigned long io_addr;
int i, rc = 0, primary = 0, secondary = 0;
@@ -132,9 +133,7 @@
#else
hw.irq = legacy_irqs[i];
#endif
- hw.chipset = ide_generic;
-
- rc = ide_host_add(&ide_generic_port_info, hws, NULL);
+ rc = ide_host_add(&ide_generic_port_info, hws, 1, NULL);
if (rc) {
release_region(io_addr + 0x206, 1);
release_region(io_addr, 8);
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index c06ebdc..520f42c 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -64,26 +64,26 @@
#define H8300_IDE_GAP (2)
-static inline void hw_setup(hw_regs_t *hw)
+static inline void hw_setup(struct ide_hw *hw)
{
int i;
- memset(hw, 0, sizeof(hw_regs_t));
+ memset(hw, 0, sizeof(*hw));
for (i = 0; i <= 7; i++)
hw->io_ports_array[i] = CONFIG_H8300_IDE_BASE + H8300_IDE_GAP*i;
hw->io_ports.ctl_addr = CONFIG_H8300_IDE_ALT;
hw->irq = EXT_IRQ0 + CONFIG_H8300_IDE_IRQ;
- hw->chipset = ide_generic;
}
static const struct ide_port_info h8300_port_info = {
.tp_ops = &h8300_tp_ops,
.host_flags = IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_NO_DMA,
+ .chipset = ide_generic,
};
static int __init h8300_ide_init(void)
{
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
printk(KERN_INFO DRV_NAME ": H8/300 generic IDE interface\n");
@@ -96,7 +96,7 @@
hw_setup(&hw);
- return ide_host_add(&h8300_port_info, hws, NULL);
+ return ide_host_add(&h8300_port_info, hws, 1, NULL);
out_busy:
printk(KERN_ERR "ide-h8300: IDE I/F resource already used.\n");
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index bba4297..272cc38 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -184,29 +184,42 @@
tf->command = ATA_CMD_SET_MULTI;
}
-static ide_startstop_t ide_disk_special(ide_drive_t *drive)
+/**
+ * do_special - issue some special commands
+ * @drive: drive the command is for
+ *
+ * do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS,
+ * ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive.
+ */
+
+static ide_startstop_t do_special(ide_drive_t *drive)
{
- special_t *s = &drive->special;
struct ide_cmd cmd;
+#ifdef DEBUG
+ printk(KERN_DEBUG "%s: %s: 0x%02x\n", drive->name, __func__,
+ drive->special_flags);
+#endif
+ if (drive->media != ide_disk) {
+ drive->special_flags = 0;
+ drive->mult_req = 0;
+ return ide_stopped;
+ }
+
memset(&cmd, 0, sizeof(cmd));
cmd.protocol = ATA_PROT_NODATA;
- if (s->b.set_geometry) {
- s->b.set_geometry = 0;
+ if (drive->special_flags & IDE_SFLAG_SET_GEOMETRY) {
+ drive->special_flags &= ~IDE_SFLAG_SET_GEOMETRY;
ide_tf_set_specify_cmd(drive, &cmd.tf);
- } else if (s->b.recalibrate) {
- s->b.recalibrate = 0;
+ } else if (drive->special_flags & IDE_SFLAG_RECALIBRATE) {
+ drive->special_flags &= ~IDE_SFLAG_RECALIBRATE;
ide_tf_set_restore_cmd(drive, &cmd.tf);
- } else if (s->b.set_multmode) {
- s->b.set_multmode = 0;
+ } else if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) {
+ drive->special_flags &= ~IDE_SFLAG_SET_MULTMODE;
ide_tf_set_setmult_cmd(drive, &cmd.tf);
- } else if (s->all) {
- int special = s->all;
- s->all = 0;
- printk(KERN_ERR "%s: bad special flag: 0x%02x\n", drive->name, special);
- return ide_stopped;
- }
+ } else
+ BUG();
cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
@@ -217,31 +230,6 @@
return ide_started;
}
-/**
- * do_special - issue some special commands
- * @drive: drive the command is for
- *
- * do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS,
- * ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive.
- *
- * It used to do much more, but has been scaled back.
- */
-
-static ide_startstop_t do_special (ide_drive_t *drive)
-{
- special_t *s = &drive->special;
-
-#ifdef DEBUG
- printk("%s: do_special: 0x%02x\n", drive->name, s->all);
-#endif
- if (drive->media == ide_disk)
- return ide_disk_special(drive);
-
- s->all = 0;
- drive->mult_req = 0;
- return ide_stopped;
-}
-
void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
{
ide_hwif_t *hwif = drive->hwif;
@@ -351,7 +339,8 @@
printk(KERN_ERR "%s: drive not ready for command\n", drive->name);
return startstop;
}
- if (!drive->special.all) {
+
+ if (drive->special_flags == 0) {
struct ide_driver *drv;
/*
@@ -499,11 +488,15 @@
if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) &&
hwif != prev_port) {
+ ide_drive_t *cur_dev =
+ prev_port ? prev_port->cur_dev : NULL;
+
/*
* set nIEN for previous port, drives in the
- * quirk_list may not like intr setups/cleanups
+ * quirk list may not like intr setups/cleanups
*/
- if (prev_port && prev_port->cur_dev->quirk_list == 0)
+ if (cur_dev &&
+ (cur_dev->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0)
prev_port->tp_ops->write_devctl(prev_port,
ATA_NIEN |
ATA_DEVCTL_OBS);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 06fe002..fa04715 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -282,6 +282,29 @@
return 0;
}
+static const char *nien_quirk_list[] = {
+ "QUANTUM FIREBALLlct08 08",
+ "QUANTUM FIREBALLP KA6.4",
+ "QUANTUM FIREBALLP KA9.1",
+ "QUANTUM FIREBALLP KX13.6",
+ "QUANTUM FIREBALLP KX20.5",
+ "QUANTUM FIREBALLP KX27.3",
+ "QUANTUM FIREBALLP LM20.4",
+ "QUANTUM FIREBALLP LM20.5",
+ NULL
+};
+
+void ide_check_nien_quirk_list(ide_drive_t *drive)
+{
+ const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
+
+ for (list = nien_quirk_list; *list != NULL; list++)
+ if (strstr(m, *list) != NULL) {
+ drive->dev_flags |= IDE_DFLAG_NIEN_QUIRK;
+ return;
+ }
+}
+
int ide_driveid_update(ide_drive_t *drive)
{
u16 *id;
@@ -311,7 +334,6 @@
return 1;
out_err:
- SELECT_MASK(drive, 0);
if (rc == 2)
printk(KERN_ERR "%s: %s: bad status\n", drive->name, __func__);
kfree(id);
@@ -365,7 +387,7 @@
tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES);
- if (drive->quirk_list == 2)
+ if (drive->dev_flags & IDE_DFLAG_NIEN_QUIRK)
tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
error = __ide_wait_stat(drive, drive->ready_stat,
diff --git a/drivers/ide/ide-legacy.c b/drivers/ide/ide-legacy.c
index 8c5dcbf..b9654a7 100644
--- a/drivers/ide/ide-legacy.c
+++ b/drivers/ide/ide-legacy.c
@@ -1,7 +1,7 @@
#include <linux/kernel.h>
#include <linux/ide.h>
-static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw,
+static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw,
u8 port_no, const struct ide_port_info *d,
unsigned long config)
{
@@ -33,7 +33,6 @@
ide_std_init_ports(hw, base, ctl);
hw->irq = irq;
- hw->chipset = d->chipset;
hw->config = config;
hws[port_no] = hw;
@@ -41,7 +40,7 @@
int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config)
{
- hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[2], *hws[] = { NULL, NULL };
memset(&hw, 0, sizeof(hw));
@@ -53,6 +52,6 @@
(d->host_flags & IDE_HFLAG_SINGLE))
return -ENOENT;
- return ide_host_add(d, hws, NULL);
+ return ide_host_add(d, hws, 2, NULL);
}
EXPORT_SYMBOL_GPL(ide_legacy_device_add);
diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c
index 6e80b77..017b1df 100644
--- a/drivers/ide/ide-pnp.c
+++ b/drivers/ide/ide-pnp.c
@@ -29,6 +29,7 @@
static const struct ide_port_info ide_pnp_port_info = {
.host_flags = IDE_HFLAG_NO_DMA,
+ .chipset = ide_generic,
};
static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id)
@@ -36,7 +37,7 @@
struct ide_host *host;
unsigned long base, ctl;
int rc;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
printk(KERN_INFO DRV_NAME ": generic PnP IDE interface\n");
@@ -62,9 +63,8 @@
memset(&hw, 0, sizeof(hw));
ide_std_init_ports(&hw, base, ctl);
hw.irq = pnp_irq(dev, 0);
- hw.chipset = ide_generic;
- rc = ide_host_add(&ide_pnp_port_info, hws, &host);
+ rc = ide_host_add(&ide_pnp_port_info, hws, 1, &host);
if (rc)
goto out;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index c895ed5..f371b0d 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -97,7 +97,7 @@
drive->mult_req = id[ATA_ID_MULTSECT] & 0xff;
if (drive->mult_req)
- drive->special.b.set_multmode = 1;
+ drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
}
}
@@ -465,23 +465,8 @@
int rc;
u8 cmd;
- /*
- * In order to keep things simple we have an id
- * block for all drives at all times. If the device
- * is pre ATA or refuses ATA/ATAPI identify we
- * will add faked data to this.
- *
- * Also note that 0 everywhere means "can't do X"
- */
-
drive->dev_flags &= ~IDE_DFLAG_ID_READ;
- drive->id = kzalloc(SECTOR_SIZE, GFP_KERNEL);
- if (drive->id == NULL) {
- printk(KERN_ERR "ide: out of memory for id data.\n");
- return 0;
- }
-
m = (char *)&drive->id[ATA_ID_PROD];
strcpy(m, "UNKNOWN");
@@ -497,7 +482,7 @@
}
if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
- goto out_free;
+ return 0;
/* identification failed? */
if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) {
@@ -521,7 +506,7 @@
}
if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
- goto out_free;
+ return 0;
/* The drive wasn't being helpful. Add generic info only */
if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) {
@@ -535,9 +520,6 @@
}
return 1;
-out_free:
- kfree(drive->id);
- return 0;
}
static void hwif_release_dev(struct device *dev)
@@ -702,8 +684,14 @@
if (irqd)
disable_irq(hwif->irq);
- if (ide_port_wait_ready(hwif) == -EBUSY)
- printk(KERN_DEBUG "%s: Wait for ready failed before probe !\n", hwif->name);
+ rc = ide_port_wait_ready(hwif);
+ if (rc == -ENODEV) {
+ printk(KERN_INFO "%s: no devices on the port\n", hwif->name);
+ goto out;
+ } else if (rc == -EBUSY)
+ printk(KERN_ERR "%s: not ready before the probe\n", hwif->name);
+ else
+ rc = -ENODEV;
/*
* Second drive should only exist if first drive was found,
@@ -714,7 +702,7 @@
if (drive->dev_flags & IDE_DFLAG_PRESENT)
rc = 0;
}
-
+out:
/*
* Use cached IRQ number. It might be (and is...) changed by probe
* code above
@@ -732,6 +720,8 @@
int i;
ide_port_for_each_present_dev(i, drive, hwif) {
+ ide_check_nien_quirk_list(drive);
+
if (port_ops && port_ops->quirkproc)
port_ops->quirkproc(drive);
}
@@ -817,8 +807,6 @@
if (ide_init_queue(drive)) {
printk(KERN_ERR "ide: failed to init %s\n",
drive->name);
- kfree(drive->id);
- drive->id = NULL;
drive->dev_flags &= ~IDE_DFLAG_PRESENT;
continue;
}
@@ -947,9 +935,6 @@
blk_cleanup_queue(drive->queue);
drive->queue = NULL;
- kfree(drive->id);
- drive->id = NULL;
-
drive->dev_flags &= ~IDE_DFLAG_PRESENT;
complete(&drive->gendev_rel_comp);
@@ -1035,6 +1020,15 @@
if (port_ops && port_ops->init_dev)
port_ops->init_dev(drive);
}
+
+ ide_port_for_each_dev(i, drive, hwif) {
+ /*
+ * default to PIO Mode 0 before we figure out
+ * the most suited mode for the attached device
+ */
+ if (port_ops && port_ops->set_pio_mode)
+ port_ops->set_pio_mode(drive, 0);
+ }
}
static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
@@ -1042,8 +1036,7 @@
{
hwif->channel = port;
- if (d->chipset)
- hwif->chipset = d->chipset;
+ hwif->chipset = d->chipset ? d->chipset : ide_pci;
if (d->init_iops)
d->init_iops(hwif);
@@ -1124,16 +1117,19 @@
ide_port_for_each_dev(i, drive, hwif) {
u8 j = (hwif->index * MAX_DRIVES) + i;
+ u16 *saved_id = drive->id;
memset(drive, 0, sizeof(*drive));
+ memset(saved_id, 0, SECTOR_SIZE);
+ drive->id = saved_id;
drive->media = ide_disk;
drive->select = (i << 4) | ATA_DEVICE_OBS;
drive->hwif = hwif;
drive->ready_stat = ATA_DRDY;
drive->bad_wstat = BAD_W_STAT;
- drive->special.b.recalibrate = 1;
- drive->special.b.set_geometry = 1;
+ drive->special_flags = IDE_SFLAG_RECALIBRATE |
+ IDE_SFLAG_SET_GEOMETRY;
drive->name[0] = 'h';
drive->name[1] = 'd';
drive->name[2] = 'a' + j;
@@ -1168,11 +1164,10 @@
ide_port_init_devices_data(hwif);
}
-static void ide_init_port_hw(ide_hwif_t *hwif, hw_regs_t *hw)
+static void ide_init_port_hw(ide_hwif_t *hwif, struct ide_hw *hw)
{
memcpy(&hwif->io_ports, &hw->io_ports, sizeof(hwif->io_ports));
hwif->irq = hw->irq;
- hwif->chipset = hw->chipset;
hwif->dev = hw->dev;
hwif->gendev.parent = hw->parent ? hw->parent : hw->dev;
hwif->ack_intr = hw->ack_intr;
@@ -1233,8 +1228,10 @@
ide_drive_t *drive;
int i;
- ide_port_for_each_dev(i, drive, hwif)
+ ide_port_for_each_dev(i, drive, hwif) {
+ kfree(drive->id);
kfree(drive);
+ }
}
static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
@@ -1248,6 +1245,18 @@
if (drive == NULL)
goto out_nomem;
+ /*
+ * In order to keep things simple we have an id
+ * block for all drives at all times. If the device
+ * is pre ATA or refuses ATA/ATAPI identify we
+ * will add faked data to this.
+ *
+ * Also note that 0 everywhere means "can't do X"
+ */
+ drive->id = kzalloc_node(SECTOR_SIZE, GFP_KERNEL, node);
+ if (drive->id == NULL)
+ goto out_nomem;
+
hwif->devices[i] = drive;
}
return 0;
@@ -1257,7 +1266,8 @@
return -ENOMEM;
}
-struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws)
+struct ide_host *ide_host_alloc(const struct ide_port_info *d,
+ struct ide_hw **hws, unsigned int n_ports)
{
struct ide_host *host;
struct device *dev = hws[0] ? hws[0]->dev : NULL;
@@ -1268,7 +1278,7 @@
if (host == NULL)
return NULL;
- for (i = 0; i < MAX_HOST_PORTS; i++) {
+ for (i = 0; i < n_ports; i++) {
ide_hwif_t *hwif;
int idx;
@@ -1288,6 +1298,7 @@
if (idx < 0) {
printk(KERN_ERR "%s: no free slot for interface\n",
d ? d->name : "ide");
+ ide_port_free_devices(hwif);
kfree(hwif);
continue;
}
@@ -1344,7 +1355,7 @@
}
int ide_host_register(struct ide_host *host, const struct ide_port_info *d,
- hw_regs_t **hws)
+ struct ide_hw **hws)
{
ide_hwif_t *hwif, *mate = NULL;
int i, j = 0;
@@ -1438,13 +1449,13 @@
}
EXPORT_SYMBOL_GPL(ide_host_register);
-int ide_host_add(const struct ide_port_info *d, hw_regs_t **hws,
- struct ide_host **hostp)
+int ide_host_add(const struct ide_port_info *d, struct ide_hw **hws,
+ unsigned int n_ports, struct ide_host **hostp)
{
struct ide_host *host;
int rc;
- host = ide_host_alloc(d, hws);
+ host = ide_host_alloc(d, hws, n_ports);
if (host == NULL)
return -ENOMEM;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d9764f0..4b447a8 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -240,18 +240,27 @@
static void ide_tape_release(struct device *);
-static struct ide_tape_obj *ide_tape_get(struct gendisk *disk)
+static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES];
+
+static struct ide_tape_obj *ide_tape_get(struct gendisk *disk, bool cdev,
+ unsigned int i)
{
struct ide_tape_obj *tape = NULL;
mutex_lock(&idetape_ref_mutex);
- tape = ide_drv_g(disk, ide_tape_obj);
+
+ if (cdev)
+ tape = idetape_devs[i];
+ else
+ tape = ide_drv_g(disk, ide_tape_obj);
+
if (tape) {
if (ide_device_get(tape->drive))
tape = NULL;
else
get_device(&tape->dev);
}
+
mutex_unlock(&idetape_ref_mutex);
return tape;
}
@@ -267,24 +276,6 @@
}
/*
- * The variables below are used for the character device interface. Additional
- * state variables are defined in our ide_drive_t structure.
- */
-static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES];
-
-static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
-{
- struct ide_tape_obj *tape = NULL;
-
- mutex_lock(&idetape_ref_mutex);
- tape = idetape_devs[i];
- if (tape)
- get_device(&tape->dev);
- mutex_unlock(&idetape_ref_mutex);
- return tape;
-}
-
-/*
* called on each failed packet command retry to analyze the request sense. We
* currently do not utilize this information.
*/
@@ -397,7 +388,8 @@
if (readpos[0] & 0x4) {
printk(KERN_INFO "ide-tape: Block location is unknown"
"to the tape\n");
- clear_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_ADDRESS_VALID),
+ &drive->atapi_flags);
uptodate = 0;
err = IDE_DRV_ERROR_GENERAL;
} else {
@@ -406,7 +398,8 @@
tape->partition = readpos[1];
tape->first_frame = be32_to_cpup((__be32 *)&readpos[4]);
- set_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags);
+ set_bit(ilog2(IDE_AFLAG_ADDRESS_VALID),
+ &drive->atapi_flags);
}
}
@@ -656,15 +649,15 @@
if ((drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) == 0 &&
(rq->cmd[13] & REQ_IDETAPE_PC2) == 0)
- set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
+ drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
if (drive->dev_flags & IDE_DFLAG_POST_RESET) {
- set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
+ drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
drive->dev_flags &= ~IDE_DFLAG_POST_RESET;
}
- if (!test_and_clear_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags) &&
- (stat & ATA_DSC) == 0) {
+ if (!(drive->atapi_flags & IDE_AFLAG_IGNORE_DSC) &&
+ !(stat & ATA_DSC)) {
if (postponed_rq == NULL) {
tape->dsc_polling_start = jiffies;
tape->dsc_poll_freq = tape->best_dsc_rw_freq;
@@ -684,7 +677,9 @@
tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW;
idetape_postpone_request(drive);
return ide_stopped;
- }
+ } else
+ drive->atapi_flags &= ~IDE_AFLAG_IGNORE_DSC;
+
if (rq->cmd[13] & REQ_IDETAPE_READ) {
pc = &tape->queued_pc;
ide_tape_create_rw_cmd(tape, pc, rq, READ_6);
@@ -744,7 +739,7 @@
int load_attempted = 0;
/* Wait for the tape to become ready */
- set_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags);
+ set_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT), &drive->atapi_flags);
timeout += jiffies;
while (time_before(jiffies, timeout)) {
if (ide_do_test_unit_ready(drive, disk) == 0)
@@ -820,7 +815,7 @@
if (tape->chrdev_dir != IDETAPE_DIR_READ)
return;
- clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags);
tape->valid = 0;
if (tape->buf != NULL) {
kfree(tape->buf);
@@ -1113,7 +1108,8 @@
if (tape->chrdev_dir == IDETAPE_DIR_READ) {
tape->valid = 0;
- if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+ if (test_and_clear_bit(ilog2(IDE_AFLAG_FILEMARK),
+ &drive->atapi_flags))
++count;
ide_tape_discard_merge_buffer(drive, 0);
}
@@ -1168,7 +1164,7 @@
debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
if (tape->chrdev_dir != IDETAPE_DIR_READ) {
- if (test_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags))
+ if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags))
if (count > tape->blk_size &&
(count % tape->blk_size) == 0)
tape->user_bs_factor = count / tape->blk_size;
@@ -1184,7 +1180,8 @@
/* refill if staging buffer is empty */
if (!tape->valid) {
/* If we are at a filemark, nothing more to read */
- if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+ if (test_bit(ilog2(IDE_AFLAG_FILEMARK),
+ &drive->atapi_flags))
break;
/* read */
if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
@@ -1202,7 +1199,7 @@
done += todo;
}
- if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+ if (!done && test_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags)) {
debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
idetape_space_over_filemarks(drive, MTFSF, 1);
@@ -1336,7 +1333,8 @@
ide_tape_discard_merge_buffer(drive, 0);
retval = ide_do_start_stop(drive, disk, !IDETAPE_LU_LOAD_MASK);
if (!retval)
- clear_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT),
+ &drive->atapi_flags);
return retval;
case MTNOP:
ide_tape_discard_merge_buffer(drive, 0);
@@ -1358,9 +1356,11 @@
mt_count % tape->blk_size)
return -EIO;
tape->user_bs_factor = mt_count / tape->blk_size;
- clear_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_DETECT_BS),
+ &drive->atapi_flags);
} else
- set_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags);
+ set_bit(ilog2(IDE_AFLAG_DETECT_BS),
+ &drive->atapi_flags);
return 0;
case MTSEEK:
ide_tape_discard_merge_buffer(drive, 0);
@@ -1486,7 +1486,7 @@
return -ENXIO;
lock_kernel();
- tape = ide_tape_chrdev_get(i);
+ tape = ide_tape_get(NULL, true, i);
if (!tape) {
unlock_kernel();
return -ENXIO;
@@ -1505,20 +1505,20 @@
filp->private_data = tape;
- if (test_and_set_bit(IDE_AFLAG_BUSY, &drive->atapi_flags)) {
+ if (test_and_set_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags)) {
retval = -EBUSY;
goto out_put_tape;
}
retval = idetape_wait_ready(drive, 60 * HZ);
if (retval) {
- clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
printk(KERN_ERR "ide-tape: %s: drive not ready\n", tape->name);
goto out_put_tape;
}
idetape_read_position(drive);
- if (!test_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags))
+ if (!test_bit(ilog2(IDE_AFLAG_ADDRESS_VALID), &drive->atapi_flags))
(void)idetape_rewind_tape(drive);
/* Read block size and write protect status from drive. */
@@ -1534,7 +1534,7 @@
if (tape->write_prot) {
if ((filp->f_flags & O_ACCMODE) == O_WRONLY ||
(filp->f_flags & O_ACCMODE) == O_RDWR) {
- clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
retval = -EROFS;
goto out_put_tape;
}
@@ -1591,15 +1591,17 @@
ide_tape_discard_merge_buffer(drive, 1);
}
- if (minor < 128 && test_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags))
+ if (minor < 128 && test_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT),
+ &drive->atapi_flags))
(void) idetape_rewind_tape(drive);
+
if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
if (tape->door_locked == DOOR_LOCKED) {
if (!ide_set_media_lock(drive, tape->disk, 0))
tape->door_locked = DOOR_UNLOCKED;
}
}
- clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags);
+ clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
ide_tape_put(tape);
unlock_kernel();
return 0;
@@ -1905,7 +1907,7 @@
static int idetape_open(struct block_device *bdev, fmode_t mode)
{
- struct ide_tape_obj *tape = ide_tape_get(bdev->bd_disk);
+ struct ide_tape_obj *tape = ide_tape_get(bdev->bd_disk, false, 0);
if (!tape)
return -ENXIO;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index a0c3e1b..75b85a8 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -98,7 +98,6 @@
if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) {
ide_tf_dump(drive->name, cmd);
tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
- SELECT_MASK(drive, 0);
if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
u8 data[2] = { cmd->tf.data, cmd->hob.data };
@@ -166,7 +165,7 @@
if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) {
if (custom && tf->command == ATA_CMD_SET_MULTI) {
drive->mult_req = drive->mult_count = 0;
- drive->special.b.recalibrate = 1;
+ drive->special_flags |= IDE_SFLAG_RECALIBRATE;
(void)ide_dump_status(drive, __func__, stat);
return ide_stopped;
} else if (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) {
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 92c9b90..16d0569 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -211,6 +211,11 @@
module_param_call(noflush, ide_set_dev_param_mask, NULL, &ide_noflush, 0);
MODULE_PARM_DESC(noflush, "disable flush requests for a device");
+static unsigned int ide_nohpa;
+
+module_param_call(nohpa, ide_set_dev_param_mask, NULL, &ide_nohpa, 0);
+MODULE_PARM_DESC(nohpa, "disable Host Protected Area for a device");
+
static unsigned int ide_noprobe;
module_param_call(noprobe, ide_set_dev_param_mask, NULL, &ide_noprobe, 0);
@@ -281,6 +286,11 @@
drive->name);
drive->dev_flags |= IDE_DFLAG_NOFLUSH;
}
+ if (ide_nohpa & (1 << i)) {
+ printk(KERN_INFO "ide: disabling Host Protected Area for %s\n",
+ drive->name);
+ drive->dev_flags |= IDE_DFLAG_NOHPA;
+ }
if (ide_noprobe & (1 << i)) {
printk(KERN_INFO "ide: skipping probe for %s\n", drive->name);
drive->dev_flags |= IDE_DFLAG_NOPROBE;
diff --git a/drivers/ide/ide_platform.c b/drivers/ide/ide_platform.c
index 051b4ab..ee9b55e 100644
--- a/drivers/ide/ide_platform.c
+++ b/drivers/ide/ide_platform.c
@@ -21,7 +21,7 @@
#include <linux/platform_device.h>
#include <linux/io.h>
-static void __devinit plat_ide_setup_ports(hw_regs_t *hw,
+static void __devinit plat_ide_setup_ports(struct ide_hw *hw,
void __iomem *base,
void __iomem *ctrl,
struct pata_platform_info *pdata,
@@ -40,12 +40,11 @@
hw->io_ports.ctl_addr = (unsigned long)ctrl;
hw->irq = irq;
-
- hw->chipset = ide_generic;
}
static const struct ide_port_info platform_ide_port_info = {
.host_flags = IDE_HFLAG_NO_DMA,
+ .chipset = ide_generic,
};
static int __devinit plat_ide_probe(struct platform_device *pdev)
@@ -55,7 +54,7 @@
struct pata_platform_info *pdata;
struct ide_host *host;
int ret = 0, mmio = 0;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
struct ide_port_info d = platform_ide_port_info;
pdata = pdev->dev.platform_data;
@@ -99,7 +98,7 @@
if (mmio)
d.host_flags |= IDE_HFLAG_MMIO;
- ret = ide_host_add(&d, hws, &host);
+ ret = ide_host_add(&d, hws, 1, &host);
if (ret)
goto out;
diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c
index 4b1718e..1447c8c 100644
--- a/drivers/ide/macide.c
+++ b/drivers/ide/macide.c
@@ -62,7 +62,7 @@
return 0;
}
-static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base,
+static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base,
int irq, ide_ack_intr_t *ack_intr)
{
int i;
@@ -76,13 +76,12 @@
hw->irq = irq;
hw->ack_intr = ack_intr;
-
- hw->chipset = ide_generic;
}
static const struct ide_port_info macide_port_info = {
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_generic,
};
static const char *mac_ide_name[] =
@@ -97,7 +96,7 @@
ide_ack_intr_t *ack_intr;
unsigned long base;
int irq;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
if (!MACH_IS_MAC)
return -ENODEV;
@@ -127,7 +126,7 @@
macide_setup_ports(&hw, base, irq, ack_intr);
- return ide_host_add(&macide_port_info, hws, NULL);
+ return ide_host_add(&macide_port_info, hws, 1, NULL);
}
module_init(macide_init);
diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c
index 09d813d..3c1dc01 100644
--- a/drivers/ide/palm_bk3710.c
+++ b/drivers/ide/palm_bk3710.c
@@ -306,6 +306,7 @@
.host_flags = IDE_HFLAG_MMIO,
.pio_mask = ATA_PIO4,
.mwdma_mask = ATA_MWDMA2,
+ .chipset = ide_palm3710,
};
static int __init palm_bk3710_probe(struct platform_device *pdev)
@@ -315,7 +316,7 @@
void __iomem *base;
unsigned long rate, mem_size;
int i, rc;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
clk = clk_get(&pdev->dev, "IDECLK");
if (IS_ERR(clk))
@@ -363,13 +364,12 @@
(base + IDE_PALM_ATA_PRI_CTL_OFFSET);
hw.irq = irq->start;
hw.dev = &pdev->dev;
- hw.chipset = ide_palm3710;
palm_bk3710_port_info.udma_mask = rate < 100000000 ? ATA_UDMA4 :
ATA_UDMA5;
/* Register the IDE interface with Linux */
- rc = ide_host_add(&palm_bk3710_port_info, hws, NULL);
+ rc = ide_host_add(&palm_bk3710_port_info, hws, 1, NULL);
if (rc)
goto out;
diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c
index b68906c..65ba823 100644
--- a/drivers/ide/pdc202xx_new.c
+++ b/drivers/ide/pdc202xx_new.c
@@ -40,18 +40,6 @@
#define DBG(fmt, args...)
#endif
-static const char *pdc_quirk_drives[] = {
- "QUANTUM FIREBALLlct08 08",
- "QUANTUM FIREBALLP KA6.4",
- "QUANTUM FIREBALLP KA9.1",
- "QUANTUM FIREBALLP LM20.4",
- "QUANTUM FIREBALLP KX13.6",
- "QUANTUM FIREBALLP KX20.5",
- "QUANTUM FIREBALLP KX27.3",
- "QUANTUM FIREBALLP LM20.5",
- NULL
-};
-
static u8 max_dma_rate(struct pci_dev *pdev)
{
u8 mode;
@@ -200,19 +188,6 @@
return ATA_CBL_PATA80;
}
-static void pdcnew_quirkproc(ide_drive_t *drive)
-{
- const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
-
- for (list = pdc_quirk_drives; *list != NULL; list++)
- if (strstr(m, *list) != NULL) {
- drive->quirk_list = 2;
- return;
- }
-
- drive->quirk_list = 0;
-}
-
static void pdcnew_reset(ide_drive_t *drive)
{
/*
@@ -473,7 +448,6 @@
static const struct ide_port_ops pdcnew_port_ops = {
.set_pio_mode = pdcnew_set_pio_mode,
.set_dma_mode = pdcnew_set_dma_mode,
- .quirkproc = pdcnew_quirkproc,
.resetproc = pdcnew_reset,
.cable_detect = pdcnew_cable_detect,
};
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index e24ecc8..b6abf7e 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -23,18 +23,6 @@
#define PDC202XX_DEBUG_DRIVE_INFO 0
-static const char *pdc_quirk_drives[] = {
- "QUANTUM FIREBALLlct08 08",
- "QUANTUM FIREBALLP KA6.4",
- "QUANTUM FIREBALLP KA9.1",
- "QUANTUM FIREBALLP LM20.4",
- "QUANTUM FIREBALLP KX13.6",
- "QUANTUM FIREBALLP KX20.5",
- "QUANTUM FIREBALLP KX27.3",
- "QUANTUM FIREBALLP LM20.5",
- NULL
-};
-
static void pdc_old_disable_66MHz_clock(ide_hwif_t *);
static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed)
@@ -151,19 +139,6 @@
outb(clock & ~(hwif->channel ? 0x08 : 0x02), clock_reg);
}
-static void pdc202xx_quirkproc(ide_drive_t *drive)
-{
- const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
-
- for (list = pdc_quirk_drives; *list != NULL; list++)
- if (strstr(m, *list) != NULL) {
- drive->quirk_list = 2;
- return;
- }
-
- drive->quirk_list = 0;
-}
-
static void pdc202xx_dma_start(ide_drive_t *drive)
{
if (drive->current_speed > XFER_UDMA_2)
@@ -203,52 +178,6 @@
return ide_dma_end(drive);
}
-static int pdc202xx_dma_test_irq(ide_drive_t *drive)
-{
- ide_hwif_t *hwif = drive->hwif;
- unsigned long high_16 = hwif->extra_base - 16;
- u8 dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
- u8 sc1d = inb(high_16 + 0x001d);
-
- if (hwif->channel) {
- /* bit7: Error, bit6: Interrupting, bit5: FIFO Full, bit4: FIFO Empty */
- if ((sc1d & 0x50) == 0x50)
- goto somebody_else;
- else if ((sc1d & 0x40) == 0x40)
- return (dma_stat & 4) == 4;
- } else {
- /* bit3: Error, bit2: Interrupting, bit1: FIFO Full, bit0: FIFO Empty */
- if ((sc1d & 0x05) == 0x05)
- goto somebody_else;
- else if ((sc1d & 0x04) == 0x04)
- return (dma_stat & 4) == 4;
- }
-somebody_else:
- return (dma_stat & 4) == 4; /* return 1 if INTR asserted */
-}
-
-static void pdc202xx_reset(ide_drive_t *drive)
-{
- ide_hwif_t *hwif = drive->hwif;
- unsigned long high_16 = hwif->extra_base - 16;
- u8 udma_speed_flag = inb(high_16 | 0x001f);
-
- printk(KERN_WARNING "PDC202xx: software reset...\n");
-
- outb(udma_speed_flag | 0x10, high_16 | 0x001f);
- mdelay(100);
- outb(udma_speed_flag & ~0x10, high_16 | 0x001f);
- mdelay(2000); /* 2 seconds ?! */
-
- ide_set_max_pio(drive);
-}
-
-static void pdc202xx_dma_lost_irq(ide_drive_t *drive)
-{
- pdc202xx_reset(drive);
- ide_dma_lost_irq(drive);
-}
-
static int init_chipset_pdc202xx(struct pci_dev *dev)
{
unsigned long dmabase = pci_resource_start(dev, 4);
@@ -302,37 +231,22 @@
static const struct ide_port_ops pdc20246_port_ops = {
.set_pio_mode = pdc202xx_set_pio_mode,
.set_dma_mode = pdc202xx_set_mode,
- .quirkproc = pdc202xx_quirkproc,
};
static const struct ide_port_ops pdc2026x_port_ops = {
.set_pio_mode = pdc202xx_set_pio_mode,
.set_dma_mode = pdc202xx_set_mode,
- .quirkproc = pdc202xx_quirkproc,
- .resetproc = pdc202xx_reset,
.cable_detect = pdc2026x_cable_detect,
};
-static const struct ide_dma_ops pdc20246_dma_ops = {
- .dma_host_set = ide_dma_host_set,
- .dma_setup = ide_dma_setup,
- .dma_start = ide_dma_start,
- .dma_end = ide_dma_end,
- .dma_test_irq = pdc202xx_dma_test_irq,
- .dma_lost_irq = ide_dma_lost_irq,
- .dma_timer_expiry = ide_dma_sff_timer_expiry,
- .dma_sff_read_status = ide_dma_sff_read_status,
-};
-
static const struct ide_dma_ops pdc2026x_dma_ops = {
.dma_host_set = ide_dma_host_set,
.dma_setup = ide_dma_setup,
.dma_start = pdc202xx_dma_start,
.dma_end = pdc202xx_dma_end,
- .dma_test_irq = pdc202xx_dma_test_irq,
- .dma_lost_irq = pdc202xx_dma_lost_irq,
+ .dma_test_irq = ide_dma_test_irq,
+ .dma_lost_irq = ide_dma_lost_irq,
.dma_timer_expiry = ide_dma_sff_timer_expiry,
- .dma_clear = pdc202xx_reset,
.dma_sff_read_status = ide_dma_sff_read_status,
};
@@ -354,7 +268,7 @@
.name = DRV_NAME,
.init_chipset = init_chipset_pdc202xx,
.port_ops = &pdc20246_port_ops,
- .dma_ops = &pdc20246_dma_ops,
+ .dma_ops = &sff_dma_ops,
.host_flags = IDE_HFLAGS_PDC202XX,
.pio_mask = ATA_PIO4,
.mwdma_mask = ATA_MWDMA2,
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index f76e4e6..97642a7a 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1023,13 +1023,14 @@
* Setup, register & probe an IDE channel driven by this driver, this is
* called by one of the 2 probe functions (macio or PCI).
*/
-static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw)
+static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif,
+ struct ide_hw *hw)
{
struct device_node *np = pmif->node;
const int *bidp;
struct ide_host *host;
ide_hwif_t *hwif;
- hw_regs_t *hws[] = { hw, NULL, NULL, NULL };
+ struct ide_hw *hws[] = { hw };
struct ide_port_info d = pmac_port_info;
int rc;
@@ -1077,7 +1078,7 @@
/* Make sure we have sane timings */
sanitize_timings(pmif);
- host = ide_host_alloc(&d, hws);
+ host = ide_host_alloc(&d, hws, 1);
if (host == NULL)
return -ENOMEM;
hwif = host->ports[0];
@@ -1124,7 +1125,7 @@
return 0;
}
-static void __devinit pmac_ide_init_ports(hw_regs_t *hw, unsigned long base)
+static void __devinit pmac_ide_init_ports(struct ide_hw *hw, unsigned long base)
{
int i;
@@ -1144,7 +1145,7 @@
unsigned long regbase;
pmac_ide_hwif_t *pmif;
int irq, rc;
- hw_regs_t hw;
+ struct ide_hw hw;
pmif = kzalloc(sizeof(*pmif), GFP_KERNEL);
if (pmif == NULL)
@@ -1268,7 +1269,7 @@
void __iomem *base;
unsigned long rbase, rlen;
int rc;
- hw_regs_t hw;
+ struct ide_hw hw;
np = pci_device_to_OF_node(pdev);
if (np == NULL) {
diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c
index c793466..ab49a97 100644
--- a/drivers/ide/q40ide.c
+++ b/drivers/ide/q40ide.c
@@ -51,11 +51,11 @@
/*
* Addresses are pretranslated for Q40 ISA access.
*/
-static void q40_ide_setup_ports(hw_regs_t *hw, unsigned long base,
+static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base,
ide_ack_intr_t *ack_intr,
int irq)
{
- memset(hw, 0, sizeof(hw_regs_t));
+ memset(hw, 0, sizeof(*hw));
/* BIG FAT WARNING:
assumption: only DATA port is ever used in 16 bit mode */
hw->io_ports.data_addr = Q40_ISA_IO_W(base);
@@ -70,8 +70,6 @@
hw->irq = irq;
hw->ack_intr = ack_intr;
-
- hw->chipset = ide_generic;
}
static void q40ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd,
@@ -119,6 +117,7 @@
.tp_ops = &q40ide_tp_ops,
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
.irq_flags = IRQF_SHARED,
+ .chipset = ide_generic,
};
/*
@@ -136,7 +135,7 @@
static int __init q40ide_init(void)
{
int i;
- hw_regs_t hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL };
+ struct ide_hw hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL };
if (!MACH_IS_Q40)
return -ENODEV;
@@ -163,7 +162,7 @@
hws[i] = &hw[i];
}
- return ide_host_add(&q40ide_port_info, hws, NULL);
+ return ide_host_add(&q40ide_port_info, hws, Q40IDE_NUM_HWIFS, NULL);
}
module_init(q40ide_init);
diff --git a/drivers/ide/rapide.c b/drivers/ide/rapide.c
index d5003ca..00f5424 100644
--- a/drivers/ide/rapide.c
+++ b/drivers/ide/rapide.c
@@ -13,9 +13,10 @@
static const struct ide_port_info rapide_port_info = {
.host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
+ .chipset = ide_generic,
};
-static void rapide_setup_ports(hw_regs_t *hw, void __iomem *base,
+static void rapide_setup_ports(struct ide_hw *hw, void __iomem *base,
void __iomem *ctrl, unsigned int sz, int irq)
{
unsigned long port = (unsigned long)base;
@@ -35,7 +36,7 @@
void __iomem *base;
struct ide_host *host;
int ret;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
ret = ecard_request_resources(ec);
if (ret)
@@ -49,10 +50,9 @@
memset(&hw, 0, sizeof(hw));
rapide_setup_ports(&hw, base, base + 0x818, 1 << 6, ec->irq);
- hw.chipset = ide_generic;
hw.dev = &ec->dev;
- ret = ide_host_add(&rapide_port_info, hws, &host);
+ ret = ide_host_add(&rapide_port_info, hws, 1, &host);
if (ret)
goto release;
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 5be41f2..1104bb3 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -559,7 +559,7 @@
{
struct scc_ports *ports = pci_get_drvdata(dev);
struct ide_host *host;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
int i, rc;
memset(&hw, 0, sizeof(hw));
@@ -567,9 +567,8 @@
hw.io_ports_array[i] = ports->dma + 0x20 + i * 4;
hw.irq = dev->irq;
hw.dev = &dev->dev;
- hw.chipset = ide_pci;
- rc = ide_host_add(d, hws, &host);
+ rc = ide_host_add(d, hws, 1, &host);
if (rc)
return rc;
@@ -823,6 +822,7 @@
.host_flags = IDE_HFLAG_SINGLE,
.irq_flags = IRQF_SHARED,
.pio_mask = ATA_PIO4,
+ .chipset = ide_pci,
};
/**
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index 7a3a12d..ab3db61 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
* Copyright (C) 1995-1998 Mark Lord
- * Copyright (C) 2007 Bartlomiej Zolnierkiewicz
+ * Copyright (C) 2007-2009 Bartlomiej Zolnierkiewicz
*
* May be copied or modified under the terms of the GNU General Public License
*/
@@ -301,11 +301,11 @@
}
/**
- * ide_hw_configure - configure a hw_regs_t instance
+ * ide_hw_configure - configure a struct ide_hw instance
* @dev: PCI device holding interface
* @d: IDE port info
* @port: port number
- * @hw: hw_regs_t instance corresponding to this port
+ * @hw: struct ide_hw instance corresponding to this port
*
* Perform the initial set up for the hardware interface structure. This
* is done per interface port rather than per PCI device. There may be
@@ -315,7 +315,7 @@
*/
static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d,
- unsigned int port, hw_regs_t *hw)
+ unsigned int port, struct ide_hw *hw)
{
unsigned long ctl = 0, base = 0;
@@ -344,7 +344,6 @@
memset(hw, 0, sizeof(*hw));
hw->dev = &dev->dev;
- hw->chipset = d->chipset ? d->chipset : ide_pci;
ide_std_init_ports(hw, base, ctl | 2);
return 0;
@@ -446,8 +445,8 @@
* ide_pci_setup_ports - configure ports/devices on PCI IDE
* @dev: PCI device
* @d: IDE port info
- * @hw: hw_regs_t instances corresponding to this PCI IDE device
- * @hws: hw_regs_t pointers table to update
+ * @hw: struct ide_hw instances corresponding to this PCI IDE device
+ * @hws: struct ide_hw pointers table to update
*
* Scan the interfaces attached to this device and do any
* necessary per port setup. Attach the devices and ask the
@@ -459,7 +458,7 @@
*/
void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d,
- hw_regs_t *hw, hw_regs_t **hws)
+ struct ide_hw *hw, struct ide_hw **hws)
{
int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 1 : 2, port;
u8 tmp;
@@ -535,61 +534,15 @@
return ret;
}
-int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d,
- void *priv)
-{
- struct ide_host *host;
- hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL };
- int ret;
-
- ret = ide_setup_pci_controller(dev, d, 1);
- if (ret < 0)
- goto out;
-
- ide_pci_setup_ports(dev, d, &hw[0], &hws[0]);
-
- host = ide_host_alloc(d, hws);
- if (host == NULL) {
- ret = -ENOMEM;
- goto out;
- }
-
- host->dev[0] = &dev->dev;
-
- host->host_priv = priv;
-
- host->irq_flags = IRQF_SHARED;
-
- pci_set_drvdata(dev, host);
-
- ret = do_ide_setup_pci_device(dev, d, 1);
- if (ret < 0)
- goto out;
-
- /* fixup IRQ */
- if (ide_pci_is_in_compatibility_mode(dev)) {
- hw[0].irq = pci_get_legacy_ide_irq(dev, 0);
- hw[1].irq = pci_get_legacy_ide_irq(dev, 1);
- } else
- hw[1].irq = hw[0].irq = ret;
-
- ret = ide_host_register(host, d, hws);
- if (ret)
- ide_host_free(host);
-out:
- return ret;
-}
-EXPORT_SYMBOL_GPL(ide_pci_init_one);
-
int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
const struct ide_port_info *d, void *priv)
{
struct pci_dev *pdev[] = { dev1, dev2 };
struct ide_host *host;
- int ret, i;
- hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL };
+ int ret, i, n_ports = dev2 ? 4 : 2;
+ struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL };
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < n_ports / 2; i++) {
ret = ide_setup_pci_controller(pdev[i], d, !i);
if (ret < 0)
goto out;
@@ -597,23 +550,24 @@
ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]);
}
- host = ide_host_alloc(d, hws);
+ host = ide_host_alloc(d, hws, n_ports);
if (host == NULL) {
ret = -ENOMEM;
goto out;
}
host->dev[0] = &dev1->dev;
- host->dev[1] = &dev2->dev;
+ if (dev2)
+ host->dev[1] = &dev2->dev;
host->host_priv = priv;
-
host->irq_flags = IRQF_SHARED;
pci_set_drvdata(pdev[0], host);
- pci_set_drvdata(pdev[1], host);
+ if (dev2)
+ pci_set_drvdata(pdev[1], host);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < n_ports / 2; i++) {
ret = do_ide_setup_pci_device(pdev[i], d, !i);
/*
@@ -639,6 +593,13 @@
}
EXPORT_SYMBOL_GPL(ide_pci_init_two);
+int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d,
+ void *priv)
+{
+ return ide_pci_init_two(dev, NULL, d, priv);
+}
+EXPORT_SYMBOL_GPL(ide_pci_init_one);
+
void ide_pci_remove(struct pci_dev *dev)
{
struct ide_host *host = pci_get_drvdata(dev);
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index e5d2a48..5f37f16 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -91,7 +91,7 @@
static void
-sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port,
+sgiioc4_init_hwif_ports(struct ide_hw *hw, unsigned long data_port,
unsigned long ctrl_port, unsigned long irq_port)
{
unsigned long reg = data_port;
@@ -546,7 +546,7 @@
unsigned long cmd_base, irqport;
unsigned long bar0, cmd_phys_base, ctl;
void __iomem *virt_base;
- hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
int rc;
/* Get the CmdBlk and CtrlBlk Base Registers */
@@ -575,13 +575,12 @@
memset(&hw, 0, sizeof(hw));
sgiioc4_init_hwif_ports(&hw, cmd_base, ctl, irqport);
hw.irq = dev->irq;
- hw.chipset = ide_pci;
hw.dev = &dev->dev;
/* Initializing chipset IRQ Registers */
writel(0x03, (void __iomem *)(irqport + IOC4_INTR_SET * 4));
- rc = ide_host_add(&sgiioc4_port_info, hws, NULL);
+ rc = ide_host_add(&sgiioc4_port_info, hws, 1, NULL);
if (!rc)
return 0;
diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c
index e4973cd..bd82d22 100644
--- a/drivers/ide/siimage.c
+++ b/drivers/ide/siimage.c
@@ -451,8 +451,8 @@
static void sil_sata_pre_reset(ide_drive_t *drive)
{
if (drive->media == ide_disk) {
- drive->special.b.set_geometry = 0;
- drive->special.b.recalibrate = 0;
+ drive->special_flags &=
+ ~(IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE);
}
}
diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c
index b0a4606..0924abf 100644
--- a/drivers/ide/sl82c105.c
+++ b/drivers/ide/sl82c105.c
@@ -10,7 +10,7 @@
* with the timing registers setup.
* -- Benjamin Herrenschmidt (01/11/03) benh@kernel.crashing.org
*
- * Copyright (C) 2006-2007 MontaVista Software, Inc. <source@mvista.com>
+ * Copyright (C) 2006-2007,2009 MontaVista Software, Inc. <source@mvista.com>
* Copyright (C) 2007 Bartlomiej Zolnierkiewicz
*/
@@ -146,14 +146,15 @@
u32 val, mask = hwif->channel ? CTRL_IDE_IRQB : CTRL_IDE_IRQA;
u8 dma_cmd;
- printk("sl82c105: lost IRQ, resetting host\n");
+ printk(KERN_WARNING "sl82c105: lost IRQ, resetting host\n");
/*
* Check the raw interrupt from the drive.
*/
pci_read_config_dword(dev, 0x40, &val);
if (val & mask)
- printk("sl82c105: drive was requesting IRQ, but host lost it\n");
+ printk(KERN_INFO "sl82c105: drive was requesting IRQ, "
+ "but host lost it\n");
/*
* Was DMA enabled? If so, disable it - we're resetting the
@@ -162,7 +163,7 @@
dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
if (dma_cmd & 1) {
outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD);
- printk("sl82c105: DMA was enabled\n");
+ printk(KERN_INFO "sl82c105: DMA was enabled\n");
}
sl82c105_reset_host(dev);
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index e33d764..ea89fdd 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -130,8 +130,7 @@
static int __init tx4938ide_probe(struct platform_device *pdev)
{
- hw_regs_t hw;
- hw_regs_t *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
struct ide_host *host;
struct resource *res;
struct tx4938ide_platform_info *pdata = pdev->dev.platform_data;
@@ -183,7 +182,7 @@
tx4938ide_tune_ebusc(pdata->ebus_ch, pdata->gbus_clock, 0);
else
d.port_ops = NULL;
- ret = ide_host_add(&d, hws, &host);
+ ret = ide_host_add(&d, hws, 1, &host);
if (!ret)
platform_set_drvdata(pdev, host);
return ret;
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 5ca7622..64b58ec 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -537,8 +537,7 @@
static int __init tx4939ide_probe(struct platform_device *pdev)
{
- hw_regs_t hw;
- hw_regs_t *hws[] = { &hw, NULL, NULL, NULL };
+ struct ide_hw hw, *hws[] = { &hw };
struct ide_host *host;
struct resource *res;
int irq, ret;
@@ -581,7 +580,7 @@
hw.dev = &pdev->dev;
pr_info("TX4939 IDE interface (base %#lx, irq %d)\n", mapbase, irq);
- host = ide_host_alloc(&tx4939ide_port_info, hws);
+ host = ide_host_alloc(&tx4939ide_port_info, hws, 1);
if (!host)
return -ENOMEM;
/* use extra_base for base address of the all registers */
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cba..0aaa059 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
config LGUEST
tristate "Linux hypervisor example code"
- depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+ depends on X86_32 && EXPERIMENTAL && EVENTFD
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3..a6974e9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@
* array of struct pages. It increments that pointer, but we don't
* care. */
pagep = switcher_page;
- err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+ err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
goto free_vma;
@@ -188,6 +188,9 @@
{
/* We stop running once the Guest is dead. */
while (!cpu->lg->dead) {
+ unsigned int irq;
+ bool more;
+
/* First we run any hypercalls the Guest wants done. */
if (cpu->hcall)
do_hypercalls(cpu);
@@ -195,23 +198,23 @@
/* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
if (cpu->pending_notify) {
- if (put_user(cpu->pending_notify, user))
- return -EFAULT;
- return sizeof(cpu->pending_notify);
+ if (!send_notify_to_eventfd(cpu)) {
+ if (put_user(cpu->pending_notify, user))
+ return -EFAULT;
+ return sizeof(cpu->pending_notify);
+ }
}
/* Check for signals */
if (signal_pending(current))
return -ERESTARTSYS;
- /* If Waker set break_out, return to Launcher. */
- if (cpu->break_out)
- return -EAGAIN;
-
/* Check if there are any interrupts which can be delivered now:
* if so, this sets up the hander to be executed when we next
* run the Guest. */
- maybe_do_interrupt(cpu);
+ irq = interrupt_pending(cpu, &more);
+ if (irq < LGUEST_IRQS)
+ try_deliver_interrupt(cpu, irq, more);
/* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
@@ -224,10 +227,15 @@
break;
/* If the Guest asked to be stopped, we sleep. The Guest's
- * clock timer or LHREQ_BREAK from the Waker will wake us. */
+ * clock timer will wake us. */
if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ /* Just before we sleep, make sure no interrupt snuck in
+ * which we should be doing. */
+ if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
+ set_current_state(TASK_RUNNING);
+ else
+ schedule();
continue;
}
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 54d66f0..c29ffa1 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@
/* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls. */
break;
+ case LHCALL_SEND_INTERRUPTS:
+ /* This call does nothing too, but by breaking out of the Guest
+ * it makes us process any pending interrupts. */
+ break;
case LHCALL_LGUEST_INIT:
/* You can't get here unless you're already initialized. Don't
* do that. */
@@ -73,11 +77,21 @@
guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+ guest_set_pte(cpu, args->arg1, args->arg2,
+ __pte(args->arg3 | (u64)args->arg4 << 32));
+#else
guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
break;
+ case LHCALL_SET_PGD:
+ guest_set_pgd(cpu->lg, args->arg1, args->arg2);
+ break;
+#ifdef CONFIG_X86_PAE
case LHCALL_SET_PMD:
guest_set_pmd(cpu->lg, args->arg1, args->arg2);
break;
+#endif
case LHCALL_SET_CLOCKEVENT:
guest_set_clockevent(cpu, args->arg1);
break;
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 6e99adb..0e9067b 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,39 @@
/*H:205
* Virtual Interrupts.
*
- * maybe_do_interrupt() gets called before every entry to the Guest, to see if
- * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lg_cpu *cpu)
+ * interrupt_pending() returns the first pending interrupt which isn't blocked
+ * by the Guest. It is called before every entry to the Guest, and just before
+ * we go to sleep when the Guest has halted itself. */
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
{
unsigned int irq;
DECLARE_BITMAP(blk, LGUEST_IRQS);
- struct desc_struct *idt;
/* If the Guest hasn't even initialized yet, we can do nothing. */
if (!cpu->lg->lguest_data)
- return;
+ return LGUEST_IRQS;
/* Take our "irqs_pending" array and remove any interrupts the Guest
* wants blocked: the result ends up in "blk". */
if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
sizeof(blk)))
- return;
+ return LGUEST_IRQS;
bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
/* Find the first interrupt. */
irq = find_first_bit(blk, LGUEST_IRQS);
- /* None? Nothing to do */
- if (irq >= LGUEST_IRQS)
- return;
+ *more = find_next_bit(blk, LGUEST_IRQS, irq+1);
+
+ return irq;
+}
+
+/* This actually diverts the Guest to running an interrupt handler, once an
+ * interrupt has been identified by interrupt_pending(). */
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
+{
+ struct desc_struct *idt;
+
+ BUG_ON(irq >= LGUEST_IRQS);
/* They may be in the middle of an iret, where they asked us never to
* deliver interrupts. */
@@ -170,8 +179,12 @@
u32 irq_enabled;
if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
irq_enabled = 0;
- if (!irq_enabled)
+ if (!irq_enabled) {
+ /* Make sure they know an IRQ is pending. */
+ put_user(X86_EFLAGS_IF,
+ &cpu->lg->lguest_data->irq_pending);
return;
+ }
}
/* Look at the IDT entry the Guest gave us for this interrupt. The
@@ -194,6 +207,25 @@
* here is a compromise which means at least it gets updated every
* timer interrupt. */
write_timestamp(cpu);
+
+ /* If there are no other interrupts we want to deliver, clear
+ * the pending flag. */
+ if (!more)
+ put_user(0, &cpu->lg->lguest_data->irq_pending);
+}
+
+/* And this is the routine when we want to set an interrupt for the Guest. */
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+ /* Next time the Guest runs, the core code will see if it can deliver
+ * this interrupt. */
+ set_bit(irq, cpu->irqs_pending);
+
+ /* Make sure it sees it; it might be asleep (eg. halted), or
+ * running the Guest right now, in which case kick_process()
+ * will knock it out. */
+ if (!wake_up_process(cpu->tsk))
+ kick_process(cpu->tsk);
}
/*:*/
@@ -510,10 +542,7 @@
struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
/* Remember the first interrupt is the timer interrupt. */
- set_bit(0, cpu->irqs_pending);
- /* If the Guest is actually stopped, we need to wake it up. */
- if (cpu->halted)
- wake_up_process(cpu->tsk);
+ set_interrupt(cpu, 0);
return HRTIMER_NORESTART;
}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index af92a17..d4e8979 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@
u32 cr2;
int ts;
u32 esp1;
- u8 ss1;
+ u16 ss1;
/* Bitmap of what has changed: see CHANGED_* above. */
int changed;
@@ -71,9 +71,7 @@
/* Virtual clock device */
struct hrtimer hrt;
- /* Do we need to stop what we're doing and return to userspace? */
- int break_out;
- wait_queue_head_t break_wq;
+ /* Did the Guest tell us to halt? */
int halted;
/* Pending virtual interrupts */
@@ -82,6 +80,16 @@
struct lg_cpu_arch arch;
};
+struct lg_eventfd {
+ unsigned long addr;
+ struct file *event;
+};
+
+struct lg_eventfd_map {
+ unsigned int num;
+ struct lg_eventfd map[];
+};
+
/* The private info the thread maintains about the guest. */
struct lguest
{
@@ -102,6 +110,8 @@
unsigned int stack_pages;
u32 tsc_khz;
+ struct lg_eventfd_map *eventfds;
+
/* Dead? */
const char *dead;
};
@@ -137,9 +147,13 @@
* in the kernel. */
#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
/* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lg_cpu *cpu);
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
u32 low, u32 hi);
@@ -150,6 +164,7 @@
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
int init_interrupts(void);
@@ -168,7 +183,10 @@
int init_guest_pagetable(struct lguest *lg);
void free_guest_pagetable(struct lguest *lg);
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
void guest_pagetable_clear_all(struct lg_cpu *cpu);
void guest_pagetable_flush_user(struct lg_cpu *cpu);
void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index df44d96..e082cda 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -228,7 +228,8 @@
* function. */
static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
unsigned index,
- void (*callback)(struct virtqueue *vq))
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
{
struct lguest_device *ldev = to_lgdev(vdev);
struct lguest_vq_info *lvq;
@@ -263,7 +264,7 @@
/* OK, tell virtio_ring.c to set up a virtqueue now we know its size
* and we've got a pointer to its pages. */
vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
- vdev, lvq->pages, lg_notify, callback);
+ vdev, lvq->pages, lg_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto unmap;
@@ -312,6 +313,38 @@
kfree(lvq);
}
+static void lg_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ lg_del_vq(vq);
+}
+
+static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ struct lguest_device *ldev = to_lgdev(vdev);
+ int i;
+
+ /* We must have this many virtqueues. */
+ if (nvqs > ldev->desc->num_vq)
+ return -ENOENT;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ lg_del_vqs(vdev);
+ return PTR_ERR(vqs[i]);
+}
+
/* The ops structure which hooks everything together. */
static struct virtio_config_ops lguest_config_ops = {
.get_features = lg_get_features,
@@ -321,8 +354,8 @@
.get_status = lg_get_status,
.set_status = lg_set_status,
.reset = lg_reset,
- .find_vq = lg_find_vq,
- .del_vq = lg_del_vq,
+ .find_vqs = lg_find_vqs,
+ .del_vqs = lg_del_vqs,
};
/* The root device for the lguest virtio devices. This makes them appear as
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b8ee103..32e2971 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,32 +7,83 @@
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
#include "lg.h"
-/*L:055 When something happens, the Waker process needs a way to stop the
- * kernel running the Guest and return to the Launcher. So the Waker writes
- * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
- * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
- * the Waker. */
-static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
{
- unsigned long on;
+ unsigned int i;
+ struct lg_eventfd_map *map;
- /* Fetch whether they're turning break on or off. */
- if (get_user(on, input) != 0)
+ /* lg->eventfds is RCU-protected */
+ rcu_read_lock();
+ map = rcu_dereference(cpu->lg->eventfds);
+ for (i = 0; i < map->num; i++) {
+ if (map->map[i].addr == cpu->pending_notify) {
+ eventfd_signal(map->map[i].event, 1);
+ cpu->pending_notify = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return cpu->pending_notify == 0;
+}
+
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+ struct lg_eventfd_map *new, *old = lg->eventfds;
+
+ if (!addr)
+ return -EINVAL;
+
+ /* Replace the old array with the new one, carefully: others can
+ * be accessing it at the same time */
+ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* First make identical copy. */
+ memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+ new->num = old->num;
+
+ /* Now append new entry. */
+ new->map[new->num].addr = addr;
+ new->map[new->num].event = eventfd_fget(fd);
+ if (IS_ERR(new->map[new->num].event)) {
+ kfree(new);
+ return PTR_ERR(new->map[new->num].event);
+ }
+ new->num++;
+
+ /* Now put new one in place. */
+ rcu_assign_pointer(lg->eventfds, new);
+
+ /* We're not in a big hurry. Wait until noone's looking at old
+ * version, then delete it. */
+ synchronize_rcu();
+ kfree(old);
+
+ return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+ unsigned long addr, fd;
+ int err;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ input++;
+ if (get_user(fd, input) != 0)
return -EFAULT;
- if (on) {
- cpu->break_out = 1;
- /* Pop it out of the Guest (may be running on different CPU) */
- wake_up_process(cpu->tsk);
- /* Wait for them to reset it */
- return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
- } else {
- cpu->break_out = 0;
- wake_up(&cpu->break_wq);
- return 0;
- }
+ mutex_lock(&lguest_lock);
+ err = add_eventfd(lg, addr, fd);
+ mutex_unlock(&lguest_lock);
+
+ return 0;
}
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
@@ -45,9 +96,8 @@
return -EFAULT;
if (irq >= LGUEST_IRQS)
return -EINVAL;
- /* Next time the Guest runs, the core code will see if it can deliver
- * this interrupt. */
- set_bit(irq, cpu->irqs_pending);
+
+ set_interrupt(cpu, irq);
return 0;
}
@@ -126,9 +176,6 @@
* address. */
lguest_arch_setup_regs(cpu, start_ip);
- /* Initialize the queue for the Waker to wait on */
- init_waitqueue_head(&cpu->break_wq);
-
/* We keep a pointer to the Launcher task (ie. current task) for when
* other Guests want to wake this one (eg. console input). */
cpu->tsk = current;
@@ -185,6 +232,13 @@
goto unlock;
}
+ lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+ if (!lg->eventfds) {
+ err = -ENOMEM;
+ goto free_lg;
+ }
+ lg->eventfds->num = 0;
+
/* Populate the easy fields of our "struct lguest" */
lg->mem_base = (void __user *)args[0];
lg->pfn_limit = args[1];
@@ -192,7 +246,7 @@
/* This is the first cpu (cpu 0) and it will start booting at args[2] */
err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
if (err)
- goto release_guest;
+ goto free_eventfds;
/* Initialize the Guest's shadow page tables, using the toplevel
* address the Launcher gave us. This allocates memory, so can fail. */
@@ -211,7 +265,9 @@
free_regs:
/* FIXME: This should be in free_vcpu */
free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+ kfree(lg->eventfds);
+free_lg:
kfree(lg);
unlock:
mutex_unlock(&lguest_lock);
@@ -252,11 +308,6 @@
/* Once the Guest is dead, you can only read() why it died. */
if (lg->dead)
return -ENOENT;
-
- /* If you're not the task which owns the Guest, all you can do
- * is break the Launcher out of running the Guest. */
- if (current != cpu->tsk && req != LHREQ_BREAK)
- return -EPERM;
}
switch (req) {
@@ -264,8 +315,8 @@
return initialize(file, input);
case LHREQ_IRQ:
return user_send_irq(cpu, input);
- case LHREQ_BREAK:
- return break_guest_out(cpu, input);
+ case LHREQ_EVENTFD:
+ return attach_eventfd(lg, input);
default:
return -EINVAL;
}
@@ -303,6 +354,12 @@
* the Launcher's memory management structure. */
mmput(lg->cpus[i].mm);
}
+
+ /* Release any eventfds they registered. */
+ for (i = 0; i < lg->eventfds->num; i++)
+ fput(lg->eventfds->map[i].event);
+ kfree(lg->eventfds);
+
/* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree(). */
if (!IS_ERR(lg->dead))
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a059cf9..a6fe1ab 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -53,6 +53,17 @@
* page. */
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
+/* For PAE we need the PMD index as well. We use the last 2MB, so we
+ * will need the last pmd entry of the last pmd page. */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
+#define RESERVE_MEM 2U
+#define CHECK_GPGD_MASK _PAGE_PRESENT
+#else
+#define RESERVE_MEM 4U
+#define CHECK_GPGD_MASK _PAGE_TABLE
+#endif
+
/* We actually need a separate PTE page for each CPU. Remember that after the
* Switcher code itself comes two pages for each CPU, and we don't want this
* CPU's guest to see the pages of any other CPU. */
@@ -73,24 +84,59 @@
{
unsigned int index = pgd_index(vaddr);
+#ifndef CONFIG_X86_PAE
/* We kill any Guest trying to touch the Switcher addresses. */
if (index >= SWITCHER_PGD_INDEX) {
kill_guest(cpu, "attempt to access switcher pages");
index = 0;
}
+#endif
/* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index];
}
+#ifdef CONFIG_X86_PAE
+/* This routine then takes the PGD entry given above, which contains the
+ * address of the PMD page. It then returns a pointer to the PMD entry for the
+ * given address. */
+static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
+{
+ unsigned int index = pmd_index(vaddr);
+ pmd_t *page;
+
+ /* We kill any Guest trying to touch the Switcher addresses. */
+ if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
+ index >= SWITCHER_PMD_INDEX) {
+ kill_guest(cpu, "attempt to access switcher pages");
+ index = 0;
+ }
+
+ /* You should never call this if the PGD entry wasn't valid */
+ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+ page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
+
+ return &page[index];
+}
+#endif
+
/* This routine then takes the page directory entry returned above, which
* contains the address of the page table entry (PTE) page. It then returns a
* pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
+#ifdef CONFIG_X86_PAE
+ pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
+ pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
+
+ /* You should never call this if the PMD entry wasn't valid */
+ BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
+#else
pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
/* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
- return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
+#endif
+
+ return &page[pte_index(vaddr)];
}
/* These two functions just like the above two, except they access the Guest
@@ -101,12 +147,32 @@
return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}
-static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
+#ifdef CONFIG_X86_PAE
+static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
{
unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
- return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
+ return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}
+
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+ pmd_t gpmd, unsigned long vaddr)
+{
+ unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
+
+ BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+ return gpage + pte_index(vaddr) * sizeof(pte_t);
+}
+#else
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+ pgd_t gpgd, unsigned long vaddr)
+{
+ unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+
+ BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+ return gpage + pte_index(vaddr) * sizeof(pte_t);
+}
+#endif
/*:*/
/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
@@ -171,7 +237,7 @@
/* Remember that get_user_pages_fast() took a reference to the page, in
* get_pfn()? We have to put it back now. */
if (pte_flags(pte) & _PAGE_PRESENT)
- put_page(pfn_to_page(pte_pfn(pte)));
+ put_page(pte_page(pte));
}
/*:*/
@@ -184,11 +250,20 @@
static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
- if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+ if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
(pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
kill_guest(cpu, "bad page directory entry");
}
+#ifdef CONFIG_X86_PAE
+static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+{
+ if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
+ (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+ kill_guest(cpu, "bad page middle directory entry");
+}
+#endif
+
/*H:330
* (i) Looking up a page table entry when the Guest faults.
*
@@ -207,6 +282,11 @@
pte_t gpte;
pte_t *spte;
+#ifdef CONFIG_X86_PAE
+ pmd_t *spmd;
+ pmd_t gpmd;
+#endif
+
/* First step: get the top-level Guest page table entry. */
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
@@ -228,12 +308,45 @@
check_gpgd(cpu, gpgd);
/* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated. */
- *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
+ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+ }
+
+#ifdef CONFIG_X86_PAE
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ /* middle level not present? We can't map it in. */
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ return false;
+
+ /* Now look at the matching shadow entry. */
+ spmd = spmd_addr(cpu, *spgd, vaddr);
+
+ if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
+ /* No shadow entry: allocate a new shadow PTE page. */
+ unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+
+ /* This is not really the Guest's fault, but killing it is
+ * simple for this corner case. */
+ if (!ptepage) {
+ kill_guest(cpu, "out of memory allocating pte page");
+ return false;
+ }
+
+ /* We check that the Guest pmd is OK. */
+ check_gpmd(cpu, gpmd);
+
+ /* And we copy the flags to the shadow PMD entry. The page
+ * number in the shadow PMD is the page we just allocated. */
+ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
}
/* OK, now we look at the lower level in the Guest page table: keep its
* address, because we might update it later. */
- gpte_ptr = gpte_addr(gpgd, vaddr);
+ gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
+#else
+ /* OK, now we look at the lower level in the Guest page table: keep its
+ * address, because we might update it later. */
+ gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
+#endif
gpte = lgread(cpu, gpte_ptr, pte_t);
/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -259,7 +372,7 @@
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(*spgd, vaddr);
+ spte = spte_addr(cpu, *spgd, vaddr);
/* If there was a valid shadow PTE entry here before, we release it.
* This can happen with a write to a previously read-only entry. */
release_pte(*spte);
@@ -273,7 +386,7 @@
* table entry, even if the Guest says it's writable. That way
* we will come back here when a write does actually occur, so
* we can update the Guest's _PAGE_DIRTY flag. */
- *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
+ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
/* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
@@ -301,14 +414,23 @@
pgd_t *spgd;
unsigned long flags;
+#ifdef CONFIG_X86_PAE
+ pmd_t *spmd;
+#endif
/* Look at the current top level entry: is it present? */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return false;
+#ifdef CONFIG_X86_PAE
+ spmd = spmd_addr(cpu, *spgd, vaddr);
+ if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+ return false;
+#endif
+
/* Check the flags on the pte entry itself: it must be present and
* writable. */
- flags = pte_flags(*(spte_addr(*spgd, vaddr)));
+ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -322,8 +444,43 @@
kill_guest(cpu, "bad stack page %#lx", vaddr);
}
+#ifdef CONFIG_X86_PAE
+static void release_pmd(pmd_t *spmd)
+{
+ /* If the entry's not present, there's nothing to release. */
+ if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+ unsigned int i;
+ pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
+ /* For each entry in the page, we might need to release it. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ release_pte(ptepage[i]);
+ /* Now we can free the page of PTEs */
+ free_page((long)ptepage);
+ /* And zero out the PMD entry so we never release it twice. */
+ native_set_pmd(spmd, __pmd(0));
+ }
+}
+
+static void release_pgd(pgd_t *spgd)
+{
+ /* If the entry's not present, there's nothing to release. */
+ if (pgd_flags(*spgd) & _PAGE_PRESENT) {
+ unsigned int i;
+ pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ release_pmd(&pmdpage[i]);
+
+ /* Now we can free the page of PMDs */
+ free_page((long)pmdpage);
+ /* And zero out the PGD entry so we never release it twice. */
+ set_pgd(spgd, __pgd(0));
+ }
+}
+
+#else /* !CONFIG_X86_PAE */
/*H:450 If we chase down the release_pgd() code, it looks like this: */
-static void release_pgd(struct lguest *lg, pgd_t *spgd)
+static void release_pgd(pgd_t *spgd)
{
/* If the entry's not present, there's nothing to release. */
if (pgd_flags(*spgd) & _PAGE_PRESENT) {
@@ -341,7 +498,7 @@
*spgd = __pgd(0);
}
}
-
+#endif
/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
* hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
* It simply releases every PTE page from 0 up to the Guest's kernel address. */
@@ -350,7 +507,7 @@
unsigned int i;
/* Release every pgd entry up to the kernel's address. */
for (i = 0; i < pgd_index(lg->kernel_address); i++)
- release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+ release_pgd(lg->pgdirs[idx].pgdir + i);
}
/*H:440 (v) Flushing (throwing away) page tables,
@@ -369,7 +526,9 @@
{
pgd_t gpgd;
pte_t gpte;
-
+#ifdef CONFIG_X86_PAE
+ pmd_t gpmd;
+#endif
/* First step: get the top-level Guest page table entry. */
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
@@ -378,7 +537,14 @@
return -1UL;
}
- gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
+#ifdef CONFIG_X86_PAE
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ kill_guest(cpu, "Bad address %#lx", vaddr);
+ gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
+#else
+ gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#endif
if (!(pte_flags(gpte) & _PAGE_PRESENT))
kill_guest(cpu, "Bad address %#lx", vaddr);
@@ -405,6 +571,9 @@
int *blank_pgdir)
{
unsigned int next;
+#ifdef CONFIG_X86_PAE
+ pmd_t *pmd_table;
+#endif
/* We pick one entry at random to throw out. Choosing the Least
* Recently Used might be better, but this is easy. */
@@ -416,10 +585,27 @@
/* If the allocation fails, just keep using the one we have */
if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd;
- else
- /* This is a blank page, so there are no kernel
- * mappings: caller must map the stack! */
+ else {
+#ifdef CONFIG_X86_PAE
+ /* In PAE mode, allocate a pmd page and populate the
+ * last pgd entry. */
+ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pmd_table) {
+ free_page((long)cpu->lg->pgdirs[next].pgdir);
+ set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
+ next = cpu->cpu_pgd;
+ } else {
+ set_pgd(cpu->lg->pgdirs[next].pgdir +
+ SWITCHER_PGD_INDEX,
+ __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ /* This is a blank page, so there are no kernel
+ * mappings: caller must map the stack! */
+ *blank_pgdir = 1;
+ }
+#else
*blank_pgdir = 1;
+#endif
+ }
}
/* Record which Guest toplevel this shadows. */
cpu->lg->pgdirs[next].gpgdir = gpgdir;
@@ -431,7 +617,7 @@
/*H:430 (iv) Switching page tables
*
- * Now we've seen all the page table setting and manipulation, let's see what
+ * Now we've seen all the page table setting and manipulation, let's see
* what happens when the Guest changes page tables (ie. changes the top-level
* pgdir). This occurs on almost every context switch. */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
@@ -460,10 +646,25 @@
/* Every shadow pagetable this Guest has */
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir)
+ if (lg->pgdirs[i].pgdir) {
+#ifdef CONFIG_X86_PAE
+ pgd_t *spgd;
+ pmd_t *pmdpage;
+ unsigned int k;
+
+ /* Get the last pmd page. */
+ spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
+ pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+ /* And release the pmd entries of that pmd page,
+ * except for the switcher pmd. */
+ for (k = 0; k < SWITCHER_PMD_INDEX; k++)
+ release_pmd(&pmdpage[k]);
+#endif
/* Every PGD entry except the Switcher at the top */
for (j = 0; j < SWITCHER_PGD_INDEX; j++)
- release_pgd(lg, lg->pgdirs[i].pgdir + j);
+ release_pgd(lg->pgdirs[i].pgdir + j);
+ }
}
/* We also throw away everything when a Guest tells us it's changed a kernel
@@ -504,24 +705,37 @@
{
/* Look up the matching shadow page directory entry. */
pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
+#ifdef CONFIG_X86_PAE
+ pmd_t *spmd;
+#endif
/* If the top level isn't present, there's no entry to update. */
if (pgd_flags(*spgd) & _PAGE_PRESENT) {
- /* Otherwise, we start by releasing the existing entry. */
- pte_t *spte = spte_addr(*spgd, vaddr);
- release_pte(*spte);
+#ifdef CONFIG_X86_PAE
+ spmd = spmd_addr(cpu, *spgd, vaddr);
+ if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+#endif
+ /* Otherwise, we start by releasing
+ * the existing entry. */
+ pte_t *spte = spte_addr(cpu, *spgd, vaddr);
+ release_pte(*spte);
- /* If they're setting this entry as dirty or accessed, we might
- * as well put that entry they've given us in now. This shaves
- * 10% off a copy-on-write micro-benchmark. */
- if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(cpu, gpte);
- *spte = gpte_to_spte(cpu, gpte,
- pte_flags(gpte) & _PAGE_DIRTY);
- } else
- /* Otherwise kill it and we can demand_page() it in
- * later. */
- *spte = __pte(0);
+ /* If they're setting this entry as dirty or accessed,
+ * we might as well put that entry they've given us
+ * in now. This shaves 10% off a
+ * copy-on-write micro-benchmark. */
+ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+ check_gpte(cpu, gpte);
+ native_set_pte(spte,
+ gpte_to_spte(cpu, gpte,
+ pte_flags(gpte) & _PAGE_DIRTY));
+ } else
+ /* Otherwise kill it and we can demand_page()
+ * it in later. */
+ native_set_pte(spte, __pte(0));
+#ifdef CONFIG_X86_PAE
+ }
+#endif
}
}
@@ -568,12 +782,10 @@
*
* So with that in mind here's our code to to update a (top-level) PGD entry:
*/
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
int pgdir;
- /* The kernel seems to try to initialize this early on: we ignore its
- * attempts to map over the Switcher. */
if (idx >= SWITCHER_PGD_INDEX)
return;
@@ -581,8 +793,14 @@
pgdir = find_pgdir(lg, gpgdir);
if (pgdir < ARRAY_SIZE(lg->pgdirs))
/* ... throw it away. */
- release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+ release_pgd(lg->pgdirs[pgdir].pgdir + idx);
}
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
+{
+ guest_pagetable_clear_all(&lg->cpus[0]);
+}
+#endif
/* Once we know how much memory we have we can construct simple identity
* (which set virtual == physical) and linear mappings
@@ -596,8 +814,16 @@
{
pgd_t __user *pgdir;
pte_t __user *linear;
- unsigned int mapped_pages, i, linear_pages, phys_linear;
unsigned long mem_base = (unsigned long)lg->mem_base;
+ unsigned int mapped_pages, i, linear_pages;
+#ifdef CONFIG_X86_PAE
+ pmd_t __user *pmds;
+ unsigned int j;
+ pgd_t pgd;
+ pmd_t pmd;
+#else
+ unsigned int phys_linear;
+#endif
/* We have mapped_pages frames to map, so we need
* linear_pages page tables to map them. */
@@ -610,6 +836,9 @@
/* Now we use the next linear_pages pages as pte pages */
linear = (void *)pgdir - linear_pages * PAGE_SIZE;
+#ifdef CONFIG_X86_PAE
+ pmds = (void *)linear - PAGE_SIZE;
+#endif
/* Linear mapping is easy: put every page's address into the
* mapping in order. */
for (i = 0; i < mapped_pages; i++) {
@@ -621,6 +850,22 @@
/* The top level points to the linear page table pages above.
* We setup the identity and linear mappings here. */
+#ifdef CONFIG_X86_PAE
+ for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
+ i += PTRS_PER_PTE, j++) {
+ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
+ - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
+
+ if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
+ return -EFAULT;
+ }
+
+ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
+ return -EFAULT;
+ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
+ return -EFAULT;
+#else
phys_linear = (unsigned long)linear - mem_base;
for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
pgd_t pgd;
@@ -633,6 +878,7 @@
&pgd, sizeof(pgd)))
return -EFAULT;
}
+#endif
/* We return the top level (guest-physical) address: remember where
* this is. */
@@ -648,7 +894,10 @@
u64 mem;
u32 initrd_size;
struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-
+#ifdef CONFIG_X86_PAE
+ pgd_t *pgd;
+ pmd_t *pmd_table;
+#endif
/* Get the Guest memory size and the ramdisk size from the boot header
* located at lg->mem_base (Guest address 0). */
if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
@@ -663,6 +912,15 @@
lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
if (!lg->pgdirs[0].pgdir)
return -ENOMEM;
+#ifdef CONFIG_X86_PAE
+ pgd = lg->pgdirs[0].pgdir;
+ pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
+ if (!pmd_table)
+ return -ENOMEM;
+
+ set_pgd(pgd + SWITCHER_PGD_INDEX,
+ __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+#endif
lg->cpus[0].cpu_pgd = 0;
return 0;
}
@@ -672,17 +930,24 @@
{
/* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address,
- &cpu->lg->lguest_data->kernel_address)
- /* We tell the Guest that it can't use the top 4MB of virtual
- * addresses used by the Switcher. */
- || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
- || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+ &cpu->lg->lguest_data->kernel_address)
+ /* We tell the Guest that it can't use the top 2 or 4 MB
+ * of virtual addresses used by the Switcher. */
+ || put_user(RESERVE_MEM * 1024 * 1024,
+ &cpu->lg->lguest_data->reserve_mem)
+ || put_user(cpu->lg->pgdirs[0].gpgdir,
+ &cpu->lg->lguest_data->pgdir))
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. */
+#ifdef CONFIG_X86_PAE
+ if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
+ pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
+#else
if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+#endif
kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address);
}
@@ -708,16 +973,30 @@
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
- pgd_t switcher_pgd;
pte_t regs_pte;
unsigned long pfn;
+#ifdef CONFIG_X86_PAE
+ pmd_t switcher_pmd;
+ pmd_t *pmd_table;
+
+ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
+ PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+ pmd_table = __va(pgd_pfn(cpu->lg->
+ pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
+ << PAGE_SHIFT);
+ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
+#else
+ pgd_t switcher_pgd;
+
/* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU (with appropriate flags). */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
+ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+#endif
/* We also change the Switcher PTE page. When we're running the Guest,
* we want the Guest's "regs" page to appear where the first Switcher
* page for this CPU is. This is an optimization: when the Switcher
@@ -726,8 +1005,9 @@
* page is already mapped there, we don't have to copy them out
* again. */
pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
- regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
- switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
+ native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL));
+ native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
+ regs_pte);
}
/*:*/
@@ -752,21 +1032,21 @@
/* The first entries are easy: they map the Switcher code. */
for (i = 0; i < pages; i++) {
- pte[i] = mk_pte(switcher_page[i],
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+ native_set_pte(&pte[i], mk_pte(switcher_page[i],
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}
/* The only other thing we map is this CPU's pair of pages. */
i = pages + cpu*2;
/* First page (Guest registers) is writable from the Guest */
- pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
+ native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
/* The second page contains the "struct lguest_ro_state", and is
* read-only. */
- pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}
/* We've made it through the page table code. Perhaps our tired brains are
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 7ede64f..482ed5a 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -150,7 +150,7 @@
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
- if (num > ARRAY_SIZE(cpu->arch.gdt))
+ if (num >= ARRAY_SIZE(cpu->arch.gdt))
kill_guest(cpu, "too many gdt entries %i", num);
/* Set it up, then fix it. */
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4d1d479..7fa620d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -845,6 +845,10 @@
int err;
struct net_device *dev;
struct virtnet_info *vi;
+ struct virtqueue *vqs[3];
+ vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
+ const char *names[] = { "input", "output", "control" };
+ int nvqs;
/* Allocate ourselves a network device with room for our info */
dev = alloc_etherdev(sizeof(struct virtnet_info));
@@ -905,25 +909,19 @@
if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
vi->mergeable_rx_bufs = true;
- /* We expect two virtqueues, receive then send. */
- vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done);
- if (IS_ERR(vi->rvq)) {
- err = PTR_ERR(vi->rvq);
- goto free;
- }
+ /* We expect two virtqueues, receive then send,
+ * and optionally control. */
+ nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
- vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done);
- if (IS_ERR(vi->svq)) {
- err = PTR_ERR(vi->svq);
- goto free_recv;
- }
+ err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names);
+ if (err)
+ goto free;
+
+ vi->rvq = vqs[0];
+ vi->svq = vqs[1];
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
- vi->cvq = vdev->config->find_vq(vdev, 2, NULL);
- if (IS_ERR(vi->cvq)) {
- err = PTR_ERR(vi->svq);
- goto free_send;
- }
+ vi->cvq = vqs[2];
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
dev->features |= NETIF_F_HW_VLAN_FILTER;
@@ -941,7 +939,7 @@
err = register_netdev(dev);
if (err) {
pr_debug("virtio_net: registering device failed\n");
- goto free_ctrl;
+ goto free_vqs;
}
/* Last of all, set up some receive buffers. */
@@ -962,13 +960,8 @@
unregister:
unregister_netdev(dev);
-free_ctrl:
- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ))
- vdev->config->del_vq(vi->cvq);
-free_send:
- vdev->config->del_vq(vi->svq);
-free_recv:
- vdev->config->del_vq(vi->rvq);
+free_vqs:
+ vdev->config->del_vqs(vdev);
free:
free_netdev(dev);
return err;
@@ -994,12 +987,10 @@
BUG_ON(vi->num != 0);
- vdev->config->del_vq(vi->svq);
- vdev->config->del_vq(vi->rvq);
- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ))
- vdev->config->del_vq(vi->cvq);
unregister_netdev(vi->dev);
+ vdev->config->del_vqs(vi->vdev);
+
while (vi->pages)
__free_pages(get_a_page(vi, GFP_KERNEL), 0);
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index cbc8566..e38e5d3 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -173,8 +173,9 @@
* this device and sets it up.
*/
static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
- unsigned index,
- void (*callback)(struct virtqueue *vq))
+ unsigned index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
{
struct kvm_device *kdev = to_kvmdev(vdev);
struct kvm_vqconfig *config;
@@ -194,7 +195,7 @@
vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN,
vdev, (void *) config->address,
- kvm_notify, callback);
+ kvm_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto unmap;
@@ -226,6 +227,38 @@
KVM_S390_VIRTIO_RING_ALIGN));
}
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ kvm_del_vq(vq);
+}
+
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ int i;
+
+ /* We must have this many virtqueues. */
+ if (nvqs > kdev->desc->num_vq)
+ return -ENOENT;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ kvm_del_vqs(vdev);
+ return PTR_ERR(vqs[i]);
+}
+
/*
* The config ops structure as defined by virtio config
*/
@@ -237,8 +270,8 @@
.get_status = kvm_get_status,
.set_status = kvm_set_status,
.reset = kvm_reset,
- .find_vq = kvm_find_vq,
- .del_vq = kvm_del_vq,
+ .find_vqs = kvm_find_vqs,
+ .del_vqs = kvm_del_vqs,
};
/*
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index 35e8eb0..e4e4d43 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -354,7 +354,7 @@
static int default_lcd_on __devinitdata = 1;
#ifdef CONFIG_MTRR
-static int mtrr = 1;
+static bool mtrr = true;
#endif
#ifdef CONFIG_PMAC_BACKLIGHT
diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c
index 83c5cef..da7c01b 100644
--- a/drivers/video/cyber2000fb.c
+++ b/drivers/video/cyber2000fb.c
@@ -1736,10 +1736,8 @@
#ifdef CONFIG_ARCH_SHARK
err = cyberpro_vl_probe();
- if (!err) {
+ if (!err)
ret = 0;
- __module_get(THIS_MODULE);
- }
#endif
#ifdef CONFIG_PCI
err = pci_register_driver(&cyberpro_driver);
@@ -1749,14 +1747,15 @@
return ret ? err : 0;
}
+module_init(cyber2000fb_init);
+#ifndef CONFIG_ARCH_SHARK
static void __exit cyberpro_exit(void)
{
pci_unregister_driver(&cyberpro_driver);
}
-
-module_init(cyber2000fb_init);
module_exit(cyberpro_exit);
+#endif
MODULE_AUTHOR("Russell King");
MODULE_DESCRIPTION("CyberPro 2000, 2010 and 5000 framebuffer driver");
diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index 421770b..ca5b464 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -45,7 +45,7 @@
static int mtrr __devinitdata = 3; /* enable mtrr by default */
static int blank = 1; /* enable blanking by default */
static int ypan = 1; /* 0: scroll, 1: ypan, 2: ywrap */
-static int pmi_setpal __devinitdata = 1; /* use PMI for palette changes */
+static bool pmi_setpal __devinitdata = true; /* use PMI for palette changes */
static int nocrtc __devinitdata; /* ignore CRTC settings */
static int noedid __devinitdata; /* don't try DDC transfers */
static int vram_remap __devinitdata; /* set amt. of memory to be used */
@@ -2002,11 +2002,7 @@
module_exit(uvesafb_exit);
-static int param_get_scroll(char *buffer, struct kernel_param *kp)
-{
- return 0;
-}
-
+#define param_get_scroll NULL
static int param_set_scroll(const char *val, struct kernel_param *kp)
{
ypan = 0;
@@ -2017,6 +2013,8 @@
ypan = 1;
else if (!strcmp(val, "ywrap"))
ypan = 2;
+ else
+ return -EINVAL;
return 0;
}
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 018c070..3a43ebf 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -31,21 +31,37 @@
return sprintf(buf, "virtio:d%08Xv%08X\n",
dev->id.device, dev->id.vendor);
}
+static ssize_t features_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = container_of(_d, struct virtio_device, dev);
+ unsigned int i;
+ ssize_t len = 0;
+
+ /* We actually represent this as a bitstring, as it could be
+ * arbitrary length in future. */
+ for (i = 0; i < ARRAY_SIZE(dev->features)*BITS_PER_LONG; i++)
+ len += sprintf(buf+len, "%c",
+ test_bit(i, dev->features) ? '1' : '0');
+ len += sprintf(buf+len, "\n");
+ return len;
+}
static struct device_attribute virtio_dev_attrs[] = {
__ATTR_RO(device),
__ATTR_RO(vendor),
__ATTR_RO(status),
__ATTR_RO(modalias),
+ __ATTR_RO(features),
__ATTR_NULL
};
static inline int virtio_id_match(const struct virtio_device *dev,
const struct virtio_device_id *id)
{
- if (id->device != dev->id.device)
+ if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID)
return 0;
- return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor != dev->id.vendor;
+ return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
}
/* This looks through all the IDs a driver claims to support. If any of them
@@ -118,13 +134,14 @@
if (device_features & (1 << i))
set_bit(i, dev->features);
+ dev->config->finalize_features(dev);
+
err = drv->probe(dev);
if (err)
add_status(dev, VIRTIO_CONFIG_S_FAILED);
- else {
- dev->config->finalize_features(dev);
+ else
add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
- }
+
return err;
}
@@ -185,6 +202,8 @@
/* Acknowledge that we've seen the device. */
add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ INIT_LIST_HEAD(&dev->vqs);
+
/* device_register() causes the bus infrastructure to look for a
* matching driver. */
err = device_register(&dev->dev);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 9c76a06..26b2782 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -204,6 +204,9 @@
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
+ struct virtqueue *vqs[2];
+ vq_callback_t *callbacks[] = { balloon_ack, balloon_ack };
+ const char *names[] = { "inflate", "deflate" };
int err;
vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
@@ -218,22 +221,17 @@
vb->vdev = vdev;
/* We expect two virtqueues. */
- vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack);
- if (IS_ERR(vb->inflate_vq)) {
- err = PTR_ERR(vb->inflate_vq);
+ err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names);
+ if (err)
goto out_free_vb;
- }
- vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack);
- if (IS_ERR(vb->deflate_vq)) {
- err = PTR_ERR(vb->deflate_vq);
- goto out_del_inflate_vq;
- }
+ vb->inflate_vq = vqs[0];
+ vb->deflate_vq = vqs[1];
vb->thread = kthread_run(balloon, vb, "vballoon");
if (IS_ERR(vb->thread)) {
err = PTR_ERR(vb->thread);
- goto out_del_deflate_vq;
+ goto out_del_vqs;
}
vb->tell_host_first
@@ -241,10 +239,8 @@
return 0;
-out_del_deflate_vq:
- vdev->config->del_vq(vb->deflate_vq);
-out_del_inflate_vq:
- vdev->config->del_vq(vb->inflate_vq);
+out_del_vqs:
+ vdev->config->del_vqs(vdev);
out_free_vb:
kfree(vb);
out:
@@ -264,8 +260,7 @@
/* Now we reset the device so we can clean up the queues. */
vdev->config->reset(vdev);
- vdev->config->del_vq(vb->deflate_vq);
- vdev->config->del_vq(vb->inflate_vq);
+ vdev->config->del_vqs(vdev);
kfree(vb);
}
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 330aacb..193c8f0 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -42,6 +42,26 @@
/* a list of queues so we can dispatch IRQs */
spinlock_t lock;
struct list_head virtqueues;
+
+ /* MSI-X support */
+ int msix_enabled;
+ int intx_enabled;
+ struct msix_entry *msix_entries;
+ /* Name strings for interrupts. This size should be enough,
+ * and I'm too lazy to allocate each name separately. */
+ char (*msix_names)[256];
+ /* Number of available vectors */
+ unsigned msix_vectors;
+ /* Vectors allocated */
+ unsigned msix_used_vectors;
+};
+
+/* Constants for MSI-X */
+/* Use first vector for configuration changes, second and the rest for
+ * virtqueues Thus, we need at least 2 vectors for MSI. */
+enum {
+ VP_MSIX_CONFIG_VECTOR = 0,
+ VP_MSIX_VQ_VECTOR = 1,
};
struct virtio_pci_vq_info
@@ -60,6 +80,9 @@
/* the list node for the virtqueues list */
struct list_head node;
+
+ /* MSI-X vector (or none) */
+ unsigned vector;
};
/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
@@ -109,7 +132,8 @@
void *buf, unsigned len)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
- void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset;
+ void __iomem *ioaddr = vp_dev->ioaddr +
+ VIRTIO_PCI_CONFIG(vp_dev) + offset;
u8 *ptr = buf;
int i;
@@ -123,7 +147,8 @@
const void *buf, unsigned len)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
- void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset;
+ void __iomem *ioaddr = vp_dev->ioaddr +
+ VIRTIO_PCI_CONFIG(vp_dev) + offset;
const u8 *ptr = buf;
int i;
@@ -164,37 +189,26 @@
iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}
-/* A small wrapper to also acknowledge the interrupt when it's handled.
- * I really need an EIO hook for the vring so I can ack the interrupt once we
- * know that we'll be handling the IRQ but before we invoke the callback since
- * the callback may notify the host which results in the host attempting to
- * raise an interrupt that we would then mask once we acknowledged the
- * interrupt. */
-static irqreturn_t vp_interrupt(int irq, void *opaque)
+/* Handle a configuration change: Tell driver if it wants to know. */
+static irqreturn_t vp_config_changed(int irq, void *opaque)
+{
+ struct virtio_pci_device *vp_dev = opaque;
+ struct virtio_driver *drv;
+ drv = container_of(vp_dev->vdev.dev.driver,
+ struct virtio_driver, driver);
+
+ if (drv && drv->config_changed)
+ drv->config_changed(&vp_dev->vdev);
+ return IRQ_HANDLED;
+}
+
+/* Notify all virtqueues on an interrupt. */
+static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
{
struct virtio_pci_device *vp_dev = opaque;
struct virtio_pci_vq_info *info;
irqreturn_t ret = IRQ_NONE;
unsigned long flags;
- u8 isr;
-
- /* reading the ISR has the effect of also clearing it so it's very
- * important to save off the value. */
- isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
-
- /* It's definitely not us if the ISR was not high */
- if (!isr)
- return IRQ_NONE;
-
- /* Configuration change? Tell driver if it wants to know. */
- if (isr & VIRTIO_PCI_ISR_CONFIG) {
- struct virtio_driver *drv;
- drv = container_of(vp_dev->vdev.dev.driver,
- struct virtio_driver, driver);
-
- if (drv && drv->config_changed)
- drv->config_changed(&vp_dev->vdev);
- }
spin_lock_irqsave(&vp_dev->lock, flags);
list_for_each_entry(info, &vp_dev->virtqueues, node) {
@@ -206,15 +220,157 @@
return ret;
}
-/* the config->find_vq() implementation */
+/* A small wrapper to also acknowledge the interrupt when it's handled.
+ * I really need an EIO hook for the vring so I can ack the interrupt once we
+ * know that we'll be handling the IRQ but before we invoke the callback since
+ * the callback may notify the host which results in the host attempting to
+ * raise an interrupt that we would then mask once we acknowledged the
+ * interrupt. */
+static irqreturn_t vp_interrupt(int irq, void *opaque)
+{
+ struct virtio_pci_device *vp_dev = opaque;
+ u8 isr;
+
+ /* reading the ISR has the effect of also clearing it so it's very
+ * important to save off the value. */
+ isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+
+ /* It's definitely not us if the ISR was not high */
+ if (!isr)
+ return IRQ_NONE;
+
+ /* Configuration change? Tell driver if it wants to know. */
+ if (isr & VIRTIO_PCI_ISR_CONFIG)
+ vp_config_changed(irq, opaque);
+
+ return vp_vring_interrupt(irq, opaque);
+}
+
+static void vp_free_vectors(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ int i;
+
+ if (vp_dev->intx_enabled) {
+ free_irq(vp_dev->pci_dev->irq, vp_dev);
+ vp_dev->intx_enabled = 0;
+ }
+
+ for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+ free_irq(vp_dev->msix_entries[i].vector, vp_dev);
+ vp_dev->msix_used_vectors = 0;
+
+ if (vp_dev->msix_enabled) {
+ /* Disable the vector used for configuration */
+ iowrite16(VIRTIO_MSI_NO_VECTOR,
+ vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+ /* Flush the write out to device */
+ ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+
+ vp_dev->msix_enabled = 0;
+ pci_disable_msix(vp_dev->pci_dev);
+ }
+}
+
+static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
+ int *options, int noptions)
+{
+ int i;
+ for (i = 0; i < noptions; ++i)
+ if (!pci_enable_msix(dev, entries, options[i]))
+ return options[i];
+ return -EBUSY;
+}
+
+static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ const char *name = dev_name(&vp_dev->vdev.dev);
+ unsigned i, v;
+ int err = -ENOMEM;
+ /* We want at most one vector per queue and one for config changes.
+ * Fallback to separate vectors for config and a shared for queues.
+ * Finally fall back to regular interrupts. */
+ int options[] = { max_vqs + 1, 2 };
+ int nvectors = max(options[0], options[1]);
+
+ vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
+ GFP_KERNEL);
+ if (!vp_dev->msix_entries)
+ goto error_entries;
+ vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
+ GFP_KERNEL);
+ if (!vp_dev->msix_names)
+ goto error_names;
+
+ for (i = 0; i < nvectors; ++i)
+ vp_dev->msix_entries[i].entry = i;
+
+ err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries,
+ options, ARRAY_SIZE(options));
+ if (err < 0) {
+ /* Can't allocate enough MSI-X vectors, use regular interrupt */
+ vp_dev->msix_vectors = 0;
+ err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
+ IRQF_SHARED, name, vp_dev);
+ if (err)
+ goto error_irq;
+ vp_dev->intx_enabled = 1;
+ } else {
+ vp_dev->msix_vectors = err;
+ vp_dev->msix_enabled = 1;
+
+ /* Set the vector used for configuration */
+ v = vp_dev->msix_used_vectors;
+ snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+ "%s-config", name);
+ err = request_irq(vp_dev->msix_entries[v].vector,
+ vp_config_changed, 0, vp_dev->msix_names[v],
+ vp_dev);
+ if (err)
+ goto error_irq;
+ ++vp_dev->msix_used_vectors;
+
+ iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+ /* Verify we had enough resources to assign the vector */
+ v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+ if (v == VIRTIO_MSI_NO_VECTOR) {
+ err = -EBUSY;
+ goto error_irq;
+ }
+ }
+
+ if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) {
+ /* Shared vector for all VQs */
+ v = vp_dev->msix_used_vectors;
+ snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+ "%s-virtqueues", name);
+ err = request_irq(vp_dev->msix_entries[v].vector,
+ vp_vring_interrupt, 0, vp_dev->msix_names[v],
+ vp_dev);
+ if (err)
+ goto error_irq;
+ ++vp_dev->msix_used_vectors;
+ }
+ return 0;
+error_irq:
+ vp_free_vectors(vdev);
+ kfree(vp_dev->msix_names);
+error_names:
+ kfree(vp_dev->msix_entries);
+error_entries:
+ return err;
+}
+
static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
- void (*callback)(struct virtqueue *vq))
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
struct virtio_pci_vq_info *info;
struct virtqueue *vq;
unsigned long flags, size;
- u16 num;
+ u16 num, vector;
int err;
/* Select the queue we're interested in */
@@ -233,6 +389,7 @@
info->queue_index = index;
info->num = num;
+ info->vector = VIRTIO_MSI_NO_VECTOR;
size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
@@ -247,7 +404,7 @@
/* create the vring */
vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
- vdev, info->queue, vp_notify, callback);
+ vdev, info->queue, vp_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto out_activate_queue;
@@ -256,12 +413,43 @@
vq->priv = info;
info->vq = vq;
+ /* allocate per-vq vector if available and necessary */
+ if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) {
+ vector = vp_dev->msix_used_vectors;
+ snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names,
+ "%s-%s", dev_name(&vp_dev->vdev.dev), name);
+ err = request_irq(vp_dev->msix_entries[vector].vector,
+ vring_interrupt, 0,
+ vp_dev->msix_names[vector], vq);
+ if (err)
+ goto out_request_irq;
+ info->vector = vector;
+ ++vp_dev->msix_used_vectors;
+ } else
+ vector = VP_MSIX_VQ_VECTOR;
+
+ if (callback && vp_dev->msix_enabled) {
+ iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+ vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+ if (vector == VIRTIO_MSI_NO_VECTOR) {
+ err = -EBUSY;
+ goto out_assign;
+ }
+ }
+
spin_lock_irqsave(&vp_dev->lock, flags);
list_add(&info->node, &vp_dev->virtqueues);
spin_unlock_irqrestore(&vp_dev->lock, flags);
return vq;
+out_assign:
+ if (info->vector != VIRTIO_MSI_NO_VECTOR) {
+ free_irq(vp_dev->msix_entries[info->vector].vector, vq);
+ --vp_dev->msix_used_vectors;
+ }
+out_request_irq:
+ vring_del_virtqueue(vq);
out_activate_queue:
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
free_pages_exact(info->queue, size);
@@ -270,21 +458,27 @@
return ERR_PTR(err);
}
-/* the config->del_vq() implementation */
static void vp_del_vq(struct virtqueue *vq)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
struct virtio_pci_vq_info *info = vq->priv;
- unsigned long flags, size;
+ unsigned long size;
- spin_lock_irqsave(&vp_dev->lock, flags);
- list_del(&info->node);
- spin_unlock_irqrestore(&vp_dev->lock, flags);
+ iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+
+ if (info->vector != VIRTIO_MSI_NO_VECTOR)
+ free_irq(vp_dev->msix_entries[info->vector].vector, vq);
+
+ if (vp_dev->msix_enabled) {
+ iowrite16(VIRTIO_MSI_NO_VECTOR,
+ vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+ /* Flush the write out to device */
+ ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+ }
vring_del_virtqueue(vq);
/* Select and deactivate the queue */
- iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
@@ -292,14 +486,57 @@
kfree(info);
}
+/* the config->del_vqs() implementation */
+static void vp_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ vp_del_vq(vq);
+
+ vp_free_vectors(vdev);
+}
+
+/* the config->find_vqs() implementation */
+static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ int vectors = 0;
+ int i, err;
+
+ /* How many vectors would we like? */
+ for (i = 0; i < nvqs; ++i)
+ if (callbacks[i])
+ ++vectors;
+
+ err = vp_request_vectors(vdev, vectors);
+ if (err)
+ goto error_request;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error_find;
+ }
+ return 0;
+
+error_find:
+ vp_del_vqs(vdev);
+
+error_request:
+ return PTR_ERR(vqs[i]);
+}
+
static struct virtio_config_ops virtio_pci_config_ops = {
.get = vp_get,
.set = vp_set,
.get_status = vp_get_status,
.set_status = vp_set_status,
.reset = vp_reset,
- .find_vq = vp_find_vq,
- .del_vq = vp_del_vq,
+ .find_vqs = vp_find_vqs,
+ .del_vqs = vp_del_vqs,
.get_features = vp_get_features,
.finalize_features = vp_finalize_features,
};
@@ -310,7 +547,7 @@
struct virtio_pci_device *vp_dev = to_vp_device(dev);
struct pci_dev *pci_dev = vp_dev->pci_dev;
- free_irq(pci_dev->irq, vp_dev);
+ vp_del_vqs(dev);
pci_set_drvdata(pci_dev, NULL);
pci_iounmap(pci_dev, vp_dev->ioaddr);
pci_release_regions(pci_dev);
@@ -369,21 +606,13 @@
vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
vp_dev->vdev.id.device = pci_dev->subsystem_device;
- /* register a handler for the queue with the PCI device's interrupt */
- err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
- dev_name(&vp_dev->vdev.dev), vp_dev);
- if (err)
- goto out_set_drvdata;
-
/* finally register the virtio device */
err = register_virtio_device(&vp_dev->vdev);
if (err)
- goto out_req_irq;
+ goto out_set_drvdata;
return 0;
-out_req_irq:
- free_irq(pci_dev->irq, vp_dev);
out_set_drvdata:
pci_set_drvdata(pci_dev, NULL);
pci_iounmap(pci_dev, vp_dev->ioaddr);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5c52369..a882f26 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -23,21 +23,30 @@
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
-#define BAD_RING(_vq, fmt...) \
- do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while(0)
+#define BAD_RING(_vq, fmt, args...) \
+ do { \
+ dev_err(&(_vq)->vq.vdev->dev, \
+ "%s:"fmt, (_vq)->vq.name, ##args); \
+ BUG(); \
+ } while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq) \
do { \
if ((_vq)->in_use) \
- panic("in_use = %i\n", (_vq)->in_use); \
+ panic("%s:in_use = %i\n", \
+ (_vq)->vq.name, (_vq)->in_use); \
(_vq)->in_use = __LINE__; \
mb(); \
- } while(0)
+ } while (0)
#define END_USE(_vq) \
do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0)
#else
-#define BAD_RING(_vq, fmt...) \
- do { dev_err(&_vq->vq.vdev->dev, fmt); (_vq)->broken = true; } while(0)
+#define BAD_RING(_vq, fmt, args...) \
+ do { \
+ dev_err(&_vq->vq.vdev->dev, \
+ "%s:"fmt, (_vq)->vq.name, ##args); \
+ (_vq)->broken = true; \
+ } while (0)
#define START_USE(vq)
#define END_USE(vq)
#endif
@@ -52,6 +61,9 @@
/* Other side has made a mess, don't try any more. */
bool broken;
+ /* Host supports indirect buffers */
+ bool indirect;
+
/* Number of free buffers */
unsigned int num_free;
/* Head of free buffer list. */
@@ -76,6 +88,55 @@
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+/* Set up an indirect table of descriptors and add it to the queue. */
+static int vring_add_indirect(struct vring_virtqueue *vq,
+ struct scatterlist sg[],
+ unsigned int out,
+ unsigned int in)
+{
+ struct vring_desc *desc;
+ unsigned head;
+ int i;
+
+ desc = kmalloc((out + in) * sizeof(struct vring_desc), GFP_ATOMIC);
+ if (!desc)
+ return vq->vring.num;
+
+ /* Transfer entries from the sg list into the indirect page */
+ for (i = 0; i < out; i++) {
+ desc[i].flags = VRING_DESC_F_NEXT;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ sg++;
+ }
+ for (; i < (out + in); i++) {
+ desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ sg++;
+ }
+
+ /* Last one doesn't continue. */
+ desc[i-1].flags &= ~VRING_DESC_F_NEXT;
+ desc[i-1].next = 0;
+
+ /* We're about to use a buffer */
+ vq->num_free--;
+
+ /* Use a single buffer which doesn't continue */
+ head = vq->free_head;
+ vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
+ vq->vring.desc[head].addr = virt_to_phys(desc);
+ vq->vring.desc[head].len = i * sizeof(struct vring_desc);
+
+ /* Update free pointer */
+ vq->free_head = vq->vring.desc[head].next;
+
+ return head;
+}
+
static int vring_add_buf(struct virtqueue *_vq,
struct scatterlist sg[],
unsigned int out,
@@ -85,12 +146,21 @@
struct vring_virtqueue *vq = to_vvq(_vq);
unsigned int i, avail, head, uninitialized_var(prev);
+ START_USE(vq);
+
BUG_ON(data == NULL);
+
+ /* If the host supports indirect descriptor tables, and we have multiple
+ * buffers, then go indirect. FIXME: tune this threshold */
+ if (vq->indirect && (out + in) > 1 && vq->num_free) {
+ head = vring_add_indirect(vq, sg, out, in);
+ if (head != vq->vring.num)
+ goto add_head;
+ }
+
BUG_ON(out + in > vq->vring.num);
BUG_ON(out + in == 0);
- START_USE(vq);
-
if (vq->num_free < out + in) {
pr_debug("Can't add buf len %i - avail = %i\n",
out + in, vq->num_free);
@@ -127,6 +197,7 @@
/* Update free pointer */
vq->free_head = i;
+add_head:
/* Set token. */
vq->data[head] = data;
@@ -170,6 +241,11 @@
/* Put back on free list: find end */
i = head;
+
+ /* Free the indirect table */
+ if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
+ kfree(phys_to_virt(vq->vring.desc[i].addr));
+
while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
i = vq->vring.desc[i].next;
vq->num_free++;
@@ -284,7 +360,8 @@
struct virtio_device *vdev,
void *pages,
void (*notify)(struct virtqueue *),
- void (*callback)(struct virtqueue *))
+ void (*callback)(struct virtqueue *),
+ const char *name)
{
struct vring_virtqueue *vq;
unsigned int i;
@@ -303,14 +380,18 @@
vq->vq.callback = callback;
vq->vq.vdev = vdev;
vq->vq.vq_ops = &vring_vq_ops;
+ vq->vq.name = name;
vq->notify = notify;
vq->broken = false;
vq->last_used_idx = 0;
vq->num_added = 0;
+ list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
vq->in_use = false;
#endif
+ vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
+
/* No callback? Tell other side not to bother us. */
if (!callback)
vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
@@ -327,6 +408,7 @@
void vring_del_virtqueue(struct virtqueue *vq)
{
+ list_del(&vq->list);
kfree(to_vvq(vq));
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);
@@ -338,6 +420,8 @@
for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
switch (i) {
+ case VIRTIO_RING_F_INDIRECT_DESC:
+ break;
default:
/* We don't understand this bit. */
clear_bit(i, vdev->features);
diff --git a/fs/Kconfig b/fs/Kconfig
index 9f7270f..525da2e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -62,6 +62,16 @@
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
+config CUSE
+ tristate "Character device in Userpace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use userspace character device
+ based on CUSE, answer Y or M.
+
config GENERIC_ACL
bool
select FS_POSIX_ACL
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d5..3f0e197 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
#include <linux/anon_inodes.h>
#include <linux/eventfd.h>
#include <linux/syscalls.h>
+#include <linux/module.h>
struct eventfd_ctx {
wait_queue_head_t wqh;
@@ -56,6 +57,7 @@
return n;
}
+EXPORT_SYMBOL_GPL(eventfd_signal);
static int eventfd_release(struct inode *inode, struct file *file)
{
@@ -197,6 +199,7 @@
return file;
}
+EXPORT_SYMBOL_GPL(eventfd_fget);
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 7243706..e95eeb4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 0000000..de792dc
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,610 @@
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008-2009 SUSE Linux Products GmbH
+ * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * CUSE enables character devices to be implemented from userland much
+ * like FUSE allows filesystems. On initialization /dev/cuse is
+ * created. By opening the file and replying to the CUSE_INIT request
+ * userland CUSE server can create a character device. After that the
+ * operation is very similar to FUSE.
+ *
+ * A CUSE instance involves the following objects.
+ *
+ * cuse_conn : contains fuse_conn and serves as bonding structure
+ * channel : file handle connected to the userland CUSE server
+ * cdev : the implemented character device
+ * dev : generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies. When channel is
+ * closed, everything begins to destruct. The cuse_conn is taken off
+ * the lookup table preventing further access from cdev, cdev and
+ * generic device are removed and the base reference of cuse_conn is
+ * put.
+ *
+ * On each open, the matching cuse_conn is looked up and if found an
+ * additional reference is taken which is released when the file is
+ * closed.
+ */
+
+#include <linux/fuse.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+
+#include "fuse_i.h"
+
+#define CUSE_CONNTBL_LEN 64
+
+struct cuse_conn {
+ struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_conn fc; /* fuse connection */
+ struct cdev *cdev; /* associated character device */
+ struct device *dev; /* device representing @cdev */
+
+ /* init parameters, set once during initialization */
+ bool unrestricted_ioctl;
+};
+
+static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
+static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
+static struct class *cuse_class;
+
+static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
+{
+ return container_of(fc, struct cuse_conn, fc);
+}
+
+static struct list_head *cuse_conntbl_head(dev_t devt)
+{
+ return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
+}
+
+
+/**************************************************************************
+ * CUSE frontend operations
+ *
+ * These are file operations for the character device.
+ *
+ * On open, CUSE opens a file from the FUSE mnt and stores it to
+ * private_data of the open file. All other ops call FUSE ops on the
+ * FUSE file.
+ */
+
+static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ loff_t pos = 0;
+
+ return fuse_direct_io(file, buf, count, &pos, 0);
+}
+
+static ssize_t cuse_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ loff_t pos = 0;
+ /*
+ * No locking or generic_write_checks(), the server is
+ * responsible for locking and sanity checks.
+ */
+ return fuse_direct_io(file, buf, count, &pos, 1);
+}
+
+static int cuse_open(struct inode *inode, struct file *file)
+{
+ dev_t devt = inode->i_cdev->dev;
+ struct cuse_conn *cc = NULL, *pos;
+ int rc;
+
+ /* look up and get the connection */
+ spin_lock(&cuse_lock);
+ list_for_each_entry(pos, cuse_conntbl_head(devt), list)
+ if (pos->dev->devt == devt) {
+ fuse_conn_get(&pos->fc);
+ cc = pos;
+ break;
+ }
+ spin_unlock(&cuse_lock);
+
+ /* dead? */
+ if (!cc)
+ return -ENODEV;
+
+ /*
+ * Generic permission check is already done against the chrdev
+ * file, proceed to open.
+ */
+ rc = fuse_do_open(&cc->fc, 0, file, 0);
+ if (rc)
+ fuse_conn_put(&cc->fc);
+ return rc;
+}
+
+static int cuse_release(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+
+ fuse_sync_release(ff, file->f_flags);
+ fuse_conn_put(fc);
+
+ return 0;
+}
+
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fc);
+ unsigned int flags = 0;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fc);
+ unsigned int flags = FUSE_IOCTL_COMPAT;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static const struct file_operations cuse_frontend_fops = {
+ .owner = THIS_MODULE,
+ .read = cuse_read,
+ .write = cuse_write,
+ .open = cuse_open,
+ .release = cuse_release,
+ .unlocked_ioctl = cuse_file_ioctl,
+ .compat_ioctl = cuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+};
+
+
+/**************************************************************************
+ * CUSE channel initialization and destruction
+ */
+
+struct cuse_devinfo {
+ const char *name;
+};
+
+/**
+ * cuse_parse_one - parse one key=value pair
+ * @pp: i/o parameter for the current position
+ * @end: points to one past the end of the packed string
+ * @keyp: out parameter for key
+ * @valp: out parameter for value
+ *
+ * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
+ * at @end - 1. This function parses one pair and set *@keyp to the
+ * start of the key and *@valp to the start of the value. Note that
+ * the original string is modified such that the key string is
+ * terminated with '\0'. *@pp is updated to point to the next string.
+ *
+ * RETURNS:
+ * 1 on successful parse, 0 on EOF, -errno on failure.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+ char *p = *pp;
+ char *key, *val;
+
+ while (p < end && *p == '\0')
+ p++;
+ if (p == end)
+ return 0;
+
+ if (end[-1] != '\0') {
+ printk(KERN_ERR "CUSE: info not properly terminated\n");
+ return -EINVAL;
+ }
+
+ key = val = p;
+ p += strlen(p);
+
+ if (valp) {
+ strsep(&val, "=");
+ if (!val)
+ val = key + strlen(key);
+ key = strstrip(key);
+ val = strstrip(val);
+ } else
+ key = strstrip(key);
+
+ if (!strlen(key)) {
+ printk(KERN_ERR "CUSE: zero length info key specified\n");
+ return -EINVAL;
+ }
+
+ *pp = p;
+ *keyp = key;
+ if (valp)
+ *valp = val;
+
+ return 1;
+}
+
+/**
+ * cuse_parse_dev_info - parse device info
+ * @p: device info string
+ * @len: length of device info string
+ * @devinfo: out parameter for parsed device info
+ *
+ * Parse @p to extract device info and store it into @devinfo. String
+ * pointed to by @p is modified by parsing and @devinfo points into
+ * them, so @p shouldn't be freed while @devinfo is in use.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+ char *end = p + len;
+ char *key, *val;
+ int rc;
+
+ while (true) {
+ rc = cuse_parse_one(&p, end, &key, &val);
+ if (rc < 0)
+ return rc;
+ if (!rc)
+ break;
+ if (strcmp(key, "DEVNAME") == 0)
+ devinfo->name = val;
+ else
+ printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
+ key);
+ }
+
+ if (!devinfo->name || !strlen(devinfo->name)) {
+ printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void cuse_gendev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/**
+ * cuse_process_init_reply - finish initializing CUSE channel
+ *
+ * This function creates the character device and sets up all the
+ * required data structures for it. Please read the comment at the
+ * top of this file for high level overview.
+ */
+static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct cuse_conn *cc = fc_to_cc(fc);
+ struct cuse_init_out *arg = &req->misc.cuse_init_out;
+ struct page *page = req->pages[0];
+ struct cuse_devinfo devinfo = { };
+ struct device *dev;
+ struct cdev *cdev;
+ dev_t devt;
+ int rc;
+
+ if (req->out.h.error ||
+ arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+ goto err;
+ }
+
+ fc->minor = arg->minor;
+ fc->max_read = max_t(unsigned, arg->max_read, 4096);
+ fc->max_write = max_t(unsigned, arg->max_write, 4096);
+
+ /* parse init reply */
+ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
+
+ rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+ &devinfo);
+ if (rc)
+ goto err;
+
+ /* determine and reserve devt */
+ devt = MKDEV(arg->dev_major, arg->dev_minor);
+ if (!MAJOR(devt))
+ rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+ else
+ rc = register_chrdev_region(devt, 1, devinfo.name);
+ if (rc) {
+ printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+ goto err;
+ }
+
+ /* devt determined, create device */
+ rc = -ENOMEM;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ goto err_region;
+
+ device_initialize(dev);
+ dev_set_uevent_suppress(dev, 1);
+ dev->class = cuse_class;
+ dev->devt = devt;
+ dev->release = cuse_gendev_release;
+ dev_set_drvdata(dev, cc);
+ dev_set_name(dev, "%s", devinfo.name);
+
+ rc = device_add(dev);
+ if (rc)
+ goto err_device;
+
+ /* register cdev */
+ rc = -ENOMEM;
+ cdev = cdev_alloc();
+ if (!cdev)
+ goto err_device;
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = &cuse_frontend_fops;
+
+ rc = cdev_add(cdev, devt, 1);
+ if (rc)
+ goto err_cdev;
+
+ cc->dev = dev;
+ cc->cdev = cdev;
+
+ /* make the device available */
+ spin_lock(&cuse_lock);
+ list_add(&cc->list, cuse_conntbl_head(devt));
+ spin_unlock(&cuse_lock);
+
+ /* announce device availability */
+ dev_set_uevent_suppress(dev, 0);
+ kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
+ __free_page(page);
+ return;
+
+err_cdev:
+ cdev_del(cdev);
+err_device:
+ put_device(dev);
+err_region:
+ unregister_chrdev_region(devt, 1);
+err:
+ fc->conn_error = 1;
+ goto out;
+}
+
+static int cuse_send_init(struct cuse_conn *cc)
+{
+ int rc;
+ struct fuse_req *req;
+ struct page *page;
+ struct fuse_conn *fc = &cc->fc;
+ struct cuse_init_in *arg;
+
+ BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+
+ req = fuse_get_req(fc);
+ if (IS_ERR(req)) {
+ rc = PTR_ERR(req);
+ goto err;
+ }
+
+ rc = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto err_put_req;
+
+ arg = &req->misc.cuse_init_in;
+ arg->major = FUSE_KERNEL_VERSION;
+ arg->minor = FUSE_KERNEL_MINOR_VERSION;
+ arg->flags |= CUSE_UNRESTRICTED_IOCTL;
+ req->in.h.opcode = CUSE_INIT;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(struct cuse_init_in);
+ req->in.args[0].value = arg;
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(struct cuse_init_out);
+ req->out.args[0].value = &req->misc.cuse_init_out;
+ req->out.args[1].size = CUSE_INIT_INFO_MAX;
+ req->out.argvar = 1;
+ req->out.argpages = 1;
+ req->pages[0] = page;
+ req->num_pages = 1;
+ req->end = cuse_process_init_reply;
+ fuse_request_send_background(fc, req);
+
+ return 0;
+
+err_put_req:
+ fuse_put_request(fc, req);
+err:
+ return rc;
+}
+
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+ struct cuse_conn *cc = fc_to_cc(fc);
+ kfree(cc);
+}
+
+/**
+ * cuse_channel_open - open method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being opened
+ *
+ * Userland CUSE server can create a CUSE device by opening /dev/cuse
+ * and replying to the initilaization request kernel sends. This
+ * function is responsible for handling CUSE device initialization.
+ * Because the fd opened by this function is used during
+ * initialization, this function only creates cuse_conn and sends
+ * init. The rest is delegated to a kthread.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+ struct cuse_conn *cc;
+ int rc;
+
+ /* set up cuse_conn */
+ cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+
+ fuse_conn_init(&cc->fc);
+
+ INIT_LIST_HEAD(&cc->list);
+ cc->fc.release = cuse_fc_release;
+
+ cc->fc.connected = 1;
+ cc->fc.blocked = 0;
+ rc = cuse_send_init(cc);
+ if (rc) {
+ fuse_conn_put(&cc->fc);
+ return rc;
+ }
+ file->private_data = &cc->fc; /* channel owns base reference to cc */
+
+ return 0;
+}
+
+/**
+ * cuse_channel_release - release method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being closed
+ *
+ * Disconnect the channel, deregister CUSE device and initiate
+ * destruction by putting the default reference.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+ struct cuse_conn *cc = fc_to_cc(file->private_data);
+ int rc;
+
+ /* remove from the conntbl, no more access from this point on */
+ spin_lock(&cuse_lock);
+ list_del_init(&cc->list);
+ spin_unlock(&cuse_lock);
+
+ /* remove device */
+ if (cc->dev)
+ device_unregister(cc->dev);
+ if (cc->cdev) {
+ unregister_chrdev_region(cc->cdev->dev, 1);
+ cdev_del(cc->cdev);
+ }
+
+ /* kill connection and shutdown channel */
+ fuse_conn_kill(&cc->fc);
+ rc = fuse_dev_release(inode, file); /* puts the base reference */
+
+ return rc;
+}
+
+static struct file_operations cuse_channel_fops; /* initialized during init */
+
+
+/**************************************************************************
+ * Misc stuff and module initializatiion
+ *
+ * CUSE exports the same set of attributes to sysfs as fusectl.
+ */
+
+static ssize_t cuse_class_waiting_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+
+static ssize_t cuse_class_abort_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ fuse_abort_conn(&cc->fc);
+ return count;
+}
+
+static struct device_attribute cuse_class_dev_attrs[] = {
+ __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
+ __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
+ { }
+};
+
+static struct miscdevice cuse_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cuse",
+ .fops = &cuse_channel_fops,
+};
+
+static int __init cuse_init(void)
+{
+ int i, rc;
+
+ /* init conntbl */
+ for (i = 0; i < CUSE_CONNTBL_LEN; i++)
+ INIT_LIST_HEAD(&cuse_conntbl[i]);
+
+ /* inherit and extend fuse_dev_operations */
+ cuse_channel_fops = fuse_dev_operations;
+ cuse_channel_fops.owner = THIS_MODULE;
+ cuse_channel_fops.open = cuse_channel_open;
+ cuse_channel_fops.release = cuse_channel_release;
+
+ cuse_class = class_create(THIS_MODULE, "cuse");
+ if (IS_ERR(cuse_class))
+ return PTR_ERR(cuse_class);
+
+ cuse_class->dev_attrs = cuse_class_dev_attrs;
+
+ rc = misc_register(&cuse_miscdev);
+ if (rc) {
+ class_destroy(cuse_class);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void __exit cuse_exit(void)
+{
+ misc_deregister(&cuse_miscdev);
+ class_destroy(cuse_class);
+}
+
+module_init(cuse_init);
+module_exit(cuse_exit);
+
+MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba76b68..8fed2ed 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@
fuse_request_init(req);
return req;
}
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
struct fuse_req *fuse_request_alloc_nofs(void)
{
@@ -124,6 +125,7 @@
atomic_dec(&fc->num_waiting);
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(fuse_get_req);
/*
* Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@
fuse_request_free(req);
}
}
+EXPORT_SYMBOL_GPL(fuse_put_request);
static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
@@ -282,7 +285,7 @@
wake_up_all(&fc->blocked_waitq);
}
if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
- fc->connected) {
+ fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, READ);
clear_bdi_congested(&fc->bdi, WRITE);
}
@@ -400,6 +403,7 @@
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_request_send);
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
struct fuse_req *req)
@@ -408,7 +412,8 @@
fc->num_background++;
if (fc->num_background == FUSE_MAX_BACKGROUND)
fc->blocked = 1;
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
+ if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ fc->bdi_initialized) {
set_bdi_congested(&fc->bdi, READ);
set_bdi_congested(&fc->bdi, WRITE);
}
@@ -439,6 +444,7 @@
req->isreply = 1;
fuse_request_send_nowait(fc, req);
}
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
/*
* Called under fc->lock
@@ -1105,8 +1111,9 @@
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = fuse_get_conn(file);
if (fc) {
@@ -1120,6 +1127,7 @@
return 0;
}
+EXPORT_SYMBOL_GPL(fuse_dev_release);
static int fuse_dev_fasync(int fd, struct file *file, int on)
{
@@ -1142,6 +1150,7 @@
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
};
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
static struct miscdevice fuse_miscdevice = {
.minor = FUSE_MINOR,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8b8eebc..b3089a0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -362,19 +362,6 @@
}
/*
- * Synchronous release for the case when something goes wrong in CREATE_OPEN
- */
-static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
- u64 nodeid, int flags)
-{
- fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
- ff->reserved_req->force = 1;
- fuse_request_send(fc, ff->reserved_req);
- fuse_put_request(fc, ff->reserved_req);
- kfree(ff);
-}
-
-/*
* Atomic create+open operation
*
* If the filesystem doesn't support this, then fall back to separate
@@ -445,12 +432,14 @@
goto out_free_ff;
fuse_put_request(fc, req);
+ ff->fh = outopen.fh;
+ ff->nodeid = outentry.nodeid;
+ ff->open_flags = outopen.open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, entry_attr_timeout(&outentry), 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
- ff->fh = outopen.fh;
- fuse_sync_release(fc, ff, outentry.nodeid, flags);
+ fuse_sync_release(ff, flags);
fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
return -ENOMEM;
}
@@ -460,11 +449,11 @@
fuse_invalidate_attr(dir);
file = lookup_instantiate_filp(nd, entry, generic_file_open);
if (IS_ERR(file)) {
- ff->fh = outopen.fh;
- fuse_sync_release(fc, ff, outentry.nodeid, flags);
+ fuse_sync_release(ff, flags);
return PTR_ERR(file);
}
- fuse_finish_open(inode, file, ff, &outopen);
+ file->private_data = fuse_file_get(ff);
+ fuse_finish_open(inode, file);
return 0;
out_free_ff:
@@ -1035,7 +1024,7 @@
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+ fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
@@ -1101,12 +1090,14 @@
static int fuse_dir_open(struct inode *inode, struct file *file)
{
- return fuse_open_common(inode, file, 1);
+ return fuse_open_common(inode, file, true);
}
static int fuse_dir_release(struct inode *inode, struct file *file)
{
- return fuse_release_common(inode, file, 1);
+ fuse_release_common(file, FUSE_RELEASEDIR);
+
+ return 0;
}
static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 06f30e9..fce6ce6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,13 +12,13 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/module.h>
static const struct file_operations fuse_direct_io_file_operations;
-static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
- struct fuse_open_out *outargp)
+static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ int opcode, struct fuse_open_out *outargp)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_open_in inarg;
struct fuse_req *req;
int err;
@@ -31,8 +31,8 @@
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
if (!fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
- req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
- req->in.h.nodeid = get_node_id(inode);
+ req->in.h.opcode = opcode;
+ req->in.h.nodeid = nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
@@ -49,22 +49,27 @@
struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
{
struct fuse_file *ff;
+
ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
- if (ff) {
- ff->reserved_req = fuse_request_alloc();
- if (!ff->reserved_req) {
- kfree(ff);
- return NULL;
- } else {
- INIT_LIST_HEAD(&ff->write_entry);
- atomic_set(&ff->count, 0);
- spin_lock(&fc->lock);
- ff->kh = ++fc->khctr;
- spin_unlock(&fc->lock);
- }
- RB_CLEAR_NODE(&ff->polled_node);
- init_waitqueue_head(&ff->poll_wait);
+ if (unlikely(!ff))
+ return NULL;
+
+ ff->fc = fc;
+ ff->reserved_req = fuse_request_alloc();
+ if (unlikely(!ff->reserved_req)) {
+ kfree(ff);
+ return NULL;
}
+
+ INIT_LIST_HEAD(&ff->write_entry);
+ atomic_set(&ff->count, 0);
+ RB_CLEAR_NODE(&ff->polled_node);
+ init_waitqueue_head(&ff->poll_wait);
+
+ spin_lock(&fc->lock);
+ ff->kh = ++fc->khctr;
+ spin_unlock(&fc->lock);
+
return ff;
}
@@ -74,7 +79,7 @@
kfree(ff);
}
-static struct fuse_file *fuse_file_get(struct fuse_file *ff)
+struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
atomic_inc(&ff->count);
return ff;
@@ -82,41 +87,66 @@
static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
{
- dput(req->misc.release.dentry);
- mntput(req->misc.release.vfsmount);
+ path_put(&req->misc.release.path);
}
static void fuse_file_put(struct fuse_file *ff)
{
if (atomic_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- struct inode *inode = req->misc.release.dentry->d_inode;
- struct fuse_conn *fc = get_fuse_conn(inode);
+
req->end = fuse_release_end;
- fuse_request_send_background(fc, req);
+ fuse_request_send_background(ff->fc, req);
kfree(ff);
}
}
-void fuse_finish_open(struct inode *inode, struct file *file,
- struct fuse_file *ff, struct fuse_open_out *outarg)
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ bool isdir)
{
- if (outarg->open_flags & FOPEN_DIRECT_IO)
- file->f_op = &fuse_direct_io_file_operations;
- if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
- if (outarg->open_flags & FOPEN_NONSEEKABLE)
- nonseekable_open(inode, file);
- ff->fh = outarg->fh;
- file->private_data = fuse_file_get(ff);
-}
-
-int fuse_open_common(struct inode *inode, struct file *file, int isdir)
-{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_open_out outarg;
struct fuse_file *ff;
int err;
+ int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+
+ ff = fuse_file_alloc(fc);
+ if (!ff)
+ return -ENOMEM;
+
+ err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ if (err) {
+ fuse_file_free(ff);
+ return err;
+ }
+
+ if (isdir)
+ outarg.open_flags &= ~FOPEN_DIRECT_IO;
+
+ ff->fh = outarg.fh;
+ ff->nodeid = nodeid;
+ ff->open_flags = outarg.open_flags;
+ file->private_data = fuse_file_get(ff);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_do_open);
+
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_DIRECT_IO)
+ file->f_op = &fuse_direct_io_file_operations;
+ if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ if (ff->open_flags & FOPEN_NONSEEKABLE)
+ nonseekable_open(inode, file);
+}
+
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
/* VFS checks this, but only _after_ ->open() */
if (file->f_flags & O_DIRECT)
@@ -126,79 +156,86 @@
if (err)
return err;
- ff = fuse_file_alloc(fc);
- if (!ff)
- return -ENOMEM;
-
- err = fuse_send_open(inode, file, isdir, &outarg);
+ err = fuse_do_open(fc, get_node_id(inode), file, isdir);
if (err)
- fuse_file_free(ff);
- else {
- if (isdir)
- outarg.open_flags &= ~FOPEN_DIRECT_IO;
- fuse_finish_open(inode, file, ff, &outarg);
- }
+ return err;
- return err;
+ fuse_finish_open(inode, file);
+
+ return 0;
}
-void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
{
+ struct fuse_conn *fc = ff->fc;
struct fuse_req *req = ff->reserved_req;
struct fuse_release_in *inarg = &req->misc.release.in;
+ spin_lock(&fc->lock);
+ list_del(&ff->write_entry);
+ if (!RB_EMPTY_NODE(&ff->polled_node))
+ rb_erase(&ff->polled_node, &fc->polled_files);
+ spin_unlock(&fc->lock);
+
+ wake_up_interruptible_sync(&ff->poll_wait);
+
inarg->fh = ff->fh;
inarg->flags = flags;
req->in.h.opcode = opcode;
- req->in.h.nodeid = nodeid;
+ req->in.h.nodeid = ff->nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(struct fuse_release_in);
req->in.args[0].value = inarg;
}
-int fuse_release_common(struct inode *inode, struct file *file, int isdir)
+void fuse_release_common(struct file *file, int opcode)
{
- struct fuse_file *ff = file->private_data;
- if (ff) {
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = ff->reserved_req;
+ struct fuse_file *ff;
+ struct fuse_req *req;
- fuse_release_fill(ff, get_node_id(inode), file->f_flags,
- isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
+ ff = file->private_data;
+ if (unlikely(!ff))
+ return;
- /* Hold vfsmount and dentry until release is finished */
- req->misc.release.vfsmount = mntget(file->f_path.mnt);
- req->misc.release.dentry = dget(file->f_path.dentry);
+ req = ff->reserved_req;
+ fuse_prepare_release(ff, file->f_flags, opcode);
- spin_lock(&fc->lock);
- list_del(&ff->write_entry);
- if (!RB_EMPTY_NODE(&ff->polled_node))
- rb_erase(&ff->polled_node, &fc->polled_files);
- spin_unlock(&fc->lock);
+ /* Hold vfsmount and dentry until release is finished */
+ path_get(&file->f_path);
+ req->misc.release.path = file->f_path;
- wake_up_interruptible_sync(&ff->poll_wait);
- /*
- * Normally this will send the RELEASE request,
- * however if some asynchronous READ or WRITE requests
- * are outstanding, the sending will be delayed
- */
- fuse_file_put(ff);
- }
-
- /* Return value is ignored by VFS */
- return 0;
+ /*
+ * Normally this will send the RELEASE request, however if
+ * some asynchronous READ or WRITE requests are outstanding,
+ * the sending will be delayed.
+ */
+ fuse_file_put(ff);
}
static int fuse_open(struct inode *inode, struct file *file)
{
- return fuse_open_common(inode, file, 0);
+ return fuse_open_common(inode, file, false);
}
static int fuse_release(struct inode *inode, struct file *file)
{
- return fuse_release_common(inode, file, 0);
+ fuse_release_common(file, FUSE_RELEASE);
+
+ /* return value is ignored by VFS */
+ return 0;
}
+void fuse_sync_release(struct fuse_file *ff, int flags)
+{
+ WARN_ON(atomic_read(&ff->count) > 1);
+ fuse_prepare_release(ff, flags, FUSE_RELEASE);
+ ff->reserved_req->force = 1;
+ fuse_request_send(ff->fc, ff->reserved_req);
+ fuse_put_request(ff->fc, ff->reserved_req);
+ kfree(ff);
+}
+EXPORT_SYMBOL_GPL(fuse_sync_release);
+
/*
* Scramble the ID space with XTEA, so that the value of the files_struct
* pointer is not exposed to userspace.
@@ -371,8 +408,8 @@
return fuse_fsync_common(file, de, datasync, 0);
}
-void fuse_read_fill(struct fuse_req *req, struct file *file,
- struct inode *inode, loff_t pos, size_t count, int opcode)
+void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
+ size_t count, int opcode)
{
struct fuse_read_in *inarg = &req->misc.read.in;
struct fuse_file *ff = file->private_data;
@@ -382,7 +419,7 @@
inarg->size = count;
inarg->flags = file->f_flags;
req->in.h.opcode = opcode;
- req->in.h.nodeid = get_node_id(inode);
+ req->in.h.nodeid = ff->nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(struct fuse_read_in);
req->in.args[0].value = inarg;
@@ -392,12 +429,12 @@
}
static size_t fuse_send_read(struct fuse_req *req, struct file *file,
- struct inode *inode, loff_t pos, size_t count,
- fl_owner_t owner)
+ loff_t pos, size_t count, fl_owner_t owner)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
- fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
+ fuse_read_fill(req, file, pos, count, FUSE_READ);
if (owner != NULL) {
struct fuse_read_in *inarg = &req->misc.read.in;
@@ -455,7 +492,7 @@
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- num_read = fuse_send_read(req, file, inode, pos, count, NULL);
+ num_read = fuse_send_read(req, file, pos, count, NULL);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -504,19 +541,18 @@
fuse_file_put(req->ff);
}
-static void fuse_send_readpages(struct fuse_req *req, struct file *file,
- struct inode *inode)
+static void fuse_send_readpages(struct fuse_req *req, struct file *file)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
loff_t pos = page_offset(req->pages[0]);
size_t count = req->num_pages << PAGE_CACHE_SHIFT;
req->out.argpages = 1;
req->out.page_zeroing = 1;
- fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
+ fuse_read_fill(req, file, pos, count, FUSE_READ);
req->misc.read.attr_ver = fuse_get_attr_version(fc);
if (fc->async_read) {
- struct fuse_file *ff = file->private_data;
req->ff = fuse_file_get(ff);
req->end = fuse_readpages_end;
fuse_request_send_background(fc, req);
@@ -546,7 +582,7 @@
(req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
(req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
- fuse_send_readpages(req, data->file, inode);
+ fuse_send_readpages(req, data->file);
data->req = req = fuse_get_req(fc);
if (IS_ERR(req)) {
unlock_page(page);
@@ -580,7 +616,7 @@
err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
if (!err) {
if (data.req->num_pages)
- fuse_send_readpages(data.req, file, inode);
+ fuse_send_readpages(data.req, file);
else
fuse_put_request(fc, data.req);
}
@@ -607,24 +643,19 @@
return generic_file_aio_read(iocb, iov, nr_segs, pos);
}
-static void fuse_write_fill(struct fuse_req *req, struct file *file,
- struct fuse_file *ff, struct inode *inode,
- loff_t pos, size_t count, int writepage)
+static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
+ loff_t pos, size_t count)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &req->misc.write.in;
struct fuse_write_out *outarg = &req->misc.write.out;
- memset(inarg, 0, sizeof(struct fuse_write_in));
inarg->fh = ff->fh;
inarg->offset = pos;
inarg->size = count;
- inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
- inarg->flags = file ? file->f_flags : 0;
req->in.h.opcode = FUSE_WRITE;
- req->in.h.nodeid = get_node_id(inode);
+ req->in.h.nodeid = ff->nodeid;
req->in.numargs = 2;
- if (fc->minor < 9)
+ if (ff->fc->minor < 9)
req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -636,13 +667,15 @@
}
static size_t fuse_send_write(struct fuse_req *req, struct file *file,
- struct inode *inode, loff_t pos, size_t count,
- fl_owner_t owner)
+ loff_t pos, size_t count, fl_owner_t owner)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
- fuse_write_fill(req, file, file->private_data, inode, pos, count, 0);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_write_in *inarg = &req->misc.write.in;
+
+ fuse_write_fill(req, ff, pos, count);
+ inarg->flags = file->f_flags;
if (owner != NULL) {
- struct fuse_write_in *inarg = &req->misc.write.in;
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
}
@@ -700,7 +733,7 @@
req->num_pages = 1;
req->pages[0] = page;
req->page_offset = offset;
- nres = fuse_send_write(req, file, inode, pos, count, NULL);
+ nres = fuse_send_write(req, file, pos, count, NULL);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err && !nres)
@@ -741,7 +774,7 @@
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
- res = fuse_send_write(req, file, inode, pos, count, NULL);
+ res = fuse_send_write(req, file, pos, count, NULL);
offset = req->page_offset;
count = res;
@@ -979,25 +1012,23 @@
return 0;
}
-static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write)
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
ssize_t res = 0;
struct fuse_req *req;
- if (is_bad_inode(inode))
- return -EIO;
-
req = fuse_get_req(fc);
if (IS_ERR(req))
return PTR_ERR(req);
while (count) {
size_t nres;
+ fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
int err = fuse_get_user_pages(req, buf, &nbytes, write);
if (err) {
@@ -1006,11 +1037,10 @@
}
if (write)
- nres = fuse_send_write(req, file, inode, pos, nbytes,
- current->files);
+ nres = fuse_send_write(req, file, pos, nbytes, owner);
else
- nres = fuse_send_read(req, file, inode, pos, nbytes,
- current->files);
+ nres = fuse_send_read(req, file, pos, nbytes, owner);
+
fuse_release_user_pages(req, !write);
if (req->out.h.error) {
if (!res)
@@ -1034,20 +1064,27 @@
}
}
fuse_put_request(fc, req);
- if (res > 0) {
- if (write)
- fuse_write_update_size(inode, pos);
+ if (res > 0)
*ppos = pos;
- }
- fuse_invalidate_attr(inode);
return res;
}
+EXPORT_SYMBOL_GPL(fuse_direct_io);
static ssize_t fuse_direct_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- return fuse_direct_io(file, buf, count, ppos, 0);
+ ssize_t res;
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ res = fuse_direct_io(file, buf, count, ppos, 0);
+
+ fuse_invalidate_attr(inode);
+
+ return res;
}
static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
@@ -1055,12 +1092,22 @@
{
struct inode *inode = file->f_path.dentry->d_inode;
ssize_t res;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
res = generic_write_checks(file, ppos, &count, 0);
- if (!res)
+ if (!res) {
res = fuse_direct_io(file, buf, count, ppos, 1);
+ if (res > 0)
+ fuse_write_update_size(inode, *ppos);
+ }
mutex_unlock(&inode->i_mutex);
+
+ fuse_invalidate_attr(inode);
+
return res;
}
@@ -1177,9 +1224,10 @@
req->ff = fuse_file_get(ff);
spin_unlock(&fc->lock);
- fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
+ fuse_write_fill(req, ff, page_offset(page), 0);
copy_highpage(tmp_page, page);
+ req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
req->in.argpages = 1;
req->num_pages = 1;
req->pages[0] = tmp_page;
@@ -1603,12 +1651,11 @@
* limits ioctl data transfers to well-formed ioctls and is the forced
* behavior for all FUSE servers.
*/
-static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags)
{
- struct inode *inode = file->f_dentry->d_inode;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn *fc = ff->fc;
struct fuse_ioctl_in inarg = {
.fh = ff->fh,
.cmd = cmd,
@@ -1627,13 +1674,6 @@
/* assume all the iovs returned by client always fits in a page */
BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
- if (!fuse_allow_task(fc, current))
- return -EACCES;
-
- err = -EIO;
- if (is_bad_inode(inode))
- goto out;
-
err = -ENOMEM;
pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
iov_page = alloc_page(GFP_KERNEL);
@@ -1694,7 +1734,7 @@
/* okay, let's send it to the client */
req->in.h.opcode = FUSE_IOCTL;
- req->in.h.nodeid = get_node_id(inode);
+ req->in.h.nodeid = ff->nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
@@ -1777,17 +1817,33 @@
return err ? err : outarg.result;
}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_task(fc, current))
+ return -EACCES;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_do_ioctl(file, cmd, arg, 0);
+ return fuse_file_ioctl_common(file, cmd, arg, 0);
}
static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+ return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}
/*
@@ -1841,11 +1897,10 @@
spin_unlock(&fc->lock);
}
-static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+unsigned fuse_file_poll(struct file *file, poll_table *wait)
{
- struct inode *inode = file->f_dentry->d_inode;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn *fc = ff->fc;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
struct fuse_req *req;
@@ -1870,7 +1925,7 @@
return PTR_ERR(req);
req->in.h.opcode = FUSE_POLL;
- req->in.h.nodeid = get_node_id(inode);
+ req->in.h.nodeid = ff->nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
@@ -1889,6 +1944,7 @@
}
return POLLERR;
}
+EXPORT_SYMBOL_GPL(fuse_file_poll);
/*
* This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6fc5aed..aaf2f9f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -97,8 +97,13 @@
struct list_head writepages;
};
+struct fuse_conn;
+
/** FUSE specific file data */
struct fuse_file {
+ /** Fuse connection for this file */
+ struct fuse_conn *fc;
+
/** Request reserved for flush and release */
struct fuse_req *reserved_req;
@@ -108,9 +113,15 @@
/** File handle used by userspace */
u64 fh;
+ /** Node id of this file */
+ u64 nodeid;
+
/** Refcount */
atomic_t count;
+ /** FOPEN_* flags returned by open */
+ u32 open_flags;
+
/** Entry on inode's write_files list */
struct list_head write_entry;
@@ -185,8 +196,6 @@
FUSE_REQ_FINISHED
};
-struct fuse_conn;
-
/**
* A request to the client
*/
@@ -248,11 +257,12 @@
struct fuse_forget_in forget_in;
struct {
struct fuse_release_in in;
- struct vfsmount *vfsmount;
- struct dentry *dentry;
+ struct path path;
} release;
struct fuse_init_in init_in;
struct fuse_init_out init_out;
+ struct cuse_init_in cuse_init_in;
+ struct cuse_init_out cuse_init_out;
struct {
struct fuse_read_in in;
u64 attr_ver;
@@ -386,6 +396,9 @@
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
+ /** Set if bdi is valid */
+ unsigned bdi_initialized:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -515,25 +528,24 @@
* Initialize READ or READDIR request
*/
void fuse_read_fill(struct fuse_req *req, struct file *file,
- struct inode *inode, loff_t pos, size_t count, int opcode);
+ loff_t pos, size_t count, int opcode);
/**
* Send OPEN or OPENDIR request
*/
-int fuse_open_common(struct inode *inode, struct file *file, int isdir);
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_get(struct fuse_file *ff);
void fuse_file_free(struct fuse_file *ff);
-void fuse_finish_open(struct inode *inode, struct file *file,
- struct fuse_file *ff, struct fuse_open_out *outarg);
+void fuse_finish_open(struct inode *inode, struct file *file);
-/** Fill in ff->reserved_req with a RELEASE request */
-void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode);
+void fuse_sync_release(struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
*/
-int fuse_release_common(struct inode *inode, struct file *file, int isdir);
+void fuse_release_common(struct file *file, int opcode);
/**
* Send FSYNC or FSYNCDIR request
@@ -652,10 +664,12 @@
*/
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+void fuse_conn_kill(struct fuse_conn *fc);
+
/**
* Initialize fuse_conn
*/
-int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+void fuse_conn_init(struct fuse_conn *fc);
/**
* Release reference to fuse_conn
@@ -694,4 +708,13 @@
u64 fuse_get_attr_version(struct fuse_conn *fc);
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ bool isdir);
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, int write);
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags);
+unsigned fuse_file_poll(struct file *file, poll_table *wait);
+int fuse_dev_release(struct inode *inode, struct file *file);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 91f7c85f..f0df55a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -277,11 +277,14 @@
}
}
-static void fuse_put_super(struct super_block *sb)
+static void fuse_bdi_destroy(struct fuse_conn *fc)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ if (fc->bdi_initialized)
+ bdi_destroy(&fc->bdi);
+}
- fuse_send_destroy(fc);
+void fuse_conn_kill(struct fuse_conn *fc)
+{
spin_lock(&fc->lock);
fc->connected = 0;
fc->blocked = 0;
@@ -295,7 +298,16 @@
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(&fuse_mutex);
- bdi_destroy(&fc->bdi);
+ fuse_bdi_destroy(fc);
+}
+EXPORT_SYMBOL_GPL(fuse_conn_kill);
+
+static void fuse_put_super(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fuse_send_destroy(fc);
+ fuse_conn_kill(fc);
fuse_conn_put(fc);
}
@@ -466,10 +478,8 @@
return 0;
}
-int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
+void fuse_conn_init(struct fuse_conn *fc)
{
- int err;
-
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
mutex_init(&fc->inst_mutex);
@@ -484,49 +494,12 @@
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
atomic_set(&fc->num_waiting, 0);
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- fc->bdi.unplug_io_fn = default_unplug_io_fn;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
- fc->dev = sb->s_dev;
- err = bdi_init(&fc->bdi);
- if (err)
- goto error_mutex_destroy;
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
- if (err)
- goto error_bdi_destroy;
- /*
- * For a single fuse filesystem use max 1% of dirty +
- * writeback threshold.
- *
- * This gives about 1M of write buffer for memory maps on a
- * machine with 1G and 10% dirty_ratio, which should be more
- * than enough.
- *
- * Privileged users can raise it by writing to
- *
- * /sys/class/bdi/<bdi>/max_ratio
- */
- bdi_set_max_ratio(&fc->bdi, 1);
fc->reqctr = 0;
fc->blocked = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
-
- return 0;
-
- error_bdi_destroy:
- bdi_destroy(&fc->bdi);
- error_mutex_destroy:
- mutex_destroy(&fc->inst_mutex);
- return err;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -539,12 +512,14 @@
fc->release(fc);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_put);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
{
atomic_inc(&fc->count);
return fc;
}
+EXPORT_SYMBOL_GPL(fuse_conn_get);
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
@@ -797,6 +772,48 @@
kfree(fc);
}
+static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
+{
+ int err;
+
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ fc->bdi.unplug_io_fn = default_unplug_io_fn;
+ /* fuse does it's own writeback accounting */
+ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+
+ err = bdi_init(&fc->bdi);
+ if (err)
+ return err;
+
+ fc->bdi_initialized = 1;
+
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
+ }
+
+ if (err)
+ return err;
+
+ /*
+ * For a single fuse filesystem use max 1% of dirty +
+ * writeback threshold.
+ *
+ * This gives about 1M of write buffer for memory maps on a
+ * machine with 1G and 10% dirty_ratio, which should be more
+ * than enough.
+ *
+ * Privileged users can raise it by writing to
+ *
+ * /sys/class/bdi/<bdi>/max_ratio
+ */
+ bdi_set_max_ratio(&fc->bdi, 1);
+
+ return 0;
+}
+
static int fuse_fill_super(struct super_block *sb, void *data, int silent)
{
struct fuse_conn *fc;
@@ -843,11 +860,12 @@
if (!fc)
goto err_fput;
- err = fuse_conn_init(fc, sb);
- if (err) {
- kfree(fc);
- goto err_fput;
- }
+ fuse_conn_init(fc);
+
+ fc->dev = sb->s_dev;
+ err = fuse_bdi_init(fc, sb);
+ if (err)
+ goto err_put_conn;
fc->release = fuse_free_conn;
fc->flags = d.flags;
@@ -911,7 +929,7 @@
err_put_root:
dput(root_dentry);
err_put_conn:
- bdi_destroy(&fc->bdi);
+ fuse_bdi_destroy(fc);
fuse_conn_put(fc);
err_fput:
fput(file);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0af3608..1a9c787 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -556,27 +556,49 @@
/* add partitions */
for (p = 1; p < state->limit; p++) {
- sector_t size = state->parts[p].size;
- sector_t from = state->parts[p].from;
+ sector_t size, from;
+try_scan:
+ size = state->parts[p].size;
if (!size)
continue;
+
+ from = state->parts[p].from;
if (from >= get_capacity(disk)) {
printk(KERN_WARNING
"%s: p%d ignored, start %llu is behind the end of the disk\n",
disk->disk_name, p, (unsigned long long) from);
continue;
}
+
if (from + size > get_capacity(disk)) {
- /*
- * we can not ignore partitions of broken tables
- * created by for example camera firmware, but we
- * limit them to the end of the disk to avoid
- * creating invalid block devices
- */
+ struct block_device_operations *bdops = disk->fops;
+ unsigned long long capacity;
+
printk(KERN_WARNING
- "%s: p%d size %llu limited to end of disk\n",
+ "%s: p%d size %llu exceeds device capacity, ",
disk->disk_name, p, (unsigned long long) size);
- size = get_capacity(disk) - from;
+
+ if (bdops->set_capacity &&
+ (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
+ printk(KERN_CONT "enabling native capacity\n");
+ capacity = bdops->set_capacity(disk, ~0ULL);
+ disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+ if (capacity > get_capacity(disk)) {
+ set_capacity(disk, capacity);
+ check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
+ }
+ goto try_scan;
+ } else {
+ /*
+ * we can not ignore partitions of broken tables
+ * created by for example camera firmware, but
+ * we limit them to the end of the disk to avoid
+ * creating invalid block devices
+ */
+ printk(KERN_CONT "limited to end of disk\n");
+ size = get_capacity(disk) - from;
+ }
}
part = add_partition(disk, p, from, size,
state->parts[p].flags);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ebdfde8..0b1a6ca 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1226,6 +1226,8 @@
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);
int (*media_changed) (struct gendisk *);
+ unsigned long long (*set_capacity) (struct gendisk *,
+ unsigned long long);
int (*revalidate_disk) (struct gendisk *);
int (*getgeo)(struct block_device *, struct hd_geometry *);
struct module *owner;
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 37bcb50..04fb513 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -261,6 +261,11 @@
# define __section(S) __attribute__ ((__section__(#S)))
#endif
+/* Are two types/vars the same type (ignoring qualifiers)? */
+#ifndef __same_type
+# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#endif
+
/*
* Prevent the compiler from merging or refetching accesses. The compiler
* is also forbidden from reordering successive instances of ACCESS_ONCE(),
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 162e5de..d41ed59 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -121,6 +121,13 @@
#define FUSE_BIG_WRITES (1 << 5)
/**
+ * CUSE INIT request/reply flags
+ *
+ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl
+ */
+#define CUSE_UNRESTRICTED_IOCTL (1 << 0)
+
+/**
* Release flags
*/
#define FUSE_RELEASE_FLUSH (1 << 0)
@@ -210,6 +217,9 @@
FUSE_DESTROY = 38,
FUSE_IOCTL = 39,
FUSE_POLL = 40,
+
+ /* CUSE specific operations */
+ CUSE_INIT = 4096,
};
enum fuse_notify_code {
@@ -401,6 +411,27 @@
__u32 max_write;
};
+#define CUSE_INIT_INFO_MAX 4096
+
+struct cuse_init_in {
+ __u32 major;
+ __u32 minor;
+ __u32 unused;
+ __u32 flags;
+};
+
+struct cuse_init_out {
+ __u32 major;
+ __u32 minor;
+ __u32 unused;
+ __u32 flags;
+ __u32 max_read;
+ __u32 max_write;
+ __u32 dev_major; /* chardev major */
+ __u32 dev_minor; /* chardev minor */
+ __u32 spare[10];
+};
+
struct fuse_interrupt_in {
__u64 unique;
};
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 149fda2..7cbd38d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -114,6 +114,7 @@
#define GENHD_FL_UP 16
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
+#define GENHD_FL_NATIVE_CAPACITY 128
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 867cb68..a6c6a2f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -178,7 +178,7 @@
/*
* Structure to hold all information about the location of this port
*/
-typedef struct hw_regs_s {
+struct ide_hw {
union {
struct ide_io_ports io_ports;
unsigned long io_ports_array[IDE_NR_PORTS];
@@ -186,12 +186,11 @@
int irq; /* our irq number */
ide_ack_intr_t *ack_intr; /* acknowledge interrupt */
- hwif_chipset_t chipset;
struct device *dev, *parent;
unsigned long config;
-} hw_regs_t;
+};
-static inline void ide_std_init_ports(hw_regs_t *hw,
+static inline void ide_std_init_ports(struct ide_hw *hw,
unsigned long io_addr,
unsigned long ctl_addr)
{
@@ -218,21 +217,12 @@
/*
* Special Driver Flags
- *
- * set_geometry : respecify drive geometry
- * recalibrate : seek to cyl 0
- * set_multmode : set multmode count
- * reserved : unused
*/
-typedef union {
- unsigned all : 8;
- struct {
- unsigned set_geometry : 1;
- unsigned recalibrate : 1;
- unsigned set_multmode : 1;
- unsigned reserved : 5;
- } b;
-} special_t;
+enum {
+ IDE_SFLAG_SET_GEOMETRY = (1 << 0),
+ IDE_SFLAG_RECALIBRATE = (1 << 1),
+ IDE_SFLAG_SET_MULTMODE = (1 << 2),
+};
/*
* Status returned from various ide_ functions
@@ -391,6 +381,7 @@
struct ide_disk_ops {
int (*check)(struct ide_drive_s *, const char *);
int (*get_capacity)(struct ide_drive_s *);
+ u64 (*set_capacity)(struct ide_drive_s *, u64);
void (*setup)(struct ide_drive_s *);
void (*flush)(struct ide_drive_s *);
int (*init_media)(struct ide_drive_s *, struct gendisk *);
@@ -468,6 +459,8 @@
IDE_DFLAG_NICE1 = (1 << 5),
/* device is physically present */
IDE_DFLAG_PRESENT = (1 << 6),
+ /* disable Host Protected Area */
+ IDE_DFLAG_NOHPA = (1 << 7),
/* id read from device (synthetic if not set) */
IDE_DFLAG_ID_READ = (1 << 8),
IDE_DFLAG_NOPROBE = (1 << 9),
@@ -506,6 +499,7 @@
/* write protect */
IDE_DFLAG_WP = (1 << 29),
IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 30),
+ IDE_DFLAG_NIEN_QUIRK = (1 << 31),
};
struct ide_drive_s {
@@ -530,14 +524,13 @@
unsigned long sleep; /* sleep until this time */
unsigned long timeout; /* max time to wait for irq */
- special_t special; /* special action flags */
+ u8 special_flags; /* special action flags */
u8 select; /* basic drive/head select reg value */
u8 retry_pio; /* retrying dma capable host in pio */
u8 waiting_for_dma; /* dma currently in progress */
u8 dma; /* atapi dma flag */
- u8 quirk_list; /* considered quirky, set for a specific host */
u8 init_speed; /* transfer rate set at boot */
u8 current_speed; /* current transfer rate set */
u8 desired_speed; /* desired transfer rate set */
@@ -562,8 +555,7 @@
unsigned int drive_data; /* used by set_pio_mode/dev_select() */
unsigned int failures; /* current failure count */
unsigned int max_failures; /* maximum allowed failure count */
- u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */
-
+ u64 probed_capacity;/* initial/native media capacity */
u64 capacity64; /* total number of sectors */
int lun; /* logical unit */
@@ -1222,7 +1214,7 @@
}
void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *,
- hw_regs_t *, hw_regs_t **);
+ struct ide_hw *, struct ide_hw **);
void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *);
#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
@@ -1461,16 +1453,18 @@
void ide_register_region(struct gendisk *);
void ide_unregister_region(struct gendisk *);
+void ide_check_nien_quirk_list(ide_drive_t *);
void ide_undecoded_slave(ide_drive_t *);
void ide_port_apply_params(ide_hwif_t *);
int ide_sysfs_register_port(ide_hwif_t *);
-struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **);
+struct ide_host *ide_host_alloc(const struct ide_port_info *, struct ide_hw **,
+ unsigned int);
void ide_host_free(struct ide_host *);
int ide_host_register(struct ide_host *, const struct ide_port_info *,
- hw_regs_t **);
-int ide_host_add(const struct ide_port_info *, hw_regs_t **,
+ struct ide_hw **);
+int ide_host_add(const struct ide_port_info *, struct ide_hw **, unsigned int,
struct ide_host **);
void ide_host_remove(struct ide_host *);
int ide_legacy_device_add(const struct ide_port_info *, unsigned long);
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 175e63f..7bc1440 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -30,6 +30,10 @@
/* Wallclock time set by the Host. */
struct timespec time;
+ /* Interrupt pending set by the Host. The Guest should do a hypercall
+ * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */
+ int irq_pending;
+
/* Async hypercall ring. Instead of directly making hypercalls, we can
* place them in here for processing the next time the Host wants.
* This batching can be quite efficient. */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index a53407a..bfefbdf 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -57,7 +57,8 @@
LHREQ_INITIALIZE, /* + base, pfnlimit, start */
LHREQ_GETDMA, /* No longer used */
LHREQ_IRQ, /* + irq */
- LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+ LHREQ_BREAK, /* No longer used */
+ LHREQ_EVENTFD, /* + address, fd. */
};
/* The alignment to use between consumer and producer parts of vring.
diff --git a/include/linux/module.h b/include/linux/module.h
index a8f2c0a..a7bc6e7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -77,6 +77,7 @@
void sort_extable(struct exception_table_entry *start,
struct exception_table_entry *finish);
void sort_main_extable(void);
+void trim_init_extable(struct module *m);
#ifdef MODULE
#define MODULE_GENERIC_TABLE(gtype,name) \
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index a4f0b93..6547c3c 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -36,9 +36,14 @@
/* Returns length written or -errno. Buffer is 4k (ie. be short!) */
typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
+/* Flag bits for kernel_param.flags */
+#define KPARAM_KMALLOCED 1
+#define KPARAM_ISBOOL 2
+
struct kernel_param {
const char *name;
- unsigned int perm;
+ u16 perm;
+ u16 flags;
param_set_fn set;
param_get_fn get;
union {
@@ -79,7 +84,7 @@
parameters. perm sets the visibility in sysfs: 000 means it's
not there, read bits mean it's readable, write bits mean it's
writable. */
-#define __module_param_call(prefix, name, set, get, arg, perm) \
+#define __module_param_call(prefix, name, set, get, arg, isbool, perm) \
/* Default value instead of permissions? */ \
static int __param_perm_check_##name __attribute__((unused)) = \
BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2)) \
@@ -88,10 +93,13 @@
static struct kernel_param __moduleparam_const __param_##name \
__used \
__attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
- = { __param_str_##name, perm, set, get, { arg } }
+ = { __param_str_##name, perm, isbool ? KPARAM_ISBOOL : 0, \
+ set, get, { arg } }
#define module_param_call(name, set, get, arg, perm) \
- __module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm)
+ __module_param_call(MODULE_PARAM_PREFIX, \
+ name, set, get, arg, \
+ __same_type(*(arg), bool), perm)
/* Helper functions: type is byte, short, ushort, int, uint, long,
ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
@@ -120,15 +128,16 @@
#define core_param(name, var, type, perm) \
param_check_##type(name, &(var)); \
__module_param_call("", name, param_set_##type, param_get_##type, \
- &var, perm)
+ &var, __same_type(var, bool), perm)
#endif /* !MODULE */
/* Actually copy string: maxlen param is usually sizeof(string). */
#define module_param_string(name, string, len, perm) \
static const struct kparam_string __param_string_##name \
= { len, string }; \
- module_param_call(name, param_set_copystring, param_get_string, \
- .str = &__param_string_##name, perm); \
+ __module_param_call(MODULE_PARAM_PREFIX, name, \
+ param_set_copystring, param_get_string, \
+ .str = &__param_string_##name, 0, perm); \
__MODULE_PARM_TYPE(name, "string")
/* Called on module insert or kernel boot */
@@ -186,21 +195,30 @@
extern int param_get_charp(char *buffer, struct kernel_param *kp);
#define param_check_charp(name, p) __param_check(name, p, char *)
+/* For historical reasons "bool" parameters can be (unsigned) "int". */
extern int param_set_bool(const char *val, struct kernel_param *kp);
extern int param_get_bool(char *buffer, struct kernel_param *kp);
-#define param_check_bool(name, p) __param_check(name, p, int)
+#define param_check_bool(name, p) \
+ static inline void __check_##name(void) \
+ { \
+ BUILD_BUG_ON(!__same_type(*(p), bool) && \
+ !__same_type(*(p), unsigned int) && \
+ !__same_type(*(p), int)); \
+ }
extern int param_set_invbool(const char *val, struct kernel_param *kp);
extern int param_get_invbool(char *buffer, struct kernel_param *kp);
-#define param_check_invbool(name, p) __param_check(name, p, int)
+#define param_check_invbool(name, p) __param_check(name, p, bool)
/* Comma-separated array: *nump is set to number they actually specified. */
#define module_param_array_named(name, array, type, nump, perm) \
static const struct kparam_array __param_arr_##name \
= { ARRAY_SIZE(array), nump, param_set_##type, param_get_##type,\
sizeof(array[0]), array }; \
- module_param_call(name, param_array_set, param_array_get, \
- .arr = &__param_arr_##name, perm); \
+ __module_param_call(MODULE_PARAM_PREFIX, name, \
+ param_array_set, param_array_get, \
+ .arr = &__param_arr_##name, \
+ __same_type(array[0], bool), perm); \
__MODULE_PARM_TYPE(name, "array of " #type)
#define module_param_array(name, type, nump, perm) \
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 06005fa..4fca4f5 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -10,14 +10,17 @@
/**
* virtqueue - a queue to register buffers for sending or receiving.
+ * @list: the chain of virtqueues for this device
* @callback: the function to call when buffers are consumed (can be NULL).
+ * @name: the name of this virtqueue (mainly for debugging)
* @vdev: the virtio device this queue was created for.
* @vq_ops: the operations for this virtqueue (see below).
* @priv: a pointer for the virtqueue implementation to use.
*/
-struct virtqueue
-{
+struct virtqueue {
+ struct list_head list;
void (*callback)(struct virtqueue *vq);
+ const char *name;
struct virtio_device *vdev;
struct virtqueue_ops *vq_ops;
void *priv;
@@ -76,15 +79,16 @@
* @dev: underlying device.
* @id: the device type identification (used to match it with a driver).
* @config: the configuration ops for this device.
+ * @vqs: the list of virtqueues for this device.
* @features: the features supported by both driver and device.
* @priv: private pointer for the driver's use.
*/
-struct virtio_device
-{
+struct virtio_device {
int index;
struct device dev;
struct virtio_device_id id;
struct virtio_config_ops *config;
+ struct list_head vqs;
/* Note that this is a Linux set_bit-style bitmap. */
unsigned long features[1];
void *priv;
@@ -99,8 +103,7 @@
* @id_table: the ids serviced by this driver.
* @feature_table: an array of feature numbers supported by this device.
* @feature_table_size: number of entries in the feature table array.
- * @probe: the function to call when a device is found. Returns a token for
- * remove, or PTR_ERR().
+ * @probe: the function to call when a device is found. Returns 0 or -errno.
* @remove: the function when a device is removed.
* @config_changed: optional function to call when the device configuration
* changes; may be called in interrupt context.
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index bf8ec28..99f5145 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -29,6 +29,7 @@
#define VIRTIO_F_NOTIFY_ON_EMPTY 24
#ifdef __KERNEL__
+#include <linux/err.h>
#include <linux/virtio.h>
/**
@@ -49,15 +50,26 @@
* @set_status: write the status byte
* vdev: the virtio_device
* status: the new status byte
+ * @request_vqs: request the specified number of virtqueues
+ * vdev: the virtio_device
+ * max_vqs: the max number of virtqueues we want
+ * If supplied, must call before any virtqueues are instantiated.
+ * To modify the max number of virtqueues after request_vqs has been
+ * called, call free_vqs and then request_vqs with a new value.
+ * @free_vqs: cleanup resources allocated by request_vqs
+ * vdev: the virtio_device
+ * If supplied, must call after all virtqueues have been deleted.
* @reset: reset the device
* vdev: the virtio device
* After this, status and feature negotiation must be done again
- * @find_vq: find a virtqueue and instantiate it.
+ * @find_vqs: find virtqueues and instantiate them.
* vdev: the virtio_device
- * index: the 0-based virtqueue number in case there's more than one.
- * callback: the virqtueue callback
- * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT).
- * @del_vq: free a virtqueue found by find_vq().
+ * nvqs: the number of virtqueues to find
+ * vqs: on success, includes new virtqueues
+ * callbacks: array of callbacks, for each virtqueue
+ * names: array of virtqueue names (mainly for debugging)
+ * Returns 0 on success or error status
+ * @del_vqs: free virtqueues found by find_vqs().
* @get_features: get the array of feature bits for this device.
* vdev: the virtio_device
* Returns the first 32 feature bits (all we currently need).
@@ -66,6 +78,7 @@
* This gives the final feature bits for the device: it can change
* the dev->feature bits if it wants.
*/
+typedef void vq_callback_t(struct virtqueue *);
struct virtio_config_ops
{
void (*get)(struct virtio_device *vdev, unsigned offset,
@@ -75,10 +88,11 @@
u8 (*get_status)(struct virtio_device *vdev);
void (*set_status)(struct virtio_device *vdev, u8 status);
void (*reset)(struct virtio_device *vdev);
- struct virtqueue *(*find_vq)(struct virtio_device *vdev,
- unsigned index,
- void (*callback)(struct virtqueue *));
- void (*del_vq)(struct virtqueue *vq);
+ int (*find_vqs)(struct virtio_device *, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[]);
+ void (*del_vqs)(struct virtio_device *);
u32 (*get_features)(struct virtio_device *vdev);
void (*finalize_features)(struct virtio_device *vdev);
};
@@ -99,7 +113,9 @@
if (__builtin_constant_p(fbit))
BUILD_BUG_ON(fbit >= 32);
- virtio_check_driver_offered_feature(vdev, fbit);
+ if (fbit < VIRTIO_TRANSPORT_F_START)
+ virtio_check_driver_offered_feature(vdev, fbit);
+
return test_bit(fbit, vdev->features);
}
@@ -126,5 +142,18 @@
vdev->config->get(vdev, offset, buf, len);
return 0;
}
+
+static inline
+struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
+ vq_callback_t *c, const char *n)
+{
+ vq_callback_t *callbacks[] = { c };
+ const char *names[] = { n };
+ struct virtqueue *vq;
+ int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names);
+ if (err < 0)
+ return ERR_PTR(err);
+ return vq;
+}
#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_CONFIG_H */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index cd0fd5d..9a3d7c4 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -47,9 +47,17 @@
/* The bit of the ISR which indicates a device configuration change. */
#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* MSI-X registers: only enabled if MSI-X is enabled. */
+/* A 16-bit vector for configuration changes. */
+#define VIRTIO_MSI_CONFIG_VECTOR 20
+/* A 16-bit vector for selected queue notifications. */
+#define VIRTIO_MSI_QUEUE_VECTOR 22
+/* Vector value used to disable MSI for queue */
+#define VIRTIO_MSI_NO_VECTOR 0xffff
+
/* The remaining space is defined by each driver as the per-driver
* configuration space */
-#define VIRTIO_PCI_CONFIG 20
+#define VIRTIO_PCI_CONFIG(dev) ((dev)->msix_enabled ? 24 : 20)
/* Virtio ABI version, this must match exactly */
#define VIRTIO_PCI_ABI_VERSION 0
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 71e0372..693e0ec 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -14,6 +14,8 @@
#define VRING_DESC_F_NEXT 1
/* This marks a buffer as write-only (otherwise read-only). */
#define VRING_DESC_F_WRITE 2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT 4
/* The Host uses this in used->flags to advise the Guest: don't kick me when
* you add a buffer. It's unreliable, so it's simply an optimization. Guest
@@ -24,6 +26,9 @@
* optimization. */
#define VRING_AVAIL_F_NO_INTERRUPT 1
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC 28
+
/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc
{
@@ -119,7 +124,8 @@
struct virtio_device *vdev,
void *pages,
void (*notify)(struct virtqueue *vq),
- void (*callback)(struct virtqueue *vq));
+ void (*callback)(struct virtqueue *vq),
+ const char *name);
void vring_del_virtqueue(struct virtqueue *vq);
/* Filter out transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev);
diff --git a/kernel/module.c b/kernel/module.c
index 35f7de0..e4ab36c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2455,6 +2455,7 @@
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
+ trim_init_extable(mod);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
diff --git a/kernel/params.c b/kernel/params.c
index de273ec..7f6912c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
#include <linux/err.h>
#include <linux/slab.h>
-/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
-#define KPARAM_KMALLOCED 0x80000000
-
#if 0
#define DEBUGP printk
#else
@@ -220,13 +217,13 @@
return -ENOSPC;
}
- if (kp->perm & KPARAM_KMALLOCED)
+ if (kp->flags & KPARAM_KMALLOCED)
kfree(*(char **)kp->arg);
/* This is a hack. We can't need to strdup in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
- kp->perm |= KPARAM_KMALLOCED;
+ kp->flags |= KPARAM_KMALLOCED;
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
if (!kp->arg)
return -ENOMEM;
@@ -241,44 +238,63 @@
return sprintf(buffer, "%s", *((char **)kp->arg));
}
+/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, struct kernel_param *kp)
{
+ bool v;
+
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
switch (val[0]) {
case 'y': case 'Y': case '1':
- *(int *)kp->arg = 1;
- return 0;
+ v = true;
+ break;
case 'n': case 'N': case '0':
- *(int *)kp->arg = 0;
- return 0;
+ v = false;
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+
+ if (kp->flags & KPARAM_ISBOOL)
+ *(bool *)kp->arg = v;
+ else
+ *(int *)kp->arg = v;
+ return 0;
}
int param_get_bool(char *buffer, struct kernel_param *kp)
{
+ bool val;
+ if (kp->flags & KPARAM_ISBOOL)
+ val = *(bool *)kp->arg;
+ else
+ val = *(int *)kp->arg;
+
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+ return sprintf(buffer, "%c", val ? 'Y' : 'N');
}
+/* This one must be bool. */
int param_set_invbool(const char *val, struct kernel_param *kp)
{
- int boolval, ret;
+ int ret;
+ bool boolval;
struct kernel_param dummy;
dummy.arg = &boolval;
+ dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
- *(int *)kp->arg = !boolval;
+ *(bool *)kp->arg = !boolval;
return ret;
}
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
}
/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@
unsigned int i;
for (i = 0; i < num; i++)
- if (params[i].perm & KPARAM_KMALLOCED)
+ if (params[i].flags & KPARAM_KMALLOCED)
kfree(*(char **)params[i].arg);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index f04aa96..8ec9d13 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@
smp_send_reschedule(cpu);
preempt_enable();
}
+EXPORT_SYMBOL_GPL(kick_process);
/*
* Return a low guess at the load of a migration-source cpu weighted
diff --git a/lib/extable.c b/lib/extable.c
index 179c087..4cac81e 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -39,7 +39,26 @@
sort(start, finish - start, sizeof(struct exception_table_entry),
cmp_ex, NULL);
}
-#endif
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries && within_module_init(m->extable[0].insn, m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(m->extable[m->num_exentries-1].insn, m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+#endif /* !ARCH_HAS_SORT_EXTABLE */
#ifndef ARCH_HAS_SEARCH_EXTABLE
/*
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index bb8579a..a49484e 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -246,7 +246,7 @@
chan->vdev = vdev;
/* We expect one virtqueue, for requests. */
- chan->vq = vdev->config->find_vq(vdev, 0, req_done);
+ chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
if (IS_ERR(chan->vq)) {
err = PTR_ERR(chan->vq);
goto out_free_vq;
@@ -261,7 +261,7 @@
return 0;
out_free_vq:
- vdev->config->del_vq(chan->vq);
+ vdev->config->del_vqs(vdev);
fail:
mutex_lock(&virtio_9p_lock);
chan_index--;
@@ -332,7 +332,7 @@
BUG_ON(chan->inuse);
if (chan->initialized) {
- vdev->config->del_vq(chan->vq);
+ vdev->config->del_vqs(vdev);
chan->initialized = false;
}
}
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index a334428..40e0045 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -641,7 +641,7 @@
id->vendor = TO_NATIVE(id->vendor);
strcpy(alias, "virtio:");
- ADD(alias, "d", 1, id->device);
+ ADD(alias, "d", id->device != VIRTIO_DEV_ANY_ID, id->device);
ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor);
add_wildcard(alias);