UHCI: Eliminate asynchronous skeleton Queue Headers

This patch (as856) attempts to improve the performance of uhci-hcd by
removing the asynchronous skeleton Queue Headers.  They don't contain
any useful information but the controller has to read through them at
least once every millisecond, incurring a non-zero DMA overhead.

Now all the asynchronous queues are combined, along with the period-1
interrupt queue, into a single list with a single skeleton QH.  The
start of the low-speed control, full-speed control, and bulk sublists
is determined by linear search.  Since there should rarely be more
than a couple of QHs in the list, the searches should incur a much
smaller total load than keeping the skeleton QHs.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index 1f0833a..44da433 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -13,7 +13,7 @@
  * (C) Copyright 2000 Yggdrasil Computing, Inc. (port of new PCI interface
  *               support from usb-ohci.c by Adam Richter, adam@yggdrasil.com).
  * (C) Copyright 1999 Gregory P. Smith (from usb-ohci.c)
- * (C) Copyright 2004-2006 Alan Stern, stern@rowland.harvard.edu
+ * (C) Copyright 2004-2007 Alan Stern, stern@rowland.harvard.edu
  *
  * Intel documents this fairly well, and as far as I know there
  * are no royalties or anything like that, but even so there are
@@ -107,10 +107,10 @@
 	 * interrupt QHs, which will help spread out bandwidth utilization.
 	 *
 	 * ffs (Find First bit Set) does exactly what we need:
-	 * 1,3,5,...  => ffs = 0 => use skel_int2_qh = skelqh[8],
-	 * 2,6,10,... => ffs = 1 => use skel_int4_qh = skelqh[7], etc.
+	 * 1,3,5,...  => ffs = 0 => use period-2 QH = skelqh[8],
+	 * 2,6,10,... => ffs = 1 => use period-4 QH = skelqh[7], etc.
 	 * ffs >= 7 => not on any high-period queue, so use
-	 *	skel_int1_qh = skelqh[9].
+	 *	period-1 QH = skelqh[9].
 	 * Add in UHCI_NUMFRAMES to insure at least one bit is set.
 	 */
 	skelnum = 8 - (int) __ffs(frame | UHCI_NUMFRAMES);
@@ -540,16 +540,18 @@
  *
  * The hardware doesn't really know any difference
  * in the queues, but the order does matter for the
- * protocols higher up. The order is:
+ * protocols higher up.  The order in which the queues
+ * are encountered by the hardware is:
  *
- *  - any isochronous events handled before any
+ *  - All isochronous events are handled before any
  *    of the queues. We don't do that here, because
  *    we'll create the actual TD entries on demand.
- *  - The first queue is the interrupt queue.
- *  - The second queue is the control queue, split into low- and full-speed
- *  - The third queue is bulk queue.
- *  - The fourth queue is the bandwidth reclamation queue, which loops back
- *    to the full-speed control queue.
+ *  - The first queue is the high-period interrupt queue.
+ *  - The second queue is the period-1 interrupt and async
+ *    (low-speed control, full-speed control, then bulk) queue.
+ *  - The third queue is the terminating bandwidth reclamation queue,
+ *    which contains no members, loops back to itself, and is present
+ *    only when FSBR is on and there are no full-speed control or bulk QHs.
  */
 static int uhci_start(struct usb_hcd *hcd)
 {
@@ -626,30 +628,18 @@
 	}
 
 	/*
-	 * 8 Interrupt queues; link all higher int queues to int1,
-	 * then link int1 to control and control to bulk
+	 * 8 Interrupt queues; link all higher int queues to int1 = async
 	 */
-	uhci->skel_int128_qh->link =
-			uhci->skel_int64_qh->link =
-			uhci->skel_int32_qh->link =
-			uhci->skel_int16_qh->link =
-			uhci->skel_int8_qh->link =
-			uhci->skel_int4_qh->link =
-			uhci->skel_int2_qh->link = LINK_TO_QH(
-				uhci->skel_int1_qh);
-
-	uhci->skel_int1_qh->link = LINK_TO_QH(uhci->skel_ls_control_qh);
-	uhci->skel_ls_control_qh->link = LINK_TO_QH(uhci->skel_fs_control_qh);
-	uhci->skel_fs_control_qh->link = LINK_TO_QH(uhci->skel_bulk_qh);
-	uhci->skel_bulk_qh->link = LINK_TO_QH(uhci->skel_term_qh);
+	for (i = SKEL_ISO + 1; i < SKEL_ASYNC; ++i)
+		uhci->skelqh[i]->link = LINK_TO_QH(uhci->skel_async_qh);
+	uhci->skel_async_qh->link = uhci->skel_term_qh->link = UHCI_PTR_TERM;
 
 	/* This dummy TD is to work around a bug in Intel PIIX controllers */
 	uhci_fill_td(uhci->term_td, 0, uhci_explen(0) |
-		(0x7f << TD_TOKEN_DEVADDR_SHIFT) | USB_PID_IN, 0);
-	uhci->term_td->link = LINK_TO_TD(uhci->term_td);
-
-	uhci->skel_term_qh->link = UHCI_PTR_TERM;
-	uhci->skel_term_qh->element = LINK_TO_TD(uhci->term_td);
+			(0x7f << TD_TOKEN_DEVADDR_SHIFT) | USB_PID_IN, 0);
+	uhci->term_td->link = UHCI_PTR_TERM;
+	uhci->skel_async_qh->element = uhci->skel_term_qh->element =
+			LINK_TO_TD(uhci->term_td);
 
 	/*
 	 * Fill the frame list: make all entries point to the proper