USB: EHCI: split ehci_qh into hw and sw parts

The ehci_qh structure merged hw and sw together which is not good:
1. More and more items are being added into ehci_qh, the ehci_qh software
   part are unnecessary to be allocated in DMA qh_pool.
2. If HCD has local SRAM, the sw part will consume it too, and it won't
   bring any benefit.
3. For non-cache-coherence system, the entire ehci_qh is uncachable, actually
   we only need the hw part to be uncacheable. Spliting them will let the sw
   part to be cacheable.

Signed-off-by: Alek Du <alek.du@intel.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
CC: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c
index 7673554..377ed53 100644
--- a/drivers/usb/host/ehci-q.c
+++ b/drivers/usb/host/ehci-q.c
@@ -87,31 +87,33 @@
 static inline void
 qh_update (struct ehci_hcd *ehci, struct ehci_qh *qh, struct ehci_qtd *qtd)
 {
+	struct ehci_qh_hw *hw = qh->hw;
+
 	/* writes to an active overlay are unsafe */
 	BUG_ON(qh->qh_state != QH_STATE_IDLE);
 
-	qh->hw_qtd_next = QTD_NEXT(ehci, qtd->qtd_dma);
-	qh->hw_alt_next = EHCI_LIST_END(ehci);
+	hw->hw_qtd_next = QTD_NEXT(ehci, qtd->qtd_dma);
+	hw->hw_alt_next = EHCI_LIST_END(ehci);
 
 	/* Except for control endpoints, we make hardware maintain data
 	 * toggle (like OHCI) ... here (re)initialize the toggle in the QH,
 	 * and set the pseudo-toggle in udev. Only usb_clear_halt() will
 	 * ever clear it.
 	 */
-	if (!(qh->hw_info1 & cpu_to_hc32(ehci, 1 << 14))) {
+	if (!(hw->hw_info1 & cpu_to_hc32(ehci, 1 << 14))) {
 		unsigned	is_out, epnum;
 
 		is_out = !(qtd->hw_token & cpu_to_hc32(ehci, 1 << 8));
-		epnum = (hc32_to_cpup(ehci, &qh->hw_info1) >> 8) & 0x0f;
+		epnum = (hc32_to_cpup(ehci, &hw->hw_info1) >> 8) & 0x0f;
 		if (unlikely (!usb_gettoggle (qh->dev, epnum, is_out))) {
-			qh->hw_token &= ~cpu_to_hc32(ehci, QTD_TOGGLE);
+			hw->hw_token &= ~cpu_to_hc32(ehci, QTD_TOGGLE);
 			usb_settoggle (qh->dev, epnum, is_out, 1);
 		}
 	}
 
 	/* HC must see latest qtd and qh data before we clear ACTIVE+HALT */
 	wmb ();
-	qh->hw_token &= cpu_to_hc32(ehci, QTD_TOGGLE | QTD_STS_PING);
+	hw->hw_token &= cpu_to_hc32(ehci, QTD_TOGGLE | QTD_STS_PING);
 }
 
 /* if it weren't for a common silicon quirk (writing the dummy into the qh
@@ -129,7 +131,7 @@
 		qtd = list_entry (qh->qtd_list.next,
 				struct ehci_qtd, qtd_list);
 		/* first qtd may already be partially processed */
-		if (cpu_to_hc32(ehci, qtd->qtd_dma) == qh->hw_current)
+		if (cpu_to_hc32(ehci, qtd->qtd_dma) == qh->hw->hw_current)
 			qtd = NULL;
 	}
 
@@ -260,7 +262,7 @@
 		struct ehci_qh	*qh = (struct ehci_qh *) urb->hcpriv;
 
 		/* S-mask in a QH means it's an interrupt urb */
-		if ((qh->hw_info2 & cpu_to_hc32(ehci, QH_SMASK)) != 0) {
+		if ((qh->hw->hw_info2 & cpu_to_hc32(ehci, QH_SMASK)) != 0) {
 
 			/* ... update hc-wide periodic stats (for usbfs) */
 			ehci_to_hcd(ehci)->self.bandwidth_int_reqs--;
@@ -315,6 +317,7 @@
 	unsigned		count = 0;
 	u8			state;
 	__le32			halt = HALT_BIT(ehci);
+	struct ehci_qh_hw	*hw = qh->hw;
 
 	if (unlikely (list_empty (&qh->qtd_list)))
 		return count;
@@ -392,7 +395,8 @@
 					qtd->hw_token = cpu_to_hc32(ehci,
 							token);
 					wmb();
-					qh->hw_token = cpu_to_hc32(ehci, token);
+					hw->hw_token = cpu_to_hc32(ehci,
+							token);
 					goto retry_xacterr;
 				}
 				stopped = 1;
@@ -435,8 +439,8 @@
 			/* qh unlinked; token in overlay may be most current */
 			if (state == QH_STATE_IDLE
 					&& cpu_to_hc32(ehci, qtd->qtd_dma)
-						== qh->hw_current) {
-				token = hc32_to_cpu(ehci, qh->hw_token);
+						== hw->hw_current) {
+				token = hc32_to_cpu(ehci, hw->hw_token);
 
 				/* An unlink may leave an incomplete
 				 * async transaction in the TT buffer.
@@ -449,9 +453,9 @@
 			 * patch the qh later and so that completions can't
 			 * activate it while we "know" it's stopped.
 			 */
-			if ((halt & qh->hw_token) == 0) {
+			if ((halt & hw->hw_token) == 0) {
 halt:
-				qh->hw_token |= halt;
+				hw->hw_token |= halt;
 				wmb ();
 			}
 		}
@@ -510,7 +514,7 @@
 	 * it after fault cleanup, or recovering from silicon wrongly
 	 * overlaying the dummy qtd (which reduces DMA chatter).
 	 */
-	if (stopped != 0 || qh->hw_qtd_next == EHCI_LIST_END(ehci)) {
+	if (stopped != 0 || hw->hw_qtd_next == EHCI_LIST_END(ehci)) {
 		switch (state) {
 		case QH_STATE_IDLE:
 			qh_refresh(ehci, qh);
@@ -528,7 +532,7 @@
 			 * except maybe high bandwidth ...
 			 */
 			if ((cpu_to_hc32(ehci, QH_SMASK)
-					& qh->hw_info2) != 0) {
+					& hw->hw_info2) != 0) {
 				intr_deschedule (ehci, qh);
 				(void) qh_schedule (ehci, qh);
 			} else
@@ -649,7 +653,7 @@
 		 * (this will usually be overridden later.)
 		 */
 		if (is_input)
-			qtd->hw_alt_next = ehci->async->hw_alt_next;
+			qtd->hw_alt_next = ehci->async->hw->hw_alt_next;
 
 		/* qh makes control packets use qtd toggle; maybe switch it */
 		if ((maxpacket & (this_qtd_len + (maxpacket - 1))) == 0)
@@ -744,6 +748,7 @@
 	int			is_input, type;
 	int			maxp = 0;
 	struct usb_tt		*tt = urb->dev->tt;
+	struct ehci_qh_hw	*hw;
 
 	if (!qh)
 		return qh;
@@ -890,8 +895,9 @@
 
 	/* init as live, toggle clear, advance to dummy */
 	qh->qh_state = QH_STATE_IDLE;
-	qh->hw_info1 = cpu_to_hc32(ehci, info1);
-	qh->hw_info2 = cpu_to_hc32(ehci, info2);
+	hw = qh->hw;
+	hw->hw_info1 = cpu_to_hc32(ehci, info1);
+	hw->hw_info2 = cpu_to_hc32(ehci, info2);
 	usb_settoggle (urb->dev, usb_pipeendpoint (urb->pipe), !is_input, 1);
 	qh_refresh (ehci, qh);
 	return qh;
@@ -933,11 +939,11 @@
 
 	/* splice right after start */
 	qh->qh_next = head->qh_next;
-	qh->hw_next = head->hw_next;
+	qh->hw->hw_next = head->hw->hw_next;
 	wmb ();
 
 	head->qh_next.qh = qh;
-	head->hw_next = dma;
+	head->hw->hw_next = dma;
 
 	qh_get(qh);
 	qh->xacterrs = 0;
@@ -984,7 +990,7 @@
 
                         /* usb_reset_device() briefly reverts to address 0 */
                         if (usb_pipedevice (urb->pipe) == 0)
-                                qh->hw_info1 &= ~qh_addr_mask;
+				qh->hw->hw_info1 &= ~qh_addr_mask;
 		}
 
 		/* just one way to queue requests: swap with the dummy qtd.
@@ -1169,7 +1175,7 @@
 	while (prev->qh_next.qh != qh)
 		prev = prev->qh_next.qh;
 
-	prev->hw_next = qh->hw_next;
+	prev->hw->hw_next = qh->hw->hw_next;
 	prev->qh_next = qh->qh_next;
 	wmb ();