USB: xhci: Add memory allocation for USB3 bulk streams.

Add support for allocating streams for USB 3.0 bulk endpoints.  See
Documentation/usb/bulk-streams.txt for more information about how and why
you would use streams.

When an endpoint has streams enabled, instead of having one ring where all
transfers are enqueued to the hardware, it has several rings.  The ring
dequeue pointer in the endpoint context is changed to point to a "Stream
Context Array".  This is basically an array of pointers to transfer rings,
one for each stream ID that the driver wants to use.

The Stream Context Array size must be a power of two, and host controllers
can place a limit on the size of the array (4 to 2^16 entries).  These
two facts make calculating the size of the Stream Context Array and the
number of entries actually used by the driver a bit tricky.

Besides the Stream Context Array and rings for all the stream IDs, we need
one more data structure.  The xHCI hardware will not tell us which stream
ID a transfer event was for, but it will give us the slot ID, endpoint
index, and physical address for the TRB that caused the event.  For every
endpoint on a device, add a radix tree to map physical TRB addresses to
virtual segments within a stream ring.

Keep track of whether an endpoint is transitioning to using streams, and
don't enqueue any URBs while that's taking place.  Refuse to transition an
endpoint to streams if there are already URBs enqueued for that endpoint.

We need to make sure that freeing streams does not fail, since a driver's
disconnect() function may attempt to do this, and it cannot fail.
Pre-allocate the command structure used to issue the Configure Endpoint
command, and reserve space on the command ring for each stream endpoint.
This may be a bit overkill, but it is permissible for the driver to
allocate all streams in one call and free them in multiple calls.  (It is
not advised, however, since it is a waste of resources and time.)

Even with the memory and ring room pre-allocated, freeing streams can
still fail because the xHC rejects the configure endpoint command.  It is
valid (by the xHCI 0.96 spec) to return a "Bandwidth Error" or a "Resource
Error" for a configure endpoint command.  We should never see a Bandwidth
Error, since bulk endpoints do not effect the reserved bandwidth.  The
host controller can still return a Resource Error, but it's improbable
since the xHC would be going from a more resource-intensive configuration
(streams) to a less resource-intensive configuration (no streams).

If the xHC returns a Resource Error, the endpoint will be stuck with
streams and will be unusable for drivers.  It's an unavoidable consequence
of broken host controller hardware.

Includes bug fixes from the original patch, contributed by
John Youn <John.Youn@synopsys.com> and Andy Green <AGreen@PLXTech.com>

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index d64f572..d299ffa 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -304,6 +304,350 @@
 		(ctx->bytes + (ep_index * CTX_SIZE(xhci->hcc_params)));
 }
 
+
+/***************** Streams structures manipulation *************************/
+
+void xhci_free_stream_ctx(struct xhci_hcd *xhci,
+		unsigned int num_stream_ctxs,
+		struct xhci_stream_ctx *stream_ctx, dma_addr_t dma)
+{
+	struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller);
+
+	if (num_stream_ctxs > MEDIUM_STREAM_ARRAY_SIZE)
+		pci_free_consistent(pdev,
+				sizeof(struct xhci_stream_ctx)*num_stream_ctxs,
+				stream_ctx, dma);
+	else if (num_stream_ctxs <= SMALL_STREAM_ARRAY_SIZE)
+		return dma_pool_free(xhci->small_streams_pool,
+				stream_ctx, dma);
+	else
+		return dma_pool_free(xhci->medium_streams_pool,
+				stream_ctx, dma);
+}
+
+/*
+ * The stream context array for each endpoint with bulk streams enabled can
+ * vary in size, based on:
+ *  - how many streams the endpoint supports,
+ *  - the maximum primary stream array size the host controller supports,
+ *  - and how many streams the device driver asks for.
+ *
+ * The stream context array must be a power of 2, and can be as small as
+ * 64 bytes or as large as 1MB.
+ */
+struct xhci_stream_ctx *xhci_alloc_stream_ctx(struct xhci_hcd *xhci,
+		unsigned int num_stream_ctxs, dma_addr_t *dma,
+		gfp_t mem_flags)
+{
+	struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller);
+
+	if (num_stream_ctxs > MEDIUM_STREAM_ARRAY_SIZE)
+		return pci_alloc_consistent(pdev,
+				sizeof(struct xhci_stream_ctx)*num_stream_ctxs,
+				dma);
+	else if (num_stream_ctxs <= SMALL_STREAM_ARRAY_SIZE)
+		return dma_pool_alloc(xhci->small_streams_pool,
+				mem_flags, dma);
+	else
+		return dma_pool_alloc(xhci->medium_streams_pool,
+				mem_flags, dma);
+}
+
+#ifdef CONFIG_USB_XHCI_HCD_DEBUGGING
+struct xhci_ring *dma_to_stream_ring(
+		struct xhci_stream_info *stream_info,
+		u64 address)
+{
+	return radix_tree_lookup(&stream_info->trb_address_map,
+			address >> SEGMENT_SHIFT);
+}
+#endif	/* CONFIG_USB_XHCI_HCD_DEBUGGING */
+
+#ifdef CONFIG_USB_XHCI_HCD_DEBUGGING
+static int xhci_test_radix_tree(struct xhci_hcd *xhci,
+		unsigned int num_streams,
+		struct xhci_stream_info *stream_info)
+{
+	u32 cur_stream;
+	struct xhci_ring *cur_ring;
+	u64 addr;
+
+	for (cur_stream = 1; cur_stream < num_streams; cur_stream++) {
+		struct xhci_ring *mapped_ring;
+		int trb_size = sizeof(union xhci_trb);
+
+		cur_ring = stream_info->stream_rings[cur_stream];
+		for (addr = cur_ring->first_seg->dma;
+				addr < cur_ring->first_seg->dma + SEGMENT_SIZE;
+				addr += trb_size) {
+			mapped_ring = dma_to_stream_ring(stream_info, addr);
+			if (cur_ring != mapped_ring) {
+				xhci_warn(xhci, "WARN: DMA address 0x%08llx "
+						"didn't map to stream ID %u; "
+						"mapped to ring %p\n",
+						(unsigned long long) addr,
+						cur_stream,
+						mapped_ring);
+				return -EINVAL;
+			}
+		}
+		/* One TRB after the end of the ring segment shouldn't return a
+		 * pointer to the current ring (although it may be a part of a
+		 * different ring).
+		 */
+		mapped_ring = dma_to_stream_ring(stream_info, addr);
+		if (mapped_ring != cur_ring) {
+			/* One TRB before should also fail */
+			addr = cur_ring->first_seg->dma - trb_size;
+			mapped_ring = dma_to_stream_ring(stream_info, addr);
+		}
+		if (mapped_ring == cur_ring) {
+			xhci_warn(xhci, "WARN: Bad DMA address 0x%08llx "
+					"mapped to valid stream ID %u; "
+					"mapped ring = %p\n",
+					(unsigned long long) addr,
+					cur_stream,
+					mapped_ring);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+#endif	/* CONFIG_USB_XHCI_HCD_DEBUGGING */
+
+/*
+ * Change an endpoint's internal structure so it supports stream IDs.  The
+ * number of requested streams includes stream 0, which cannot be used by device
+ * drivers.
+ *
+ * The number of stream contexts in the stream context array may be bigger than
+ * the number of streams the driver wants to use.  This is because the number of
+ * stream context array entries must be a power of two.
+ *
+ * We need a radix tree for mapping physical addresses of TRBs to which stream
+ * ID they belong to.  We need to do this because the host controller won't tell
+ * us which stream ring the TRB came from.  We could store the stream ID in an
+ * event data TRB, but that doesn't help us for the cancellation case, since the
+ * endpoint may stop before it reaches that event data TRB.
+ *
+ * The radix tree maps the upper portion of the TRB DMA address to a ring
+ * segment that has the same upper portion of DMA addresses.  For example, say I
+ * have segments of size 1KB, that are always 64-byte aligned.  A segment may
+ * start at 0x10c91000 and end at 0x10c913f0.  If I use the upper 10 bits, the
+ * key to the stream ID is 0x43244.  I can use the DMA address of the TRB to
+ * pass the radix tree a key to get the right stream ID:
+ *
+ * 	0x10c90fff >> 10 = 0x43243
+ * 	0x10c912c0 >> 10 = 0x43244
+ * 	0x10c91400 >> 10 = 0x43245
+ *
+ * Obviously, only those TRBs with DMA addresses that are within the segment
+ * will make the radix tree return the stream ID for that ring.
+ *
+ * Caveats for the radix tree:
+ *
+ * The radix tree uses an unsigned long as a key pair.  On 32-bit systems, an
+ * unsigned long will be 32-bits; on a 64-bit system an unsigned long will be
+ * 64-bits.  Since we only request 32-bit DMA addresses, we can use that as the
+ * key on 32-bit or 64-bit systems (it would also be fine if we asked for 64-bit
+ * PCI DMA addresses on a 64-bit system).  There might be a problem on 32-bit
+ * extended systems (where the DMA address can be bigger than 32-bits),
+ * if we allow the PCI dma mask to be bigger than 32-bits.  So don't do that.
+ */
+struct xhci_stream_info *xhci_alloc_stream_info(struct xhci_hcd *xhci,
+		unsigned int num_stream_ctxs,
+		unsigned int num_streams, gfp_t mem_flags)
+{
+	struct xhci_stream_info *stream_info;
+	u32 cur_stream;
+	struct xhci_ring *cur_ring;
+	unsigned long key;
+	u64 addr;
+	int ret;
+
+	xhci_dbg(xhci, "Allocating %u streams and %u "
+			"stream context array entries.\n",
+			num_streams, num_stream_ctxs);
+	if (xhci->cmd_ring_reserved_trbs == MAX_RSVD_CMD_TRBS) {
+		xhci_dbg(xhci, "Command ring has no reserved TRBs available\n");
+		return NULL;
+	}
+	xhci->cmd_ring_reserved_trbs++;
+
+	stream_info = kzalloc(sizeof(struct xhci_stream_info), mem_flags);
+	if (!stream_info)
+		goto cleanup_trbs;
+
+	stream_info->num_streams = num_streams;
+	stream_info->num_stream_ctxs = num_stream_ctxs;
+
+	/* Initialize the array of virtual pointers to stream rings. */
+	stream_info->stream_rings = kzalloc(
+			sizeof(struct xhci_ring *)*num_streams,
+			mem_flags);
+	if (!stream_info->stream_rings)
+		goto cleanup_info;
+
+	/* Initialize the array of DMA addresses for stream rings for the HW. */
+	stream_info->stream_ctx_array = xhci_alloc_stream_ctx(xhci,
+			num_stream_ctxs, &stream_info->ctx_array_dma,
+			mem_flags);
+	if (!stream_info->stream_ctx_array)
+		goto cleanup_ctx;
+	memset(stream_info->stream_ctx_array, 0,
+			sizeof(struct xhci_stream_ctx)*num_stream_ctxs);
+
+	/* Allocate everything needed to free the stream rings later */
+	stream_info->free_streams_command =
+		xhci_alloc_command(xhci, true, true, mem_flags);
+	if (!stream_info->free_streams_command)
+		goto cleanup_ctx;
+
+	INIT_RADIX_TREE(&stream_info->trb_address_map, GFP_ATOMIC);
+
+	/* Allocate rings for all the streams that the driver will use,
+	 * and add their segment DMA addresses to the radix tree.
+	 * Stream 0 is reserved.
+	 */
+	for (cur_stream = 1; cur_stream < num_streams; cur_stream++) {
+		stream_info->stream_rings[cur_stream] =
+			xhci_ring_alloc(xhci, 1, true, mem_flags);
+		cur_ring = stream_info->stream_rings[cur_stream];
+		if (!cur_ring)
+			goto cleanup_rings;
+		/* Set deq ptr, cycle bit, and stream context type */
+		addr = cur_ring->first_seg->dma |
+			SCT_FOR_CTX(SCT_PRI_TR) |
+			cur_ring->cycle_state;
+		stream_info->stream_ctx_array[cur_stream].stream_ring = addr;
+		xhci_dbg(xhci, "Setting stream %d ring ptr to 0x%08llx\n",
+				cur_stream, (unsigned long long) addr);
+
+		key = (unsigned long)
+			(cur_ring->first_seg->dma >> SEGMENT_SHIFT);
+		ret = radix_tree_insert(&stream_info->trb_address_map,
+				key, cur_ring);
+		if (ret) {
+			xhci_ring_free(xhci, cur_ring);
+			stream_info->stream_rings[cur_stream] = NULL;
+			goto cleanup_rings;
+		}
+	}
+	/* Leave the other unused stream ring pointers in the stream context
+	 * array initialized to zero.  This will cause the xHC to give us an
+	 * error if the device asks for a stream ID we don't have setup (if it
+	 * was any other way, the host controller would assume the ring is
+	 * "empty" and wait forever for data to be queued to that stream ID).
+	 */
+#if XHCI_DEBUG
+	/* Do a little test on the radix tree to make sure it returns the
+	 * correct values.
+	 */
+	if (xhci_test_radix_tree(xhci, num_streams, stream_info))
+		goto cleanup_rings;
+#endif
+
+	return stream_info;
+
+cleanup_rings:
+	for (cur_stream = 1; cur_stream < num_streams; cur_stream++) {
+		cur_ring = stream_info->stream_rings[cur_stream];
+		if (cur_ring) {
+			addr = cur_ring->first_seg->dma;
+			radix_tree_delete(&stream_info->trb_address_map,
+					addr >> SEGMENT_SHIFT);
+			xhci_ring_free(xhci, cur_ring);
+			stream_info->stream_rings[cur_stream] = NULL;
+		}
+	}
+	xhci_free_command(xhci, stream_info->free_streams_command);
+cleanup_ctx:
+	kfree(stream_info->stream_rings);
+cleanup_info:
+	kfree(stream_info);
+cleanup_trbs:
+	xhci->cmd_ring_reserved_trbs--;
+	return NULL;
+}
+/*
+ * Sets the MaxPStreams field and the Linear Stream Array field.
+ * Sets the dequeue pointer to the stream context array.
+ */
+void xhci_setup_streams_ep_input_ctx(struct xhci_hcd *xhci,
+		struct xhci_ep_ctx *ep_ctx,
+		struct xhci_stream_info *stream_info)
+{
+	u32 max_primary_streams;
+	/* MaxPStreams is the number of stream context array entries, not the
+	 * number we're actually using.  Must be in 2^(MaxPstreams + 1) format.
+	 * fls(0) = 0, fls(0x1) = 1, fls(0x10) = 2, fls(0x100) = 3, etc.
+	 */
+	max_primary_streams = fls(stream_info->num_stream_ctxs) - 2;
+	xhci_dbg(xhci, "Setting number of stream ctx array entries to %u\n",
+			1 << (max_primary_streams + 1));
+	ep_ctx->ep_info &= ~EP_MAXPSTREAMS_MASK;
+	ep_ctx->ep_info |= EP_MAXPSTREAMS(max_primary_streams);
+	ep_ctx->ep_info |= EP_HAS_LSA;
+	ep_ctx->deq  = stream_info->ctx_array_dma;
+}
+
+/*
+ * Sets the MaxPStreams field and the Linear Stream Array field to 0.
+ * Reinstalls the "normal" endpoint ring (at its previous dequeue mark,
+ * not at the beginning of the ring).
+ */
+void xhci_setup_no_streams_ep_input_ctx(struct xhci_hcd *xhci,
+		struct xhci_ep_ctx *ep_ctx,
+		struct xhci_virt_ep *ep)
+{
+	dma_addr_t addr;
+	ep_ctx->ep_info &= ~EP_MAXPSTREAMS_MASK;
+	ep_ctx->ep_info &= ~EP_HAS_LSA;
+	addr = xhci_trb_virt_to_dma(ep->ring->deq_seg, ep->ring->dequeue);
+	ep_ctx->deq  = addr | ep->ring->cycle_state;
+}
+
+/* Frees all stream contexts associated with the endpoint,
+ *
+ * Caller should fix the endpoint context streams fields.
+ */
+void xhci_free_stream_info(struct xhci_hcd *xhci,
+		struct xhci_stream_info *stream_info)
+{
+	int cur_stream;
+	struct xhci_ring *cur_ring;
+	dma_addr_t addr;
+
+	if (!stream_info)
+		return;
+
+	for (cur_stream = 1; cur_stream < stream_info->num_streams;
+			cur_stream++) {
+		cur_ring = stream_info->stream_rings[cur_stream];
+		if (cur_ring) {
+			addr = cur_ring->first_seg->dma;
+			radix_tree_delete(&stream_info->trb_address_map,
+					addr >> SEGMENT_SHIFT);
+			xhci_ring_free(xhci, cur_ring);
+			stream_info->stream_rings[cur_stream] = NULL;
+		}
+	}
+	xhci_free_command(xhci, stream_info->free_streams_command);
+	xhci->cmd_ring_reserved_trbs--;
+	if (stream_info->stream_ctx_array)
+		xhci_free_stream_ctx(xhci,
+				stream_info->num_stream_ctxs,
+				stream_info->stream_ctx_array,
+				stream_info->ctx_array_dma);
+
+	if (stream_info)
+		kfree(stream_info->stream_rings);
+	kfree(stream_info);
+}
+
+
+/***************** Device context manipulation *************************/
+
 static void xhci_init_endpoint_timer(struct xhci_hcd *xhci,
 		struct xhci_virt_ep *ep)
 {
@@ -328,9 +672,13 @@
 	if (!dev)
 		return;
 
-	for (i = 0; i < 31; ++i)
+	for (i = 0; i < 31; ++i) {
 		if (dev->eps[i].ring)
 			xhci_ring_free(xhci, dev->eps[i].ring);
+		if (dev->eps[i].stream_info)
+			xhci_free_stream_info(xhci,
+					dev->eps[i].stream_info);
+	}
 
 	if (dev->ring_cache) {
 		for (i = 0; i < dev->num_rings_cached; i++)
@@ -655,6 +1003,9 @@
 	return max_packet * (max_burst + 1);
 }
 
+/* Set up an endpoint with one ring segment.  Do not allocate stream rings.
+ * Drivers will have to call usb_alloc_streams() to do that.
+ */
 int xhci_endpoint_init(struct xhci_hcd *xhci,
 		struct xhci_virt_device *virt_dev,
 		struct usb_device *udev,
@@ -1003,6 +1354,16 @@
 	xhci->device_pool = NULL;
 	xhci_dbg(xhci, "Freed device context pool\n");
 
+	if (xhci->small_streams_pool)
+		dma_pool_destroy(xhci->small_streams_pool);
+	xhci->small_streams_pool = NULL;
+	xhci_dbg(xhci, "Freed small stream array pool\n");
+
+	if (xhci->medium_streams_pool)
+		dma_pool_destroy(xhci->medium_streams_pool);
+	xhci->medium_streams_pool = NULL;
+	xhci_dbg(xhci, "Freed medium stream array pool\n");
+
 	xhci_write_64(xhci, 0, &xhci->op_regs->dcbaa_ptr);
 	if (xhci->dcbaa)
 		pci_free_consistent(pdev, sizeof(*xhci->dcbaa),
@@ -1239,6 +1600,22 @@
 	if (!xhci->segment_pool || !xhci->device_pool)
 		goto fail;
 
+	/* Linear stream context arrays don't have any boundary restrictions,
+	 * and only need to be 16-byte aligned.
+	 */
+	xhci->small_streams_pool =
+		dma_pool_create("xHCI 256 byte stream ctx arrays",
+			dev, SMALL_STREAM_ARRAY_SIZE, 16, 0);
+	xhci->medium_streams_pool =
+		dma_pool_create("xHCI 1KB stream ctx arrays",
+			dev, MEDIUM_STREAM_ARRAY_SIZE, 16, 0);
+	/* Any stream context array bigger than MEDIUM_STREAM_ARRAY_SIZE
+	 * will be allocated with pci_alloc_consistent()
+	 */
+
+	if (!xhci->small_streams_pool || !xhci->medium_streams_pool)
+		goto fail;
+
 	/* Set up the command ring to have one segments for now. */
 	xhci->cmd_ring = xhci_ring_alloc(xhci, 1, true, flags);
 	if (!xhci->cmd_ring)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index c1359ed..a14f657 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -323,6 +323,10 @@
 	ep_state = ep->ep_state;
 	/* Don't ring the doorbell for this endpoint if there are pending
 	 * cancellations because the we don't want to interrupt processing.
+	 * We don't want to restart any stream rings if there's a set dequeue
+	 * pointer command pending because the device can choose to start any
+	 * stream once the endpoint is on the HW schedule.
+	 * FIXME - check all the stream rings for pending cancellations.
 	 */
 	if (!(ep_state & EP_HALT_PENDING) && !(ep_state & SET_DEQ_PENDING)
 			&& !(ep_state & EP_HALTED)) {
@@ -916,8 +920,9 @@
 		 * Configure endpoint commands can come from the USB core
 		 * configuration or alt setting changes, or because the HW
 		 * needed an extra configure endpoint command after a reset
-		 * endpoint command.  In the latter case, the xHCI driver is
-		 * not waiting on the configure endpoint command.
+		 * endpoint command or streams were being configured.
+		 * If the command was for a halted endpoint, the xHCI driver
+		 * is not waiting on the configure endpoint command.
 		 */
 		ctrl_ctx = xhci_get_input_control_ctx(xhci,
 				virt_dev->in_ctx);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 077dfcd..2e370fe 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
@@ -726,8 +727,21 @@
 		spin_lock_irqsave(&xhci->lock, flags);
 		if (xhci->xhc_state & XHCI_STATE_DYING)
 			goto dying;
-		ret = xhci_queue_bulk_tx(xhci, GFP_ATOMIC, urb,
-				slot_id, ep_index);
+		if (xhci->devs[slot_id]->eps[ep_index].ep_state &
+				EP_GETTING_STREAMS) {
+			xhci_warn(xhci, "WARN: Can't enqueue URB while bulk ep "
+					"is transitioning to using streams.\n");
+			ret = -EINVAL;
+		} else if (xhci->devs[slot_id]->eps[ep_index].ep_state &
+				EP_GETTING_NO_STREAMS) {
+			xhci_warn(xhci, "WARN: Can't enqueue URB while bulk ep "
+					"is transitioning to "
+					"not having streams.\n");
+			ret = -EINVAL;
+		} else {
+			ret = xhci_queue_bulk_tx(xhci, GFP_ATOMIC, urb,
+					slot_id, ep_index);
+		}
 		spin_unlock_irqrestore(&xhci->lock, flags);
 	} else if (usb_endpoint_xfer_int(&urb->ep->desc)) {
 		spin_lock_irqsave(&xhci->lock, flags);
@@ -1446,6 +1460,387 @@
 		xhci_warn(xhci, "FIXME allocate a new ring segment\n");
 }
 
+static int xhci_check_streams_endpoint(struct xhci_hcd *xhci,
+		struct usb_device *udev, struct usb_host_endpoint *ep,
+		unsigned int slot_id)
+{
+	int ret;
+	unsigned int ep_index;
+	unsigned int ep_state;
+
+	if (!ep)
+		return -EINVAL;
+	ret = xhci_check_args(xhci_to_hcd(xhci), udev, ep, 1, __func__);
+	if (ret <= 0)
+		return -EINVAL;
+	if (!ep->ss_ep_comp) {
+		xhci_warn(xhci, "WARN: No SuperSpeed Endpoint Companion"
+				" descriptor for ep 0x%x\n",
+				ep->desc.bEndpointAddress);
+		return -EINVAL;
+	}
+	if (ep->ss_ep_comp->desc.bmAttributes == 0) {
+		xhci_warn(xhci, "WARN: SuperSpeed Endpoint Companion"
+				" descriptor for ep 0x%x does not support streams\n",
+				ep->desc.bEndpointAddress);
+		return -EINVAL;
+	}
+
+	ep_index = xhci_get_endpoint_index(&ep->desc);
+	ep_state = xhci->devs[slot_id]->eps[ep_index].ep_state;
+	if (ep_state & EP_HAS_STREAMS ||
+			ep_state & EP_GETTING_STREAMS) {
+		xhci_warn(xhci, "WARN: SuperSpeed bulk endpoint 0x%x "
+				"already has streams set up.\n",
+				ep->desc.bEndpointAddress);
+		xhci_warn(xhci, "Send email to xHCI maintainer and ask for "
+				"dynamic stream context array reallocation.\n");
+		return -EINVAL;
+	}
+	if (!list_empty(&xhci->devs[slot_id]->eps[ep_index].ring->td_list)) {
+		xhci_warn(xhci, "Cannot setup streams for SuperSpeed bulk "
+				"endpoint 0x%x; URBs are pending.\n",
+				ep->desc.bEndpointAddress);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void xhci_calculate_streams_entries(struct xhci_hcd *xhci,
+		unsigned int *num_streams, unsigned int *num_stream_ctxs)
+{
+	unsigned int max_streams;
+
+	/* The stream context array size must be a power of two */
+	*num_stream_ctxs = roundup_pow_of_two(*num_streams);
+	/*
+	 * Find out how many primary stream array entries the host controller
+	 * supports.  Later we may use secondary stream arrays (similar to 2nd
+	 * level page entries), but that's an optional feature for xHCI host
+	 * controllers. xHCs must support at least 4 stream IDs.
+	 */
+	max_streams = HCC_MAX_PSA(xhci->hcc_params);
+	if (*num_stream_ctxs > max_streams) {
+		xhci_dbg(xhci, "xHCI HW only supports %u stream ctx entries.\n",
+				max_streams);
+		*num_stream_ctxs = max_streams;
+		*num_streams = max_streams;
+	}
+}
+
+/* Returns an error code if one of the endpoint already has streams.
+ * This does not change any data structures, it only checks and gathers
+ * information.
+ */
+static int xhci_calculate_streams_and_bitmask(struct xhci_hcd *xhci,
+		struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps,
+		unsigned int *num_streams, u32 *changed_ep_bitmask)
+{
+	struct usb_host_ss_ep_comp *ss_ep_comp;
+	unsigned int max_streams;
+	unsigned int endpoint_flag;
+	int i;
+	int ret;
+
+	for (i = 0; i < num_eps; i++) {
+		ret = xhci_check_streams_endpoint(xhci, udev,
+				eps[i], udev->slot_id);
+		if (ret < 0)
+			return ret;
+
+		ss_ep_comp = eps[i]->ss_ep_comp;
+		max_streams = USB_SS_MAX_STREAMS(ss_ep_comp->desc.bmAttributes);
+		if (max_streams < (*num_streams - 1)) {
+			xhci_dbg(xhci, "Ep 0x%x only supports %u stream IDs.\n",
+					eps[i]->desc.bEndpointAddress,
+					max_streams);
+			*num_streams = max_streams+1;
+		}
+
+		endpoint_flag = xhci_get_endpoint_flag(&eps[i]->desc);
+		if (*changed_ep_bitmask & endpoint_flag)
+			return -EINVAL;
+		*changed_ep_bitmask |= endpoint_flag;
+	}
+	return 0;
+}
+
+static u32 xhci_calculate_no_streams_bitmask(struct xhci_hcd *xhci,
+		struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps)
+{
+	u32 changed_ep_bitmask = 0;
+	unsigned int slot_id;
+	unsigned int ep_index;
+	unsigned int ep_state;
+	int i;
+
+	slot_id = udev->slot_id;
+	if (!xhci->devs[slot_id])
+		return 0;
+
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		ep_state = xhci->devs[slot_id]->eps[ep_index].ep_state;
+		/* Are streams already being freed for the endpoint? */
+		if (ep_state & EP_GETTING_NO_STREAMS) {
+			xhci_warn(xhci, "WARN Can't disable streams for "
+					"endpoint 0x%x\n, "
+					"streams are being disabled already.",
+					eps[i]->desc.bEndpointAddress);
+			return 0;
+		}
+		/* Are there actually any streams to free? */
+		if (!(ep_state & EP_HAS_STREAMS) &&
+				!(ep_state & EP_GETTING_STREAMS)) {
+			xhci_warn(xhci, "WARN Can't disable streams for "
+					"endpoint 0x%x\n, "
+					"streams are already disabled!",
+					eps[i]->desc.bEndpointAddress);
+			xhci_warn(xhci, "WARN xhci_free_streams() called "
+					"with non-streams endpoint\n");
+			return 0;
+		}
+		changed_ep_bitmask |= xhci_get_endpoint_flag(&eps[i]->desc);
+	}
+	return changed_ep_bitmask;
+}
+
+/*
+ * The USB device drivers use this function (though the HCD interface in USB
+ * core) to prepare a set of bulk endpoints to use streams.  Streams are used to
+ * coordinate mass storage command queueing across multiple endpoints (basically
+ * a stream ID == a task ID).
+ *
+ * Setting up streams involves allocating the same size stream context array
+ * for each endpoint and issuing a configure endpoint command for all endpoints.
+ *
+ * Don't allow the call to succeed if one endpoint only supports one stream
+ * (which means it doesn't support streams at all).
+ *
+ * Drivers may get less stream IDs than they asked for, if the host controller
+ * hardware or endpoints claim they can't support the number of requested
+ * stream IDs.
+ */
+int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps,
+		unsigned int num_streams, gfp_t mem_flags)
+{
+	int i, ret;
+	struct xhci_hcd *xhci;
+	struct xhci_virt_device *vdev;
+	struct xhci_command *config_cmd;
+	unsigned int ep_index;
+	unsigned int num_stream_ctxs;
+	unsigned long flags;
+	u32 changed_ep_bitmask = 0;
+
+	if (!eps)
+		return -EINVAL;
+
+	/* Add one to the number of streams requested to account for
+	 * stream 0 that is reserved for xHCI usage.
+	 */
+	num_streams += 1;
+	xhci = hcd_to_xhci(hcd);
+	xhci_dbg(xhci, "Driver wants %u stream IDs (including stream 0).\n",
+			num_streams);
+
+	config_cmd = xhci_alloc_command(xhci, true, true, mem_flags);
+	if (!config_cmd) {
+		xhci_dbg(xhci, "Could not allocate xHCI command structure.\n");
+		return -ENOMEM;
+	}
+
+	/* Check to make sure all endpoints are not already configured for
+	 * streams.  While we're at it, find the maximum number of streams that
+	 * all the endpoints will support and check for duplicate endpoints.
+	 */
+	spin_lock_irqsave(&xhci->lock, flags);
+	ret = xhci_calculate_streams_and_bitmask(xhci, udev, eps,
+			num_eps, &num_streams, &changed_ep_bitmask);
+	if (ret < 0) {
+		xhci_free_command(xhci, config_cmd);
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		return ret;
+	}
+	if (num_streams <= 1) {
+		xhci_warn(xhci, "WARN: endpoints can't handle "
+				"more than one stream.\n");
+		xhci_free_command(xhci, config_cmd);
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		return -EINVAL;
+	}
+	vdev = xhci->devs[udev->slot_id];
+	/* Mark each endpoint as being in transistion, so
+	 * xhci_urb_enqueue() will reject all URBs.
+	 */
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		vdev->eps[ep_index].ep_state |= EP_GETTING_STREAMS;
+	}
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	/* Setup internal data structures and allocate HW data structures for
+	 * streams (but don't install the HW structures in the input context
+	 * until we're sure all memory allocation succeeded).
+	 */
+	xhci_calculate_streams_entries(xhci, &num_streams, &num_stream_ctxs);
+	xhci_dbg(xhci, "Need %u stream ctx entries for %u stream IDs.\n",
+			num_stream_ctxs, num_streams);
+
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		vdev->eps[ep_index].stream_info = xhci_alloc_stream_info(xhci,
+				num_stream_ctxs,
+				num_streams, mem_flags);
+		if (!vdev->eps[ep_index].stream_info)
+			goto cleanup;
+		/* Set maxPstreams in endpoint context and update deq ptr to
+		 * point to stream context array. FIXME
+		 */
+	}
+
+	/* Set up the input context for a configure endpoint command. */
+	for (i = 0; i < num_eps; i++) {
+		struct xhci_ep_ctx *ep_ctx;
+
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		ep_ctx = xhci_get_ep_ctx(xhci, config_cmd->in_ctx, ep_index);
+
+		xhci_endpoint_copy(xhci, config_cmd->in_ctx,
+				vdev->out_ctx, ep_index);
+		xhci_setup_streams_ep_input_ctx(xhci, ep_ctx,
+				vdev->eps[ep_index].stream_info);
+	}
+	/* Tell the HW to drop its old copy of the endpoint context info
+	 * and add the updated copy from the input context.
+	 */
+	xhci_setup_input_ctx_for_config_ep(xhci, config_cmd->in_ctx,
+			vdev->out_ctx, changed_ep_bitmask, changed_ep_bitmask);
+
+	/* Issue and wait for the configure endpoint command */
+	ret = xhci_configure_endpoint(xhci, udev, config_cmd,
+			false, false);
+
+	/* xHC rejected the configure endpoint command for some reason, so we
+	 * leave the old ring intact and free our internal streams data
+	 * structure.
+	 */
+	if (ret < 0)
+		goto cleanup;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		vdev->eps[ep_index].ep_state &= ~EP_GETTING_STREAMS;
+		xhci_dbg(xhci, "Slot %u ep ctx %u now has streams.\n",
+			 udev->slot_id, ep_index);
+		vdev->eps[ep_index].ep_state |= EP_HAS_STREAMS;
+	}
+	xhci_free_command(xhci, config_cmd);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	/* Subtract 1 for stream 0, which drivers can't use */
+	return num_streams - 1;
+
+cleanup:
+	/* If it didn't work, free the streams! */
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		xhci_free_stream_info(xhci, vdev->eps[ep_index].stream_info);
+		/* FIXME Unset maxPstreams in endpoint context and
+		 * update deq ptr to point to normal string ring.
+		 */
+		vdev->eps[ep_index].ep_state &= ~EP_GETTING_STREAMS;
+		vdev->eps[ep_index].ep_state &= ~EP_HAS_STREAMS;
+		xhci_endpoint_zero(xhci, vdev, eps[i]);
+	}
+	xhci_free_command(xhci, config_cmd);
+	return -ENOMEM;
+}
+
+/* Transition the endpoint from using streams to being a "normal" endpoint
+ * without streams.
+ *
+ * Modify the endpoint context state, submit a configure endpoint command,
+ * and free all endpoint rings for streams if that completes successfully.
+ */
+int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps,
+		gfp_t mem_flags)
+{
+	int i, ret;
+	struct xhci_hcd *xhci;
+	struct xhci_virt_device *vdev;
+	struct xhci_command *command;
+	unsigned int ep_index;
+	unsigned long flags;
+	u32 changed_ep_bitmask;
+
+	xhci = hcd_to_xhci(hcd);
+	vdev = xhci->devs[udev->slot_id];
+
+	/* Set up a configure endpoint command to remove the streams rings */
+	spin_lock_irqsave(&xhci->lock, flags);
+	changed_ep_bitmask = xhci_calculate_no_streams_bitmask(xhci,
+			udev, eps, num_eps);
+	if (changed_ep_bitmask == 0) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		return -EINVAL;
+	}
+
+	/* Use the xhci_command structure from the first endpoint.  We may have
+	 * allocated too many, but the driver may call xhci_free_streams() for
+	 * each endpoint it grouped into one call to xhci_alloc_streams().
+	 */
+	ep_index = xhci_get_endpoint_index(&eps[0]->desc);
+	command = vdev->eps[ep_index].stream_info->free_streams_command;
+	for (i = 0; i < num_eps; i++) {
+		struct xhci_ep_ctx *ep_ctx;
+
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		ep_ctx = xhci_get_ep_ctx(xhci, command->in_ctx, ep_index);
+		xhci->devs[udev->slot_id]->eps[ep_index].ep_state |=
+			EP_GETTING_NO_STREAMS;
+
+		xhci_endpoint_copy(xhci, command->in_ctx,
+				vdev->out_ctx, ep_index);
+		xhci_setup_no_streams_ep_input_ctx(xhci, ep_ctx,
+				&vdev->eps[ep_index]);
+	}
+	xhci_setup_input_ctx_for_config_ep(xhci, command->in_ctx,
+			vdev->out_ctx, changed_ep_bitmask, changed_ep_bitmask);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	/* Issue and wait for the configure endpoint command,
+	 * which must succeed.
+	 */
+	ret = xhci_configure_endpoint(xhci, udev, command,
+			false, true);
+
+	/* xHC rejected the configure endpoint command for some reason, so we
+	 * leave the streams rings intact.
+	 */
+	if (ret < 0)
+		return ret;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+	for (i = 0; i < num_eps; i++) {
+		ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+		xhci_free_stream_info(xhci, vdev->eps[ep_index].stream_info);
+		/* FIXME Unset maxPstreams in endpoint context and
+		 * update deq ptr to point to normal string ring.
+		 */
+		vdev->eps[ep_index].ep_state &= ~EP_GETTING_NO_STREAMS;
+		vdev->eps[ep_index].ep_state &= ~EP_HAS_STREAMS;
+	}
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	return 0;
+}
+
 /*
  * This submits a Reset Device Command, which will set the device state to 0,
  * set the device address to 0, and disable all the endpoints except the default
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index a7c4e11..7a9447c 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -117,7 +117,7 @@
 /* true: no secondary Stream ID Support */
 #define HCC_NSS(p)		((p) & (1 << 7))
 /* Max size for Primary Stream Arrays - 2^(n+1), where n is bits 12:15 */
-#define HCC_MAX_PSA		(1 << ((((p) >> 12) & 0xf) + 1))
+#define HCC_MAX_PSA(p)		(1 << ((((p) >> 12) & 0xf) + 1))
 /* Extended Capabilities pointer from PCI base - section 5.3.6 */
 #define HCC_EXT_CAPS(p)		XHCI_HCC_EXT_CAPS(p)
 
@@ -585,6 +585,10 @@
 /* Interval - period between requests to an endpoint - 125u increments. */
 #define EP_INTERVAL(p)		((p & 0xff) << 16)
 #define EP_INTERVAL_TO_UFRAMES(p)		(1 << (((p) >> 16) & 0xff))
+#define EP_MAXPSTREAMS_MASK	(0x1f << 10)
+#define EP_MAXPSTREAMS(p)	(((p) << 10) & EP_MAXPSTREAMS_MASK)
+/* Endpoint is set up with a Linear Stream Array (vs. Secondary Stream Array) */
+#define	EP_HAS_LSA		(1 << 15)
 
 /* ep_info2 bitmasks */
 /*
@@ -648,8 +652,50 @@
 /* add context bitmasks */
 #define	ADD_EP(x)	(0x1 << x)
 
+struct xhci_stream_ctx {
+	/* 64-bit stream ring address, cycle state, and stream type */
+	u64	stream_ring;
+	/* offset 0x14 - 0x1f reserved for HC internal use */
+	u32	reserved[2];
+};
+
+/* Stream Context Types (section 6.4.1) - bits 3:1 of stream ctx deq ptr */
+#define	SCT_FOR_CTX(p)		(((p) << 1) & 0x7)
+/* Secondary stream array type, dequeue pointer is to a transfer ring */
+#define	SCT_SEC_TR		0
+/* Primary stream array type, dequeue pointer is to a transfer ring */
+#define	SCT_PRI_TR		1
+/* Dequeue pointer is for a secondary stream array (SSA) with 8 entries */
+#define SCT_SSA_8		2
+#define SCT_SSA_16		3
+#define SCT_SSA_32		4
+#define SCT_SSA_64		5
+#define SCT_SSA_128		6
+#define SCT_SSA_256		7
+
+/* Assume no secondary streams for now */
+struct xhci_stream_info {
+	struct xhci_ring		**stream_rings;
+	/* Number of streams, including stream 0 (which drivers can't use) */
+	unsigned int			num_streams;
+	/* The stream context array may be bigger than
+	 * the number of streams the driver asked for
+	 */
+	struct xhci_stream_ctx		*stream_ctx_array;
+	unsigned int			num_stream_ctxs;
+	dma_addr_t			ctx_array_dma;
+	/* For mapping physical TRB addresses to segments in stream rings */
+	struct radix_tree_root		trb_address_map;
+	struct xhci_command		*free_streams_command;
+};
+
+#define	SMALL_STREAM_ARRAY_SIZE		256
+#define	MEDIUM_STREAM_ARRAY_SIZE	1024
+
 struct xhci_virt_ep {
 	struct xhci_ring		*ring;
+	/* Related to endpoints that are configured to use stream IDs only */
+	struct xhci_stream_info		*stream_info;
 	/* Temporary storage in case the configure endpoint command fails and we
 	 * have to restore the device state to the previous state
 	 */
@@ -658,6 +704,11 @@
 #define SET_DEQ_PENDING		(1 << 0)
 #define EP_HALTED		(1 << 1)	/* For stall handling */
 #define EP_HALT_PENDING		(1 << 2)	/* For URB cancellation */
+/* Transitioning the endpoint to using streams, don't enqueue URBs */
+#define EP_GETTING_STREAMS	(1 << 3)
+#define EP_HAS_STREAMS		(1 << 4)
+/* Transitioning the endpoint to not using streams, don't enqueue URBs */
+#define EP_GETTING_NO_STREAMS	(1 << 5)
 	/* ----  Related to URB cancellation ---- */
 	struct list_head	cancelled_td_list;
 	/* The TRB that was last reported in a stopped endpoint ring */
@@ -710,14 +761,6 @@
  */
 
 
-struct xhci_stream_ctx {
-	/* 64-bit stream ring address, cycle state, and stream type */
-	u64	stream_ring;
-	/* offset 0x14 - 0x1f reserved for HC internal use */
-	u32	reserved[2];
-};
-
-
 struct xhci_transfer_event {
 	/* 64-bit buffer address, or immediate data */
 	u64	buffer;
@@ -952,6 +995,10 @@
 /* Allow two commands + a link TRB, along with any reserved command TRBs */
 #define MAX_RSVD_CMD_TRBS	(TRBS_PER_SEGMENT - 3)
 #define SEGMENT_SIZE		(TRBS_PER_SEGMENT*16)
+/* SEGMENT_SHIFT should be log2(SEGMENT_SIZE).
+ * Change this if you change TRBS_PER_SEGMENT!
+ */
+#define SEGMENT_SHIFT		10
 /* TRB buffer pointers can't cross 64KB boundaries */
 #define TRB_MAX_BUFF_SHIFT		16
 #define TRB_MAX_BUFF_SIZE	(1 << TRB_MAX_BUFF_SHIFT)
@@ -1088,6 +1135,8 @@
 	/* DMA pools */
 	struct dma_pool	*device_pool;
 	struct dma_pool	*segment_pool;
+	struct dma_pool	*small_streams_pool;
+	struct dma_pool	*medium_streams_pool;
 
 #ifdef CONFIG_USB_XHCI_HCD_DEBUGGING
 	/* Poll the rings - for debugging */
@@ -1242,6 +1291,17 @@
 void xhci_free_or_cache_endpoint_ring(struct xhci_hcd *xhci,
 		struct xhci_virt_device *virt_dev,
 		unsigned int ep_index);
+struct xhci_stream_info *xhci_alloc_stream_info(struct xhci_hcd *xhci,
+		unsigned int num_stream_ctxs,
+		unsigned int num_streams, gfp_t flags);
+void xhci_free_stream_info(struct xhci_hcd *xhci,
+		struct xhci_stream_info *stream_info);
+void xhci_setup_streams_ep_input_ctx(struct xhci_hcd *xhci,
+		struct xhci_ep_ctx *ep_ctx,
+		struct xhci_stream_info *stream_info);
+void xhci_setup_no_streams_ep_input_ctx(struct xhci_hcd *xhci,
+		struct xhci_ep_ctx *ep_ctx,
+		struct xhci_virt_ep *ep);
 struct xhci_command *xhci_alloc_command(struct xhci_hcd *xhci,
 		bool allocate_in_ctx, bool allocate_completion,
 		gfp_t mem_flags);
@@ -1266,6 +1326,12 @@
 irqreturn_t xhci_irq(struct usb_hcd *hcd);
 int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev);
 void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev);
+int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps,
+		unsigned int num_streams, gfp_t mem_flags);
+int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev,
+		struct usb_host_endpoint **eps, unsigned int num_eps,
+		gfp_t mem_flags);
 int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev);
 int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev,
 			struct usb_tt *tt, gfp_t mem_flags);