cciss: add performant mode support for Stars/Sirius

Add a mode of controller operation called Performant Mode.  Even though
cciss has been deprecated in favor of hpsa there are new controllers due
out next year that HP must support in older vendor distros.  Vendors
require all fixes/features be upstream.  These new controllers support
only 16 commands in simple mode but support up to 1024 in performant mode.
This requires us to add this support at this late date.

The performant mode transport minimizes host PCI accesses by performinf
many completions per read.  PCI writes are posted so the host can write
then immediately get off the bus not waiting for the writwe to complete to
the target.  In the context of performant mode the host read out to a
controller pulls all posted writes into host memory ensuring the reply
queue is coherent.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index cd830cb..08a2e61 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -206,6 +206,11 @@
 static void cciss_free_gendisk(ctlr_info_t *h, int drv_index);
 static void cciss_free_drive_info(ctlr_info_t *h, int drv_index);
 
+/* performant mode helper functions */
+static void  calc_bucket_map(int *bucket, int num_buckets, int nsgs,
+				int *bucket_map);
+static void cciss_put_controller_into_performant_mode(ctlr_info_t *h);
+
 #ifdef CONFIG_PROC_FS
 static void cciss_procinit(int i);
 #else
@@ -231,6 +236,16 @@
 	.revalidate_disk = cciss_revalidate,
 };
 
+/* set_performant_mode: Modify the tag for cciss performant
+ * set bit 0 for pull model, bits 3-1 for block fetch
+ * register number
+ */
+static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
+{
+	if (likely(h->transMethod == CFGTBL_Trans_Performant))
+		c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
+}
+
 /*
  * Enqueuing and dequeuing functions for cmdlists.
  */
@@ -261,6 +276,7 @@
 	CommandList_struct *c)
 {
 	unsigned long flags;
+	set_performant_mode(h, c);
 	spin_lock_irqsave(&h->lock, flags);
 	addQ(&h->reqQ, c);
 	h->Qdepth++;
@@ -350,6 +366,28 @@
 
 #ifdef CONFIG_PROC_FS
 
+static inline u32 next_command(ctlr_info_t *h)
+{
+	u32 a;
+
+	if (unlikely(h->transMethod != CFGTBL_Trans_Performant))
+		return h->access.command_completed(h);
+
+	if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
+		a = *(h->reply_pool_head); /* Next cmd in ring buffer */
+		(h->reply_pool_head)++;
+		h->commands_outstanding--;
+	} else {
+		a = FIFO_EMPTY;
+	}
+	/* Check for wraparound */
+	if (h->reply_pool_head == (h->reply_pool + h->max_commands)) {
+		h->reply_pool_head = h->reply_pool;
+		h->reply_pool_wraparound ^= 1;
+	}
+	return a;
+}
+
 /*
  * Report information about this controller.
  */
@@ -377,7 +415,7 @@
 		h->product_name,
 		(unsigned long)h->board_id,
 		h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
-		h->firm_ver[3], (unsigned int)h->intr[SIMPLE_MODE_INT],
+		h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT],
 		h->num_luns,
 		h->Qdepth, h->commands_outstanding,
 		h->maxQsinceinit, h->max_outstanding, h->maxSG);
@@ -3126,13 +3164,13 @@
 
 static inline u32 cciss_tag_contains_index(u32 tag)
 {
-#define DIRECT_LOOKUP_BIT 0x04
+#define DIRECT_LOOKUP_BIT 0x10
 	return tag & DIRECT_LOOKUP_BIT;
 }
 
 static inline u32 cciss_tag_to_index(u32 tag)
 {
-#define DIRECT_LOOKUP_SHIFT 3
+#define DIRECT_LOOKUP_SHIFT 5
 	return tag >> DIRECT_LOOKUP_SHIFT;
 }
 
@@ -3262,9 +3300,12 @@
 			blk_rq_sectors(creq), seg, chained);
 #endif				/* CCISS_DEBUG */
 
-	c->Header.SGList = c->Header.SGTotal = seg + chained;
-	if (seg > h->max_cmd_sgentries)
+	c->Header.SGTotal = seg + chained;
+	if (seg <= h->max_cmd_sgentries)
+		c->Header.SGList = c->Header.SGTotal;
+	else
 		c->Header.SGList = h->max_cmd_sgentries;
+	set_performant_mode(h, c);
 
 	if (likely(blk_fs_request(creq))) {
 		if(h->cciss_read == CCISS_READ_10) {
@@ -3370,10 +3411,10 @@
 
 	tag_index = cciss_tag_to_index(raw_tag);
 	if (bad_tag(h, tag_index, raw_tag))
-		return get_next_completion(h);
+		return next_command(h);
 	c = h->cmd_pool + tag_index;
 	finish_cmd(h, c, raw_tag);
-	return get_next_completion(h);
+	return next_command(h);
 }
 
 /* process completion of a non-indexed command */
@@ -3390,11 +3431,11 @@
 		tag_masked = cciss_tag_discard_error_bits(tag);
 		if (busaddr_masked == tag_masked) {
 			finish_cmd(h, c, raw_tag);
-			return get_next_completion(h);
+			return next_command(h);
 		}
 	}
 	bad_tag(h, h->nr_cmds + 1, raw_tag);
-	return get_next_completion(h);
+	return next_command(h);
 }
 
 static irqreturn_t do_cciss_intx(int irq, void *dev_id)
@@ -3700,6 +3741,155 @@
 	return -1;
 }
 
+/* Fill in bucket_map[], given nsgs (the max number of
+ * scatter gather elements supported) and bucket[],
+ * which is an array of 8 integers.  The bucket[] array
+ * contains 8 different DMA transfer sizes (in 16
+ * byte increments) which the controller uses to fetch
+ * commands.  This function fills in bucket_map[], which
+ * maps a given number of scatter gather elements to one of
+ * the 8 DMA transfer sizes.  The point of it is to allow the
+ * controller to only do as much DMA as needed to fetch the
+ * command, with the DMA transfer size encoded in the lower
+ * bits of the command address.
+ */
+static void  calc_bucket_map(int bucket[], int num_buckets,
+	int nsgs, int *bucket_map)
+{
+	int i, j, b, size;
+
+	/* even a command with 0 SGs requires 4 blocks */
+#define MINIMUM_TRANSFER_BLOCKS 4
+#define NUM_BUCKETS 8
+	/* Note, bucket_map must have nsgs+1 entries. */
+	for (i = 0; i <= nsgs; i++) {
+		/* Compute size of a command with i SG entries */
+		size = i + MINIMUM_TRANSFER_BLOCKS;
+		b = num_buckets; /* Assume the biggest bucket */
+		/* Find the bucket that is just big enough */
+		for (j = 0; j < 8; j++) {
+			if (bucket[j] >= size) {
+				b = j;
+				break;
+			}
+		}
+		/* for a command with i SG entries, use bucket b. */
+		bucket_map[i] = b;
+	}
+}
+
+static void
+cciss_put_controller_into_performant_mode(ctlr_info_t *h)
+{
+	int l = 0;
+	__u32 trans_support;
+	__u32 trans_offset;
+			/*
+			 *  5 = 1 s/g entry or 4k
+			 *  6 = 2 s/g entry or 8k
+			 *  8 = 4 s/g entry or 16k
+			 * 10 = 6 s/g entry or 24k
+			 */
+	int bft[8] = { 5, 6, 8, 10, 12, 20, 28, MAXSGENTRIES + 4};
+	unsigned long register_value;
+
+	BUILD_BUG_ON(28 > MAXSGENTRIES + 4);
+
+	/* Attempt to put controller into performant mode if supported */
+	/* Does board support performant mode? */
+	trans_support = readl(&(h->cfgtable->TransportSupport));
+	if (!(trans_support & PERFORMANT_MODE))
+		return;
+
+	printk(KERN_WARNING "cciss%d: Placing controller into "
+				"performant mode\n", h->ctlr);
+	/* Performant mode demands commands on a 32 byte boundary
+	 * pci_alloc_consistent aligns on page boundarys already.
+	 * Just need to check if divisible by 32
+	 */
+	if ((sizeof(CommandList_struct) % 32) != 0) {
+		printk(KERN_WARNING "%s %d %s\n",
+			"cciss info: command size[",
+			(int)sizeof(CommandList_struct),
+			"] not divisible by 32, no performant mode..\n");
+		return;
+	}
+
+	/* Performant mode ring buffer and supporting data structures */
+	h->reply_pool = (__u64 *)pci_alloc_consistent(
+		h->pdev, h->max_commands * sizeof(__u64),
+		&(h->reply_pool_dhandle));
+
+	/* Need a block fetch table for performant mode */
+	h->blockFetchTable = kmalloc(((h->maxsgentries+1) *
+		sizeof(__u32)), GFP_KERNEL);
+
+	if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
+		goto clean_up;
+
+	h->reply_pool_wraparound = 1; /* spec: init to 1 */
+
+	/* Controller spec: zero out this buffer. */
+	memset(h->reply_pool, 0, h->max_commands * sizeof(__u64));
+	h->reply_pool_head = h->reply_pool;
+
+	trans_offset = readl(&(h->cfgtable->TransMethodOffset));
+	calc_bucket_map(bft, ARRAY_SIZE(bft), h->maxsgentries,
+				h->blockFetchTable);
+	writel(bft[0], &h->transtable->BlockFetch0);
+	writel(bft[1], &h->transtable->BlockFetch1);
+	writel(bft[2], &h->transtable->BlockFetch2);
+	writel(bft[3], &h->transtable->BlockFetch3);
+	writel(bft[4], &h->transtable->BlockFetch4);
+	writel(bft[5], &h->transtable->BlockFetch5);
+	writel(bft[6], &h->transtable->BlockFetch6);
+	writel(bft[7], &h->transtable->BlockFetch7);
+
+	/* size of controller ring buffer */
+	writel(h->max_commands, &h->transtable->RepQSize);
+	writel(1, &h->transtable->RepQCount);
+	writel(0, &h->transtable->RepQCtrAddrLow32);
+	writel(0, &h->transtable->RepQCtrAddrHigh32);
+	writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
+	writel(0, &h->transtable->RepQAddr0High32);
+	writel(CFGTBL_Trans_Performant,
+			&(h->cfgtable->HostWrite.TransportRequest));
+
+	h->transMethod = CFGTBL_Trans_Performant;
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	/* under certain very rare conditions, this can take awhile.
+	 * (e.g.: hot replace a failed 144GB drive in a RAID 5 set right
+	 * as we enter this code.) */
+	for (l = 0; l < MAX_CONFIG_WAIT; l++) {
+		register_value = readl(h->vaddr + SA5_DOORBELL);
+		if (!(register_value & CFGTBL_ChangeReq))
+			break;
+		/* delay and try again */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(10);
+	}
+	register_value = readl(&(h->cfgtable->TransportActive));
+	if (!(register_value & CFGTBL_Trans_Performant)) {
+		printk(KERN_WARNING "cciss: unable to get board into"
+					" performant mode\n");
+		return;
+	}
+
+	/* Change the access methods to the performant access methods */
+	h->access = SA5_performant_access;
+
+	return;
+clean_up:
+	kfree(h->blockFetchTable);
+	if (h->reply_pool)
+		pci_free_consistent(h->pdev,
+				h->max_commands * sizeof(__u64),
+				h->reply_pool,
+				h->reply_pool_dhandle);
+	return;
+
+} /* cciss_put_controller_into_performant_mode */
+
 /* If MSI/MSI-X is supported by the kernel we will try to enable it on
  * controllers that are capable. If not, we use IO-APIC mode.
  */
@@ -3749,7 +3939,7 @@
 default_int_mode:
 #endif				/* CONFIG_PCI_MSI */
 	/* if we get here we're going to use the default interrupt mode */
-	c->intr[SIMPLE_MODE_INT] = pdev->irq;
+	c->intr[PERF_MODE_INT] = pdev->irq;
 	return;
 }
 
@@ -3761,6 +3951,7 @@
 	__u32 cfg_base_addr;
 	__u64 cfg_base_addr_index;
 	int i, prod_index, err;
+	__u32 trans_offset;
 
 	subsystem_vendor_id = pdev->subsystem_vendor;
 	subsystem_device_id = pdev->subsystem_device;
@@ -3874,11 +4065,16 @@
 	c->cfgtable = remap_pci_mem(pci_resource_start(pdev,
 						       cfg_base_addr_index) +
 				    cfg_offset, sizeof(CfgTable_struct));
+	/* Find performant mode table. */
+	trans_offset = readl(&(c->cfgtable->TransMethodOffset));
+	c->transtable = remap_pci_mem(pci_resource_start(pdev,
+		cfg_base_addr_index) + cfg_offset+trans_offset,
+		sizeof(*c->transtable));
 	c->board_id = board_id;
 
-#ifdef CCISS_DEBUG
-	print_cfg_table(c->cfgtable);
-#endif				/* CCISS_DEBUG */
+	#ifdef CCISS_DEBUG
+		print_cfg_table(c->cfgtable);
+	#endif				/* CCISS_DEBUG */
 
 	/* Some controllers support Zero Memory Raid (ZMR).
 	 * When configured in ZMR mode the number of supported
@@ -3888,7 +4084,7 @@
 	 * are supported on the controller then subtract 4 to
 	 * leave a little room for ioctl calls.
 	 */
-	c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
+	c->max_commands = readl(&(c->cfgtable->MaxPerformantModeCommands));
 	c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
 
 	/*
@@ -3933,7 +4129,7 @@
 	 * kernels revealed a bug in the refetch if dom0 resides on a P600.
 	 */
 	if(board_id == 0x3225103C) {
-		__u32 dma_prefetch;
+			__u32 dma_prefetch;
 		__u32 dma_refetch;
 		dma_prefetch = readl(c->vaddr + I2O_DMA1_CFG);
 		dma_prefetch |= 0x8000;
@@ -3944,38 +4140,8 @@
 	}
 
 #ifdef CCISS_DEBUG
-	printk("Trying to put board into Simple mode\n");
+	printk(KERN_WARNING "Trying to put board into Performant mode\n");
 #endif				/* CCISS_DEBUG */
-	c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
-	/* Update the field, and then ring the doorbell */
-	writel(CFGTBL_Trans_Simple, &(c->cfgtable->HostWrite.TransportRequest));
-	writel(CFGTBL_ChangeReq, c->vaddr + SA5_DOORBELL);
-
-	/* under certain very rare conditions, this can take awhile.
-	 * (e.g.: hot replace a failed 144GB drive in a RAID 5 set right
-	 * as we enter this code.) */
-	for (i = 0; i < MAX_CONFIG_WAIT; i++) {
-		if (!(readl(c->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
-			break;
-		/* delay and try again */
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(msecs_to_jiffies(1));
-	}
-
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "I counter got to %d %x\n", i,
-	       readl(c->vaddr + SA5_DOORBELL));
-#endif				/* CCISS_DEBUG */
-#ifdef CCISS_DEBUG
-	print_cfg_table(c->cfgtable);
-#endif				/* CCISS_DEBUG */
-
-	if (!(readl(&(c->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
-		printk(KERN_WARNING "cciss: unable to get board into"
-		       " simple mode\n");
-		err = -ENODEV;
-		goto err_out_free_res;
-	}
 	return 0;
 
 err_out_free_res:
@@ -3984,6 +4150,7 @@
 	 * Smart Array controllers that pci_enable_device does not undo
 	 */
 	pci_release_regions(pdev);
+	cciss_put_controller_into_performant_mode(c);
 	return err;
 }
 
@@ -4260,7 +4427,6 @@
 	i = alloc_cciss_hba();
 	if (i < 0)
 		return -1;
-
 	hba[i]->busy_initializing = 1;
 	INIT_HLIST_HEAD(&hba[i]->cmpQ);
 	INIT_HLIST_HEAD(&hba[i]->reqQ);
@@ -4327,7 +4493,7 @@
 
 	printk(KERN_INFO "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
 	       hba[i]->devname, pdev->device, pci_name(pdev),
-	       hba[i]->intr[SIMPLE_MODE_INT], dac ? "" : " not");
+	       hba[i]->intr[PERF_MODE_INT], dac ? "" : " not");
 
 	hba[i]->cmd_pool_bits =
 	    kmalloc(DIV_ROUND_UP(hba[i]->nr_cmds, BITS_PER_LONG)
@@ -4433,7 +4599,7 @@
 				    hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 				    hba[i]->errinfo_pool,
 				    hba[i]->errinfo_pool_dhandle);
-	free_irq(hba[i]->intr[SIMPLE_MODE_INT], hba[i]);
+	free_irq(hba[i]->intr[PERF_MODE_INT], hba[i]);
 clean2:
 	unregister_blkdev(hba[i]->major, hba[i]->devname);
 clean1:
@@ -4475,7 +4641,7 @@
 		printk(KERN_WARNING "cciss%d: Error flushing cache\n",
 			h->ctlr);
 	h->access.set_intr_mask(h, CCISS_INTR_OFF);
-	free_irq(h->intr[2], h);
+	free_irq(h->intr[PERF_MODE_INT], h);
 }
 
 static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4575,7 +4741,6 @@
 	 * array of them, the size must be a multiple of 8 bytes.
 	 */
 	BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT);
-
 	printk(KERN_INFO DRIVER_NAME "\n");
 
 	err = bus_register(&cciss_bus_type);