qdio: support asynchronous delivery of storage blocks

This patch introduces support for asynchronous delivery of storage blocks for
Hipersockets. Upper layers may exploit this functionality to reuse SBALs for
which the delivery status is still pending.

Signed-off-by: Einar Lueck <elelueck@de.ibm.com>
Signed-off-by: Jan Glauber <jang@linux.vnet.ibm.com>
Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index 15c9762..3881e94 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -123,6 +123,40 @@
 };
 
 /**
+ * struct qaob - queue asynchronous operation block
+ * @res0: reserved parameters
+ * @res1: reserved parameter
+ * @res2: reserved parameter
+ * @res3: reserved parameter
+ * @aorc: asynchronous operation return code
+ * @flags: internal flags
+ * @cbtbs: control block type
+ * @sb_count: number of storage blocks
+ * @sba: storage block element addresses
+ * @dcount: size of storage block elements
+ * @user0: user defineable value
+ * @res4: reserved paramater
+ * @user1: user defineable value
+ * @user2: user defineable value
+ */
+struct qaob {
+	u64 res0[6];
+	u8 res1;
+	u8 res2;
+	u8 res3;
+	u8 aorc;
+	u8 flags;
+	u16 cbtbs;
+	u8 sb_count;
+	u64 sba[QDIO_MAX_ELEMENTS_PER_BUFFER];
+	u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER];
+	u64 user0;
+	u64 res4[2];
+	u64 user1;
+	u64 user2;
+} __attribute__ ((packed, aligned(256)));
+
+/**
  * struct slib - storage list information block (SLIB)
  * @nsliba: next SLIB address (if any)
  * @sla: SL address
@@ -225,6 +259,31 @@
 #define CHSC_AC2_DATA_DIV_AVAILABLE	0x0010
 #define CHSC_AC2_DATA_DIV_ENABLED	0x0002
 
+/**
+ * struct qdio_outbuf_state - SBAL related asynchronous operation information
+ *   (for communication with upper layer programs)
+ *   (only required for use with completion queues)
+ * @flags: flags indicating state of buffer
+ * @aob: pointer to QAOB used for the particular SBAL
+ * @user: pointer to upper layer program's state information related to SBAL
+ *        (stored in user1 data of QAOB)
+ */
+struct qdio_outbuf_state {
+	u8 flags;
+	struct qaob *aob;
+	void *user;
+};
+
+#define QDIO_OUTBUF_STATE_FLAG_NONE	0x00
+#define QDIO_OUTBUF_STATE_FLAG_PENDING	0x01
+
+#define CHSC_AC1_INITIATE_INPUTQ	0x80
+
+#define CHSC_AC2_DATA_DIV_AVAILABLE	0x0010
+#define CHSC_AC2_DATA_DIV_ENABLED	0x0002
+
+#define CHSC_AC3_FORMAT2_CQ_AVAILABLE	0x8000
+
 struct qdio_ssqd_desc {
 	u8 flags;
 	u8:8;
@@ -243,8 +302,7 @@
 	u64 sch_token;
 	u8 mro;
 	u8 mri;
-	u8:8;
-	u8 sbalic;
+	u16 qdioac3;
 	u16:16;
 	u8:8;
 	u8 mmwc;
@@ -280,9 +338,11 @@
  * @no_output_qs: number of output queues
  * @input_handler: handler to be called for input queues
  * @output_handler: handler to be called for output queues
+ * @queue_start_poll: polling handlers (one per input queue or NULL)
  * @int_parm: interruption parameter
  * @input_sbal_addr_array:  address of no_input_qs * 128 pointers
  * @output_sbal_addr_array: address of no_output_qs * 128 pointers
+ * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL)
  */
 struct qdio_initialize {
 	struct ccw_device *cdev;
@@ -297,11 +357,12 @@
 	unsigned int no_output_qs;
 	qdio_handler_t *input_handler;
 	qdio_handler_t *output_handler;
-	void (*queue_start_poll) (struct ccw_device *, int, unsigned long);
+	void (**queue_start_poll) (struct ccw_device *, int, unsigned long);
 	int scan_threshold;
 	unsigned long int_parm;
 	void **input_sbal_addr_array;
 	void **output_sbal_addr_array;
+	struct qdio_outbuf_state *output_sbal_state_array;
 };
 
 #define QDIO_STATE_INACTIVE		0x00000002 /* after qdio_cleanup */
@@ -316,6 +377,7 @@
 extern int qdio_allocate(struct qdio_initialize *);
 extern int qdio_establish(struct qdio_initialize *);
 extern int qdio_activate(struct ccw_device *);
+extern void qdio_release_aob(struct qaob *);
 extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int,
 		   unsigned int);
 extern int qdio_start_irq(struct ccw_device *, int);
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index e5c9664..2b21f65 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -44,6 +44,7 @@
 #define SLSB_STATE_NOT_INIT	0x0
 #define SLSB_STATE_EMPTY	0x1
 #define SLSB_STATE_PRIMED	0x2
+#define SLSB_STATE_PENDING	0x3
 #define SLSB_STATE_HALTED	0xe
 #define SLSB_STATE_ERROR	0xf
 #define SLSB_TYPE_INPUT		0x0
@@ -67,6 +68,8 @@
 	(SLSB_OWNER_PROG | SLSB_TYPE_OUTPUT | SLSB_STATE_NOT_INIT) /* 0xa0 */
 #define SLSB_P_OUTPUT_EMPTY	\
 	(SLSB_OWNER_PROG | SLSB_TYPE_OUTPUT | SLSB_STATE_EMPTY)	   /* 0xa1 */
+#define SLSB_P_OUTPUT_PENDING \
+	(SLSB_OWNER_PROG | SLSB_TYPE_OUTPUT | SLSB_STATE_PENDING)  /* 0xa3 */
 #define SLSB_CU_OUTPUT_PRIMED	\
 	(SLSB_OWNER_CU | SLSB_TYPE_OUTPUT | SLSB_STATE_PRIMED)	   /* 0x62 */
 #define SLSB_P_OUTPUT_HALTED	\
@@ -97,6 +100,7 @@
 #define QDIO_SIGA_WRITE		0x00
 #define QDIO_SIGA_READ		0x01
 #define QDIO_SIGA_SYNC		0x02
+#define QDIO_SIGA_WRITEQ	0x04
 #define QDIO_SIGA_QEBSM_FLAG	0x80
 
 #ifdef CONFIG_64BIT
@@ -253,6 +257,12 @@
 struct qdio_output_q {
 	/* PCIs are enabled for the queue */
 	int pci_out_enabled;
+	/* cq: use asynchronous output buffers */
+	int use_cq;
+	/* cq: aobs used for particual SBAL */
+	struct qaob **aobs;
+	/* cq: sbal state related to asynchronous operation */
+	struct qdio_outbuf_state *sbal_state;
 	/* timer to check for more outbound work */
 	struct timer_list timer;
 	/* used SBALs before tasklet schedule */
@@ -432,9 +442,20 @@
 
 extern struct indicator_t *q_indicators;
 
-static inline int shared_ind(u32 *dsci)
+static inline int has_multiple_inq_on_dsci(struct qdio_irq *irq)
 {
-	return dsci == &q_indicators[TIQDIO_SHARED_IND].ind;
+	return irq->nr_input_qs > 1;
+}
+
+static inline int references_shared_dsci(struct qdio_irq *irq)
+{
+	return irq->dsci == &q_indicators[TIQDIO_SHARED_IND].ind;
+}
+
+static inline int shared_ind(struct qdio_q *q)
+{
+	struct qdio_irq *i = q->irq_ptr;
+	return references_shared_dsci(i) || has_multiple_inq_on_dsci(i);
 }
 
 /* prototypes for thin interrupt */
@@ -449,6 +470,7 @@
 int tiqdio_register_thinints(void);
 void tiqdio_unregister_thinints(void);
 
+
 /* prototypes for setup */
 void qdio_inbound_processing(unsigned long data);
 void qdio_outbound_processing(unsigned long data);
@@ -469,6 +491,9 @@
 void qdio_setup_destroy_sysfs(struct ccw_device *cdev);
 int qdio_setup_init(void);
 void qdio_setup_exit(void);
+int qdio_enable_async_operation(struct qdio_output_q *q);
+void qdio_disable_async_operation(struct qdio_output_q *q);
+struct qaob *qdio_allocate_aob(void);
 
 int debug_get_buf_state(struct qdio_q *q, unsigned int bufnr,
 			unsigned char *state);
diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c
index 0e615cb..aaf7f93 100644
--- a/drivers/s390/cio/qdio_debug.c
+++ b/drivers/s390/cio/qdio_debug.c
@@ -76,6 +76,9 @@
 		case SLSB_P_OUTPUT_NOT_INIT:
 			seq_printf(m, "N");
 			break;
+		case SLSB_P_OUTPUT_PENDING:
+			seq_printf(m, "P");
+			break;
 		case SLSB_P_INPUT_PRIMED:
 		case SLSB_CU_OUTPUT_PRIMED:
 			seq_printf(m, "+");
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 288c914..a7153f2 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -14,6 +14,7 @@
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 #include <linux/kernel_stat.h>
 #include <linux/atomic.h>
 #include <asm/debug.h>
@@ -77,11 +78,13 @@
  * Note: For IQDC unicast queues only the highest priority queue is processed.
  */
 static inline int do_siga_output(unsigned long schid, unsigned long mask,
-				 unsigned int *bb, unsigned int fc)
+				 unsigned int *bb, unsigned int fc,
+				 unsigned long aob)
 {
 	register unsigned long __fc asm("0") = fc;
 	register unsigned long __schid asm("1") = schid;
 	register unsigned long __mask asm("2") = mask;
+	register unsigned long __aob asm("3") = aob;
 	int cc = QDIO_ERROR_SIGA_ACCESS_EXCEPTION;
 
 	asm volatile(
@@ -90,7 +93,8 @@
 		"	srl	%0,28\n"
 		"1:\n"
 		EX_TABLE(0b, 1b)
-		: "+d" (cc), "+d" (__fc), "+d" (__schid), "+d" (__mask)
+		: "+d" (cc), "+d" (__fc), "+d" (__schid), "+d" (__mask),
+		  "+d" (__aob)
 		: : "cc", "memory");
 	*bb = ((unsigned int) __fc) >> 31;
 	return cc;
@@ -212,7 +216,7 @@
 /* returns number of examined buffers and their common state in *state */
 static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr,
 				 unsigned char *state, unsigned int count,
-				 int auto_ack)
+				 int auto_ack, int merge_pending)
 {
 	unsigned char __state = 0;
 	int i;
@@ -224,9 +228,14 @@
 		return qdio_do_eqbs(q, state, bufnr, count, auto_ack);
 
 	for (i = 0; i < count; i++) {
-		if (!__state)
+		if (!__state) {
 			__state = q->slsb.val[bufnr];
-		else if (q->slsb.val[bufnr] != __state)
+			if (merge_pending && __state == SLSB_P_OUTPUT_PENDING)
+				__state = SLSB_P_OUTPUT_EMPTY;
+		} else if (merge_pending) {
+			if ((q->slsb.val[bufnr] & __state) != __state)
+				break;
+		} else if (q->slsb.val[bufnr] != __state)
 			break;
 		bufnr = next_buf(bufnr);
 	}
@@ -237,7 +246,7 @@
 static inline int get_buf_state(struct qdio_q *q, unsigned int bufnr,
 				unsigned char *state, int auto_ack)
 {
-	return get_buf_states(q, bufnr, state, 1, auto_ack);
+	return get_buf_states(q, bufnr, state, 1, auto_ack, 0);
 }
 
 /* wrap-around safe setting of slsb states, returns number of changed buffers */
@@ -308,19 +317,28 @@
 		return qdio_siga_sync(q, q->mask, 0);
 }
 
-static int qdio_siga_output(struct qdio_q *q, unsigned int *busy_bit)
+static int qdio_siga_output(struct qdio_q *q, unsigned int *busy_bit,
+	unsigned long aob)
 {
 	unsigned long schid = *((u32 *) &q->irq_ptr->schid);
 	unsigned int fc = QDIO_SIGA_WRITE;
 	u64 start_time = 0;
 	int retries = 0, cc;
+	unsigned long laob = 0;
+
+	if (q->u.out.use_cq && aob != 0) {
+		fc = QDIO_SIGA_WRITEQ;
+		laob = aob;
+	}
 
 	if (is_qebsm(q)) {
 		schid = q->irq_ptr->sch_token;
 		fc |= QDIO_SIGA_QEBSM_FLAG;
 	}
 again:
-	cc = do_siga_output(schid, q->mask, busy_bit, fc);
+	WARN_ON_ONCE((aob && queue_type(q) != QDIO_IQDIO_QFMT) ||
+		(aob && fc != QDIO_SIGA_WRITEQ));
+	cc = do_siga_output(schid, q->mask, busy_bit, fc, laob);
 
 	/* hipersocket busy condition */
 	if (unlikely(*busy_bit)) {
@@ -379,7 +397,7 @@
 {
 	if (need_siga_sync(q))
 		qdio_siga_sync_q(q);
-	return get_buf_states(q, bufnr, state, 1, 0);
+	return get_buf_states(q, bufnr, state, 1, 0, 0);
 }
 
 static inline void qdio_stop_polling(struct qdio_q *q)
@@ -507,7 +525,7 @@
 	 * No siga sync here, as a PCI or we after a thin interrupt
 	 * already sync'ed the queues.
 	 */
-	count = get_buf_states(q, q->first_to_check, &state, count, 1);
+	count = get_buf_states(q, q->first_to_check, &state, count, 1, 0);
 	if (!count)
 		goto out;
 
@@ -590,6 +608,107 @@
 		return 0;
 }
 
+static inline int contains_aobs(struct qdio_q *q)
+{
+	return !q->is_input_q && q->u.out.use_cq;
+}
+
+static inline void qdio_trace_aob(struct qdio_irq *irq, struct qdio_q *q,
+				int i, struct qaob *aob)
+{
+	int tmp;
+
+	DBF_DEV_EVENT(DBF_INFO, irq, "AOB%d:%lx", i,
+			(unsigned long) virt_to_phys(aob));
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES00:%lx",
+			(unsigned long) aob->res0[0]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES01:%lx",
+			(unsigned long) aob->res0[1]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES02:%lx",
+			(unsigned long) aob->res0[2]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES03:%lx",
+			(unsigned long) aob->res0[3]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES04:%lx",
+			(unsigned long) aob->res0[4]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES05:%lx",
+			(unsigned long) aob->res0[5]);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES1:%x", aob->res1);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES2:%x", aob->res2);
+	DBF_DEV_EVENT(DBF_INFO, irq, "RES3:%x", aob->res3);
+	DBF_DEV_EVENT(DBF_INFO, irq, "AORC:%u", aob->aorc);
+	DBF_DEV_EVENT(DBF_INFO, irq, "FLAGS:%u", aob->flags);
+	DBF_DEV_EVENT(DBF_INFO, irq, "CBTBS:%u", aob->cbtbs);
+	DBF_DEV_EVENT(DBF_INFO, irq, "SBC:%u", aob->sb_count);
+	for (tmp = 0; tmp < QDIO_MAX_ELEMENTS_PER_BUFFER; ++tmp) {
+		DBF_DEV_EVENT(DBF_INFO, irq, "SBA%d:%lx", tmp,
+				(unsigned long) aob->sba[tmp]);
+		DBF_DEV_EVENT(DBF_INFO, irq, "rSBA%d:%lx", tmp,
+				(unsigned long) q->sbal[i]->element[tmp].addr);
+		DBF_DEV_EVENT(DBF_INFO, irq, "DC%d:%u", tmp, aob->dcount[tmp]);
+		DBF_DEV_EVENT(DBF_INFO, irq, "rDC%d:%u", tmp,
+				q->sbal[i]->element[tmp].length);
+	}
+	DBF_DEV_EVENT(DBF_INFO, irq, "USER0:%lx", (unsigned long) aob->user0);
+	for (tmp = 0; tmp < 2; ++tmp) {
+		DBF_DEV_EVENT(DBF_INFO, irq, "RES4%d:%lx", tmp,
+			(unsigned long) aob->res4[tmp]);
+	}
+	DBF_DEV_EVENT(DBF_INFO, irq, "USER1:%lx", (unsigned long) aob->user1);
+	DBF_DEV_EVENT(DBF_INFO, irq, "USER2:%lx", (unsigned long) aob->user2);
+}
+
+static inline void qdio_handle_aobs(struct qdio_q *q, int start, int count)
+{
+	unsigned char state = 0;
+	int j, b = start;
+
+	if (!contains_aobs(q))
+		return;
+
+	for (j = 0; j < count; ++j) {
+		get_buf_state(q, b, &state, 0);
+		if (state == SLSB_P_OUTPUT_PENDING) {
+			struct qaob *aob = q->u.out.aobs[b];
+			if (aob == NULL)
+				continue;
+
+			BUG_ON(q->u.out.sbal_state == NULL);
+			q->u.out.sbal_state[b].flags |=
+				QDIO_OUTBUF_STATE_FLAG_PENDING;
+			q->u.out.aobs[b] = NULL;
+		} else if (state == SLSB_P_OUTPUT_EMPTY) {
+			BUG_ON(q->u.out.sbal_state == NULL);
+			q->u.out.sbal_state[b].aob = NULL;
+		}
+		b = next_buf(b);
+	}
+}
+
+static inline unsigned long qdio_aob_for_buffer(struct qdio_output_q *q,
+					int bufnr)
+{
+	unsigned long phys_aob = 0;
+
+	if (!q->use_cq)
+		goto out;
+
+	if (!q->aobs[bufnr]) {
+		struct qaob *aob = qdio_allocate_aob();
+		q->aobs[bufnr] = aob;
+	}
+	if (q->aobs[bufnr]) {
+		BUG_ON(q->sbal_state == NULL);
+		q->sbal_state[bufnr].flags = QDIO_OUTBUF_STATE_FLAG_NONE;
+		q->sbal_state[bufnr].aob = q->aobs[bufnr];
+		q->aobs[bufnr]->user1 = (u64) q->sbal_state[bufnr].user;
+		phys_aob = virt_to_phys(q->aobs[bufnr]);
+		BUG_ON(phys_aob & 0xFF);
+	}
+
+out:
+	return phys_aob;
+}
+
 static void qdio_kick_handler(struct qdio_q *q)
 {
 	int start = q->first_to_kick;
@@ -610,6 +729,8 @@
 			      start, count);
 	}
 
+	qdio_handle_aobs(q, start, count);
+
 	q->handler(q->irq_ptr->cdev, q->qdio_error, q->nr, start, count,
 		   q->irq_ptr->int_parm);
 
@@ -672,23 +793,26 @@
 	 */
 	count = min(atomic_read(&q->nr_buf_used), QDIO_MAX_BUFFERS_MASK);
 	stop = add_buf(q->first_to_check, count);
-
 	if (q->first_to_check == stop)
-		return q->first_to_check;
+		goto out;
 
-	count = get_buf_states(q, q->first_to_check, &state, count, 0);
+	count = get_buf_states(q, q->first_to_check, &state, count, 0, 1);
 	if (!count)
-		return q->first_to_check;
+		goto out;
 
 	switch (state) {
+	case SLSB_P_OUTPUT_PENDING:
+		BUG();
 	case SLSB_P_OUTPUT_EMPTY:
 		/* the adapter got it */
-		DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out empty:%1d %02x", q->nr, count);
+		DBF_DEV_EVENT(DBF_INFO, q->irq_ptr,
+			"out empty:%1d %02x", q->nr, count);
 
 		atomic_sub(count, &q->nr_buf_used);
 		q->first_to_check = add_buf(q->first_to_check, count);
 		if (q->irq_ptr->perf_stat_enabled)
 			account_sbals(q, count);
+
 		break;
 	case SLSB_P_OUTPUT_ERROR:
 		process_buffer_error(q, count);
@@ -701,7 +825,8 @@
 		/* the adapter has not fetched the output yet */
 		if (q->irq_ptr->perf_stat_enabled)
 			q->q_stats.nr_sbal_nop++;
-		DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out primed:%1d", q->nr);
+		DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out primed:%1d",
+			      q->nr);
 		break;
 	case SLSB_P_OUTPUT_NOT_INIT:
 	case SLSB_P_OUTPUT_HALTED:
@@ -709,6 +834,8 @@
 	default:
 		BUG();
 	}
+
+out:
 	return q->first_to_check;
 }
 
@@ -732,7 +859,7 @@
 		return 0;
 }
 
-static int qdio_kick_outbound_q(struct qdio_q *q)
+static int qdio_kick_outbound_q(struct qdio_q *q, unsigned long aob)
 {
 	int retries = 0, cc;
 	unsigned int busy_bit;
@@ -744,7 +871,7 @@
 retry:
 	qperf_inc(q, siga_write);
 
-	cc = qdio_siga_output(q, &busy_bit);
+	cc = qdio_siga_output(q, &busy_bit, aob);
 	switch (cc) {
 	case 0:
 		break;
@@ -921,8 +1048,9 @@
 			}
 			q->u.in.queue_start_poll(q->irq_ptr->cdev, q->nr,
 						 q->irq_ptr->int_parm);
-		} else
+		} else {
 			tasklet_schedule(&q->tasklet);
+		}
 	}
 
 	if (!pci_out_supported(q))
@@ -1236,6 +1364,26 @@
 }
 EXPORT_SYMBOL_GPL(qdio_allocate);
 
+static void qdio_detect_hsicq(struct qdio_irq *irq_ptr)
+{
+	struct qdio_q *q = irq_ptr->input_qs[0];
+	int i, use_cq = 0;
+
+	if (irq_ptr->nr_input_qs > 1 && queue_type(q) == QDIO_IQDIO_QFMT)
+		use_cq = 1;
+
+	for_each_output_queue(irq_ptr, q, i) {
+		if (use_cq) {
+			if (qdio_enable_async_operation(&q->u.out) < 0) {
+				use_cq = 0;
+				continue;
+			}
+		} else
+			qdio_disable_async_operation(&q->u.out);
+	}
+	DBF_EVENT("use_cq:%d", use_cq);
+}
+
 /**
  * qdio_establish - establish queues on a qdio subchannel
  * @init_data: initialization data
@@ -1301,6 +1449,8 @@
 	qdio_setup_ssqd_info(irq_ptr);
 	DBF_EVENT("qib ac:%4x", irq_ptr->qib.ac);
 
+	qdio_detect_hsicq(irq_ptr);
+
 	/* qebsm is now setup if available, initialize buffer states */
 	qdio_init_buf_states(irq_ptr);
 
@@ -1480,17 +1630,21 @@
 		q->u.out.pci_out_enabled = 0;
 
 	if (queue_type(q) == QDIO_IQDIO_QFMT) {
-		/* One SIGA-W per buffer required for unicast HiperSockets. */
+		unsigned long phys_aob = 0;
+
+		/* One SIGA-W per buffer required for unicast HSI */
 		WARN_ON_ONCE(count > 1 && !multicast_outbound(q));
 
-		rc = qdio_kick_outbound_q(q);
+		phys_aob = qdio_aob_for_buffer(&q->u.out, bufnr);
+
+		rc = qdio_kick_outbound_q(q, phys_aob);
 	} else if (need_siga_sync(q)) {
 		rc = qdio_siga_sync_q(q);
 	} else {
 		/* try to fast requeue buffers */
 		get_buf_state(q, prev_buf(bufnr), &state, 0);
 		if (state != SLSB_CU_OUTPUT_PRIMED)
-			rc = qdio_kick_outbound_q(q);
+			rc = qdio_kick_outbound_q(q, 0);
 		else
 			qperf_inc(q, fast_requeue);
 	}
@@ -1518,6 +1672,7 @@
 {
 	struct qdio_irq *irq_ptr;
 
+
 	if (bufnr >= QDIO_MAX_BUFFERS_PER_Q || count > QDIO_MAX_BUFFERS_PER_Q)
 		return -EINVAL;
 
@@ -1562,7 +1717,7 @@
 
 	WARN_ON(queue_irqs_enabled(q));
 
-	if (!shared_ind(q->irq_ptr->dsci))
+	if (!shared_ind(q))
 		xchg(q->irq_ptr->dsci, 0);
 
 	qdio_stop_polling(q);
@@ -1572,7 +1727,7 @@
 	 * We need to check again to not lose initiative after
 	 * resetting the ACK state.
 	 */
-	if (!shared_ind(q->irq_ptr->dsci) && *q->irq_ptr->dsci)
+	if (!shared_ind(q) && *q->irq_ptr->dsci)
 		goto rescan;
 	if (!qdio_inbound_q_done(q))
 		goto rescan;
diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c
index 89107d0..dd8bd67 100644
--- a/drivers/s390/cio/qdio_setup.c
+++ b/drivers/s390/cio/qdio_setup.c
@@ -19,6 +19,22 @@
 #include "qdio_debug.h"
 
 static struct kmem_cache *qdio_q_cache;
+static struct kmem_cache *qdio_aob_cache;
+
+struct qaob *qdio_allocate_aob()
+{
+	struct qaob *aob;
+
+	aob = kmem_cache_zalloc(qdio_aob_cache, GFP_ATOMIC);
+	return aob;
+}
+EXPORT_SYMBOL_GPL(qdio_allocate_aob);
+
+void qdio_release_aob(struct qaob *aob)
+{
+	kmem_cache_free(qdio_aob_cache, aob);
+}
+EXPORT_SYMBOL_GPL(qdio_release_aob);
 
 /*
  * qebsm is only available under 64bit but the adapter sets the feature
@@ -154,29 +170,36 @@
 	struct qdio_q *q;
 	void **input_sbal_array = qdio_init->input_sbal_addr_array;
 	void **output_sbal_array = qdio_init->output_sbal_addr_array;
+	struct qdio_outbuf_state *output_sbal_state_array =
+				  qdio_init->output_sbal_state_array;
 	int i;
 
 	for_each_input_queue(irq_ptr, q, i) {
-		DBF_EVENT("in-q:%1d", i);
+		DBF_EVENT("inq:%1d", i);
 		setup_queues_misc(q, irq_ptr, qdio_init->input_handler, i);
 
 		q->is_input_q = 1;
-		q->u.in.queue_start_poll = qdio_init->queue_start_poll;
+		q->u.in.queue_start_poll = qdio_init->queue_start_poll[i];
+
 		setup_storage_lists(q, irq_ptr, input_sbal_array, i);
 		input_sbal_array += QDIO_MAX_BUFFERS_PER_Q;
 
-		if (is_thinint_irq(irq_ptr))
+		if (is_thinint_irq(irq_ptr)) {
 			tasklet_init(&q->tasklet, tiqdio_inbound_processing,
 				     (unsigned long) q);
-		else
+		} else {
 			tasklet_init(&q->tasklet, qdio_inbound_processing,
 				     (unsigned long) q);
+		}
 	}
 
 	for_each_output_queue(irq_ptr, q, i) {
 		DBF_EVENT("outq:%1d", i);
 		setup_queues_misc(q, irq_ptr, qdio_init->output_handler, i);
 
+		q->u.out.sbal_state = output_sbal_state_array;
+		output_sbal_state_array += QDIO_MAX_BUFFERS_PER_Q;
+
 		q->is_input_q = 0;
 		q->u.out.scan_threshold = qdio_init->scan_threshold;
 		setup_storage_lists(q, irq_ptr, output_sbal_array, i);
@@ -311,6 +334,19 @@
 	for (i = 0; i < QDIO_MAX_QUEUES_PER_IRQ; i++) {
 		q = irq_ptr->output_qs[i];
 		if (q) {
+			if (q->u.out.use_cq) {
+				int n;
+
+				for (n = 0; n < QDIO_MAX_BUFFERS_PER_Q; ++n) {
+					struct qaob *aob = q->u.out.aobs[n];
+					if (aob) {
+						qdio_release_aob(aob);
+						q->u.out.aobs[n] = NULL;
+					}
+				}
+
+				qdio_disable_async_operation(&q->u.out);
+			}
 			free_page((unsigned long) q->slib);
 			kmem_cache_free(qdio_q_cache, q);
 		}
@@ -465,23 +501,60 @@
 	printk(KERN_INFO "%s", s);
 }
 
+int qdio_enable_async_operation(struct qdio_output_q *outq)
+{
+	outq->aobs = kzalloc(sizeof(struct qaob *) * QDIO_MAX_BUFFERS_PER_Q,
+			     GFP_ATOMIC);
+	if (!outq->aobs) {
+		outq->use_cq = 0;
+		return -ENOMEM;
+	}
+	outq->use_cq = 1;
+	return 0;
+}
+
+void qdio_disable_async_operation(struct qdio_output_q *q)
+{
+	kfree(q->aobs);
+	q->aobs = NULL;
+	q->use_cq = 0;
+}
+
 int __init qdio_setup_init(void)
 {
+	int rc;
+
 	qdio_q_cache = kmem_cache_create("qdio_q", sizeof(struct qdio_q),
 					 256, 0, NULL);
 	if (!qdio_q_cache)
 		return -ENOMEM;
 
+	qdio_aob_cache = kmem_cache_create("qdio_aob",
+					sizeof(struct qaob),
+					sizeof(struct qaob),
+					0,
+					NULL);
+	if (!qdio_aob_cache) {
+		rc = -ENOMEM;
+		goto free_qdio_q_cache;
+	}
+
 	/* Check for OSA/FCP thin interrupts (bit 67). */
 	DBF_EVENT("thinint:%1d",
 		  (css_general_characteristics.aif_osa) ? 1 : 0);
 
 	/* Check for QEBSM support in general (bit 58). */
 	DBF_EVENT("cssQEBSM:%1d", (qebsm_possible()) ? 1 : 0);
-	return 0;
+	rc = 0;
+out:
+	return rc;
+free_qdio_q_cache:
+	kmem_cache_destroy(qdio_q_cache);
+	goto out;
 }
 
 void qdio_setup_exit(void)
 {
+	kmem_cache_destroy(qdio_aob_cache);
 	kmem_cache_destroy(qdio_q_cache);
 }
diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c
index 2a1d4df..a3e3949 100644
--- a/drivers/s390/cio/qdio_thinint.c
+++ b/drivers/s390/cio/qdio_thinint.c
@@ -67,12 +67,9 @@
 
 void tiqdio_add_input_queues(struct qdio_irq *irq_ptr)
 {
-	struct qdio_q *q;
-	int i;
-
 	mutex_lock(&tiq_list_lock);
-	for_each_input_queue(irq_ptr, q, i)
-		list_add_rcu(&q->entry, &tiq_list);
+	BUG_ON(irq_ptr->nr_input_qs < 1);
+	list_add_rcu(&irq_ptr->input_qs[0]->entry, &tiq_list);
 	mutex_unlock(&tiq_list_lock);
 	xchg(irq_ptr->dsci, 1 << 7);
 }
@@ -80,19 +77,17 @@
 void tiqdio_remove_input_queues(struct qdio_irq *irq_ptr)
 {
 	struct qdio_q *q;
-	int i;
 
-	for (i = 0; i < irq_ptr->nr_input_qs; i++) {
-		q = irq_ptr->input_qs[i];
-		/* if establish triggered an error */
-		if (!q || !q->entry.prev || !q->entry.next)
-			continue;
+	BUG_ON(irq_ptr->nr_input_qs < 1);
+	q = irq_ptr->input_qs[0];
+	/* if establish triggered an error */
+	if (!q || !q->entry.prev || !q->entry.next)
+		return;
 
-		mutex_lock(&tiq_list_lock);
-		list_del_rcu(&q->entry);
-		mutex_unlock(&tiq_list_lock);
-		synchronize_rcu();
-	}
+	mutex_lock(&tiq_list_lock);
+	list_del_rcu(&q->entry);
+	mutex_unlock(&tiq_list_lock);
+	synchronize_rcu();
 }
 
 static inline u32 clear_shared_ind(void)
@@ -102,6 +97,40 @@
 	return xchg(&q_indicators[TIQDIO_SHARED_IND].ind, 0);
 }
 
+static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq)
+{
+	struct qdio_q *q;
+	int i;
+
+	for_each_input_queue(irq, q, i) {
+		if (!references_shared_dsci(irq) &&
+		    has_multiple_inq_on_dsci(irq))
+			xchg(q->irq_ptr->dsci, 0);
+
+		if (q->u.in.queue_start_poll) {
+			/* skip if polling is enabled or already in work */
+			if (test_and_set_bit(QDIO_QUEUE_IRQS_DISABLED,
+					     &q->u.in.queue_irq_state)) {
+				qperf_inc(q, int_discarded);
+				continue;
+			}
+
+			/* avoid dsci clear here, done after processing */
+			q->u.in.queue_start_poll(q->irq_ptr->cdev, q->nr,
+						 q->irq_ptr->int_parm);
+		} else {
+			if (!shared_ind(q))
+				xchg(q->irq_ptr->dsci, 0);
+
+			/*
+			 * Call inbound processing but not directly
+			 * since that could starve other thinint queues.
+			 */
+			tasklet_schedule(&q->tasklet);
+		}
+	}
+}
+
 /**
  * tiqdio_thinint_handler - thin interrupt handler for qdio
  * @alsi: pointer to adapter local summary indicator
@@ -120,35 +149,18 @@
 
 	/* check for work on all inbound thinint queues */
 	list_for_each_entry_rcu(q, &tiq_list, entry) {
+		struct qdio_irq *irq;
 
 		/* only process queues from changed sets */
-		if (unlikely(shared_ind(q->irq_ptr->dsci))) {
+		irq = q->irq_ptr;
+		if (unlikely(references_shared_dsci(irq))) {
 			if (!si_used)
 				continue;
-		} else if (!*q->irq_ptr->dsci)
+		} else if (!*irq->dsci)
 			continue;
 
-		if (q->u.in.queue_start_poll) {
-			/* skip if polling is enabled or already in work */
-			if (test_and_set_bit(QDIO_QUEUE_IRQS_DISABLED,
-					     &q->u.in.queue_irq_state)) {
-				qperf_inc(q, int_discarded);
-				continue;
-			}
+		tiqdio_call_inq_handlers(irq);
 
-			/* avoid dsci clear here, done after processing */
-			q->u.in.queue_start_poll(q->irq_ptr->cdev, q->nr,
-						 q->irq_ptr->int_parm);
-		} else {
-			/* only clear it if the indicator is non-shared */
-			if (!shared_ind(q->irq_ptr->dsci))
-				xchg(q->irq_ptr->dsci, 0);
-			/*
-			 * Call inbound processing but not directly
-			 * since that could starve other thinint queues.
-			 */
-			tasklet_schedule(&q->tasklet);
-		}
 		qperf_inc(q, adapter_int);
 	}
 	rcu_read_unlock();
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 2b0fb05..8d804be 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3939,6 +3939,7 @@
 	struct qdio_initialize init_data;
 	char *qib_param_field;
 	struct qdio_buffer **in_sbal_ptrs;
+	void (**queue_start_poll) (struct ccw_device *, int, unsigned long);
 	struct qdio_buffer **out_sbal_ptrs;
 	int i, j, k;
 	int rc = 0;
@@ -3947,8 +3948,10 @@
 
 	qib_param_field = kzalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof(char),
 			      GFP_KERNEL);
-	if (!qib_param_field)
-		return -ENOMEM;
+	if (!qib_param_field) {
+		rc =  -ENOMEM;
+		goto out_free_nothing;
+	}
 
 	qeth_create_qib_param_field(card, qib_param_field);
 	qeth_create_qib_param_field_blkt(card, qib_param_field);
@@ -3956,20 +3959,26 @@
 	in_sbal_ptrs = kmalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof(void *),
 			       GFP_KERNEL);
 	if (!in_sbal_ptrs) {
-		kfree(qib_param_field);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto out_free_qib_param;
 	}
 	for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i)
 		in_sbal_ptrs[i] = (struct qdio_buffer *)
 			virt_to_phys(card->qdio.in_q->bufs[i].buffer);
 
+	queue_start_poll = kmalloc(sizeof(void *) * 1, GFP_KERNEL);
+	if (!queue_start_poll) {
+		rc = -ENOMEM;
+		goto out_free_in_sbals;
+	}
+	queue_start_poll[0] = card->discipline.start_poll;
+
 	out_sbal_ptrs =
 		kmalloc(card->qdio.no_out_queues * QDIO_MAX_BUFFERS_PER_Q *
 			sizeof(void *), GFP_KERNEL);
 	if (!out_sbal_ptrs) {
-		kfree(in_sbal_ptrs);
-		kfree(qib_param_field);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto out_free_queue_start_poll;
 	}
 	for (i = 0, k = 0; i < card->qdio.no_out_queues; ++i)
 		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j, ++k) {
@@ -3986,7 +3995,7 @@
 	init_data.no_output_qs           = card->qdio.no_out_queues;
 	init_data.input_handler          = card->discipline.input_handler;
 	init_data.output_handler         = card->discipline.output_handler;
-	init_data.queue_start_poll	 = card->discipline.start_poll;
+	init_data.queue_start_poll       = queue_start_poll;
 	init_data.int_parm               = (unsigned long) card;
 	init_data.input_sbal_addr_array  = (void **) in_sbal_ptrs;
 	init_data.output_sbal_addr_array = (void **) out_sbal_ptrs;
@@ -4008,8 +4017,13 @@
 	}
 out:
 	kfree(out_sbal_ptrs);
+out_free_queue_start_poll:
+	kfree(queue_start_poll);
+out_free_in_sbals:
 	kfree(in_sbal_ptrs);
+out_free_qib_param:
 	kfree(qib_param_field);
+out_free_nothing:
 	return rc;
 }