crypto: marvell - Add load balancing between engines

This commit adds support for fine-grained load balancing on
multi-engine IPs. The engine is pre-selected based on its current load
and on the weight of the crypto request that is about to be processed.
The global crypto queue is also replaced by one queue per engine. These
changes are required to allow chaining crypto requests at the DMA
level. By using a crypto queue per engine, we make sure that the state
of the tdma chain stays synchronized with that queue. We also reduce
contention on 'cesa_dev->lock' and improve parallelism.
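
For illustration, the queueing path is then expected to look roughly
like the sketch below (simplified, assuming an ablkcipher-style caller;
only mv_cesa_select_engine() and mv_cesa_queue_req() come from this
patch, the other names are illustrative):

	/* pick the least loaded engine, weighted by the request size */
	engine = mv_cesa_select_engine(req->nbytes);
	mv_cesa_ablkcipher_prepare(&req->base, engine);
	/* enqueue on that engine's own crypto queue */
	ret = mv_cesa_queue_req(&req->base, creq);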

Signed-off-by: Romain Perier <romain.perier@free-electrons.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/drivers/crypto/marvell/cesa.h b/drivers/crypto/marvell/cesa.h
index d749335..8d600682 100644
--- a/drivers/crypto/marvell/cesa.h
+++ b/drivers/crypto/marvell/cesa.h
@@ -400,7 +400,6 @@
  * @regs:	device registers
  * @sram_size:	usable SRAM size
  * @lock:	device lock
- * @queue:	crypto request queue
  * @engines:	array of engines
  * @dma:	dma pools
  *
@@ -412,7 +411,6 @@
 	struct device *dev;
 	unsigned int sram_size;
 	spinlock_t lock;
-	struct crypto_queue queue;
 	struct mv_cesa_engine *engines;
 	struct mv_cesa_dev_dma *dma;
 };
@@ -431,6 +429,8 @@
  * @int_mask:		interrupt mask cache
  * @pool:		memory pool pointing to the memory region reserved in
  *			SRAM
+ * @queue:		fifo of the pending crypto requests
+ * @load:		engine load counter, useful for load balancing
  *
  * Structure storing CESA engine information.
  */
@@ -446,11 +446,12 @@
 	size_t max_req_len;
 	u32 int_mask;
 	struct gen_pool *pool;
+	struct crypto_queue queue;
+	atomic_t load;
 };
 
 /**
  * struct mv_cesa_req_ops - CESA request operations
- * @prepare:	prepare a request to be executed on the specified engine
 * @process:	process a request chunk result (should return 0 if the
 *		operation is complete, -EINPROGRESS if it needs more steps,
 *		or an error code otherwise)
@@ -460,8 +461,6 @@
  * 		needed.
  */
 struct mv_cesa_req_ops {
-	void (*prepare)(struct crypto_async_request *req,
-			struct mv_cesa_engine *engine);
 	int (*process)(struct crypto_async_request *req, u32 status);
 	void (*step)(struct crypto_async_request *req);
 	void (*cleanup)(struct crypto_async_request *req);
@@ -690,6 +689,31 @@
 int mv_cesa_queue_req(struct crypto_async_request *req,
 		      struct mv_cesa_req *creq);
 
+static inline struct mv_cesa_engine *mv_cesa_select_engine(int weight)
+{
+	int i;
+	u32 min_load = U32_MAX;
+	struct mv_cesa_engine *selected = NULL;
+
+	for (i = 0; i < cesa_dev->caps->nengines; i++) {
+		struct mv_cesa_engine *engine = cesa_dev->engines + i;
+		u32 load = atomic_read(&engine->load);
+
+		if (load < min_load) {
+			min_load = load;
+			selected = engine;
+		}
+	}
+
+	atomic_add(weight, &selected->load);
+
+	return selected;
+}
+
 /*
  * Helper function that indicates whether a crypto request needs to be
  * cleaned up or not after being enqueued using mv_cesa_queue_req().