video: add driver for PXA3xx 2D graphics accelerator

This adds a driver for the the 2D graphics accelerator found on PXA3xx
processors. Only resource mapping, interrupt handling and a simple ioctl
handler is done by the kernel part, the rest of the logic is implemented
in DirectFB userspace.

Graphic applications greatly benefit for line drawing, blend, and
rectangle and triangle filling operations.

Benchmarks done on a PXA303 using the df_dok benchmarking tool follow,
where the value in square brackets show the CPU usage during that test.

Without accelerator (benchmarking 256x252 on 480x262 RGB16 (16bit)):

  Anti-aliased Text                              3.016 secs (   65.649 KChars/sec) [ 99.6%]
  Fill Rectangle                                 3.021 secs (  175.107 MPixel/sec) [ 98.0%]
  Fill Rectangle (blend)                         3.582 secs (    3.602 MPixel/sec) [ 99.7%]
  Fill Rectangles [10]                           3.177 secs (  182.753 MPixel/sec) [ 98.1%]
  Fill Rectangles [10] (blend)                  18.020 secs (    3.580 MPixel/sec) [ 98.7%]
  Fill Spans                                     3.019 secs (  145.306 MPixel/sec) [ 98.0%]
  Fill Spans (blend)                             3.616 secs (    3.568 MPixel/sec) [ 99.4%]
  Blit                                           3.074 secs (   39.874 MPixel/sec) [ 98.0%]
  Blit 180                                       3.020 secs (   32.042 MPixel/sec) [ 98.0%]
  Blit with format conversion                    3.005 secs (   19.321 MPixel/sec) [ 99.6%]
  Blit from 32bit (blend)                        4.792 secs (    2.692 MPixel/sec) [ 98.7%]

With accelerator:

  Anti-aliased Text                              3.056 secs (*  36.518 KChars/sec) [ 21.3%]
  Fill Rectangle                                 3.015 secs (* 115.543 MPixel/sec) [  8.9%]
  Fill Rectangle (blend)                         3.180 secs (*  20.286 MPixel/sec) [  1.8%]
  Fill Rectangles [10]                           3.251 secs (* 119.062 MPixel/sec) [  1.2%]
  Fill Rectangles [10] (blend)                   6.293 secs (*  20.502 MPixel/sec) [  0.3%]
  Fill Spans                                     3.051 secs (*  97.264 MPixel/sec) [ 35.7%]
  Fill Spans (blend)                             3.377 secs (*  15.282 MPixel/sec) [ 17.8%]
  Blit                                           3.046 secs (*  27.533 MPixel/sec) [  2.6%]
  Blit 180                                       3.098 secs (*  27.070 MPixel/sec) [  2.2%]
  Blit with format conversion                    3.131 secs (*  39.148 MPixel/sec) [  2.8%]
  Blit from 32bit (blend)                        3.346 secs (*  11.568 MPixel/sec) [  0.8%]

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Tested-by: Sven Neumann <s.neumann@raumfeld.com>
Cc: Eric Miao <eric.y.miao@gmail.com>
Cc: Denis Oliver Kropp <dok@directfb.org>
Cc: Sven Neumann <s.neumann@raumfeld.com>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Signed-off-by: Eric Miao <eric.y.miao@gmail.com>
diff --git a/drivers/video/pxa3xx-gcu.h b/drivers/video/pxa3xx-gcu.h
new file mode 100644
index 0000000..0428ed0
--- /dev/null
+++ b/drivers/video/pxa3xx-gcu.h
@@ -0,0 +1,38 @@
+#ifndef __PXA3XX_GCU_H__
+#define __PXA3XX_GCU_H__
+
+#include <linux/types.h>
+
+/* Number of 32bit words in display list (ring buffer). */
+#define PXA3XX_GCU_BUFFER_WORDS  ((256 * 1024 - 256) / 4)
+
+/* To be increased when breaking the ABI */
+#define PXA3XX_GCU_SHARED_MAGIC  0x30000001
+
+#define PXA3XX_GCU_BATCH_WORDS   8192
+
+struct pxa3xx_gcu_shared {
+	u32            buffer[PXA3XX_GCU_BUFFER_WORDS];
+
+	bool           hw_running;
+
+	unsigned long  buffer_phys;
+
+	unsigned int   num_words;
+	unsigned int   num_writes;
+	unsigned int   num_done;
+	unsigned int   num_interrupts;
+	unsigned int   num_wait_idle;
+	unsigned int   num_wait_free;
+	unsigned int   num_idle;
+
+	u32            magic;
+};
+
+/* Initialization and synchronization.
+ * Hardware is started upon write(). */
+#define PXA3XX_GCU_IOCTL_RESET		_IO('G', 0)
+#define PXA3XX_GCU_IOCTL_WAIT_IDLE	_IO('G', 2)
+
+#endif /* __PXA3XX_GCU_H__ */
+