fbdev: Support for byte-reversed framebuffer formats

Allow generic frame-buffer code to correctly write texts and blit images for
1, 2 and 4 bit per pixel frame-buffer organizations when pixels in bytes are
organized to in opposite order than bytes in long type.

Overhead should be reasonable.  If option is not selected, than compiler
should eliminate completely all overhead.

The feature is disabled at compile time if CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is
not set.

[adaplas]
Convert helper functions to macros if feature is not enabled.

Signed-off-by: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Signed-off-by: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index f1cc899..5c1ef09 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -103,6 +103,15 @@
 	  blitting. This is used by drivers that don't provide their own
 	  (accelerated) version.
 
+config FB_CFB_REV_PIXELS_IN_BYTE
+	bool
+	depends on FB
+	default n
+	---help---
+	  Allow generic frame-buffer functions to work on displays with 1, 2
+	  and 4 bits per pixel depths which has opposite order of pixels in
+	  byte order to bytes in long order.
+
 config FB_SYS_FILLRECT
 	tristate
 	depends on FB
diff --git a/drivers/video/cfbcopyarea.c b/drivers/video/cfbcopyarea.c
index 032210f..1c67885 100644
--- a/drivers/video/cfbcopyarea.c
+++ b/drivers/video/cfbcopyarea.c
@@ -45,14 +45,14 @@
 
 static void
 bitcpy(unsigned long __iomem *dst, int dst_idx, const unsigned long __iomem *src,
-	int src_idx, int bits, unsigned n)
+	int src_idx, int bits, unsigned n, u32 bswapmask)
 {
 	unsigned long first, last;
 	int const shift = dst_idx-src_idx;
 	int left, right;
 
-	first = FB_SHIFT_HIGH(~0UL, dst_idx);
-	last = ~(FB_SHIFT_HIGH(~0UL, (dst_idx+n) % bits));
+	first = fb_shifted_pixels_mask_long(dst_idx, bswapmask);
+	last = ~fb_shifted_pixels_mask_long((dst_idx+n) % bits, bswapmask);
 
 	if (!shift) {
 		// Same alignment for source and dest
@@ -185,7 +185,7 @@
 
 static void
 bitcpy_rev(unsigned long __iomem *dst, int dst_idx, const unsigned long __iomem *src,
-		int src_idx, int bits, unsigned n)
+		int src_idx, int bits, unsigned n, u32 bswapmask)
 {
 	unsigned long first, last;
 	int shift;
@@ -203,8 +203,8 @@
 
 	shift = dst_idx-src_idx;
 
-	first = FB_SHIFT_LOW(~0UL, bits - 1 - dst_idx);
-	last = ~(FB_SHIFT_LOW(~0UL, bits - 1 - ((dst_idx-n) % bits)));
+	first = fb_shifted_pixels_mask_long(bits - 1 - dst_idx, bswapmask);
+	last = ~fb_shifted_pixels_mask_long(bits - 1 - ((dst_idx-n) % bits), bswapmask);
 
 	if (!shift) {
 		// Same alignment for source and dest
@@ -336,6 +336,7 @@
 	unsigned long __iomem *dst = NULL, *src = NULL;
 	int bits = BITS_PER_LONG, bytes = bits >> 3;
 	int dst_idx = 0, src_idx = 0, rev_copy = 0;
+	u32 bswapmask = fb_compute_bswapmask(p);
 
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
@@ -368,7 +369,7 @@
 			src += src_idx >> (ffs(bits) - 1);
 			src_idx &= (bytes - 1);
 			bitcpy_rev(dst, dst_idx, src, src_idx, bits,
-				width*p->var.bits_per_pixel);
+				width*p->var.bits_per_pixel, bswapmask);
 		}
 	} else {
 		while (height--) {
@@ -377,7 +378,7 @@
 			src += src_idx >> (ffs(bits) - 1);
 			src_idx &= (bytes - 1);
 			bitcpy(dst, dst_idx, src, src_idx, bits,
-				width*p->var.bits_per_pixel);
+				width*p->var.bits_per_pixel, bswapmask);
 			dst_idx += bits_per_line;
 			src_idx += bits_per_line;
 		}
diff --git a/drivers/video/cfbfillrect.c b/drivers/video/cfbfillrect.c
index 71623b4..23d70a1 100644
--- a/drivers/video/cfbfillrect.c
+++ b/drivers/video/cfbfillrect.c
@@ -36,15 +36,16 @@
      */
 
 static void
-bitfill_aligned(unsigned long __iomem *dst, int dst_idx, unsigned long pat, unsigned n, int bits)
+bitfill_aligned(unsigned long __iomem *dst, int dst_idx, unsigned long pat,
+		unsigned n, int bits, u32 bswapmask)
 {
 	unsigned long first, last;
 
 	if (!n)
 		return;
 
-	first = FB_SHIFT_HIGH(~0UL, dst_idx);
-	last = ~(FB_SHIFT_HIGH(~0UL, (dst_idx+n) % bits));
+	first = fb_shifted_pixels_mask_long(dst_idx, bswapmask);
+	last = ~fb_shifted_pixels_mask_long((dst_idx+n) % bits, bswapmask);
 
 	if (dst_idx+n <= bits) {
 		// Single word
@@ -146,7 +147,8 @@
      *  Aligned pattern invert using 32/64-bit memory accesses
      */
 static void
-bitfill_aligned_rev(unsigned long __iomem *dst, int dst_idx, unsigned long pat, unsigned n, int bits)
+bitfill_aligned_rev(unsigned long __iomem *dst, int dst_idx, unsigned long pat,
+		unsigned n, int bits, u32 bswapmask)
 {
 	unsigned long val = pat, dat;
 	unsigned long first, last;
@@ -154,8 +156,8 @@
 	if (!n)
 		return;
 
-	first = FB_SHIFT_HIGH(~0UL, dst_idx);
-	last = ~(FB_SHIFT_HIGH(~0UL, (dst_idx+n) % bits));
+	first = fb_shifted_pixels_mask_long(dst_idx, bswapmask);
+	last = ~fb_shifted_pixels_mask_long((dst_idx+n) % bits, bswapmask);
 
 	if (dst_idx+n <= bits) {
 		// Single word
@@ -303,8 +305,10 @@
 	if (p->fbops->fb_sync)
 		p->fbops->fb_sync(p);
 	if (!left) {
+		u32 bswapmask = fb_compute_bswapmask(p);
 		void (*fill_op32)(unsigned long __iomem *dst, int dst_idx,
-		                  unsigned long pat, unsigned n, int bits) = NULL;
+		                  unsigned long pat, unsigned n, int bits,
+				  u32 bswapmask) = NULL;
 
 		switch (rect->rop) {
 		case ROP_XOR:
@@ -321,7 +325,7 @@
 		while (height--) {
 			dst += dst_idx >> (ffs(bits) - 1);
 			dst_idx &= (bits - 1);
-			fill_op32(dst, dst_idx, pat, width*bpp, bits);
+			fill_op32(dst, dst_idx, pat, width*bpp, bits, bswapmask);
 			dst_idx += p->fix.line_length*8;
 		}
 	} else {
diff --git a/drivers/video/cfbimgblt.c b/drivers/video/cfbimgblt.c
index 2610044..f598907 100644
--- a/drivers/video/cfbimgblt.c
+++ b/drivers/video/cfbimgblt.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/fb.h>
 #include <asm/types.h>
+#include "fb_draw.h"
 
 #define DEBUG
 
@@ -87,6 +88,7 @@
 	u32 null_bits = 32 - bpp;
 	u32 *palette = (u32 *) p->pseudo_palette;
 	const u8 *src = image->data;
+	u32 bswapmask = fb_compute_bswapmask(p);
 
 	dst2 = (u32 __iomem *) dst1;
 	for (i = image->height; i--; ) {
@@ -96,7 +98,7 @@
 		val = 0;
 		
 		if (start_index) {
-			u32 start_mask = ~(FB_SHIFT_HIGH(~(u32)0, start_index));
+			u32 start_mask = ~fb_shifted_pixels_mask_u32(start_index, bswapmask);
 			val = FB_READL(dst) & start_mask;
 			shift = start_index;
 		}
@@ -107,7 +109,7 @@
 			else
 				color = *src;
 			color <<= FB_LEFT_POS(bpp);
-			val |= FB_SHIFT_HIGH(color, shift);
+			val |= FB_SHIFT_HIGH(color, shift ^ bswapmask);
 			if (shift >= null_bits) {
 				FB_WRITEL(val, dst++);
 	
@@ -119,7 +121,7 @@
 			src++;
 		}
 		if (shift) {
-			u32 end_mask = FB_SHIFT_HIGH(~(u32)0, shift);
+			u32 end_mask = fb_shifted_pixels_mask_u32(shift, bswapmask);
 
 			FB_WRITEL((FB_READL(dst) & end_mask) | val, dst);
 		}
@@ -147,7 +149,8 @@
 	u32 spitch = (image->width+7)/8;
 	const u8 *src = image->data, *s;
 	u32 i, j, l;
-	
+	u32 bswapmask = fb_compute_bswapmask(p);
+
 	dst2 = (u32 __iomem *) dst1;
 	fgcolor <<= FB_LEFT_POS(bpp);
 	bgcolor <<= FB_LEFT_POS(bpp);
@@ -161,7 +164,7 @@
 
 		/* write leading bits */
 		if (start_index) {
-			u32 start_mask = ~(FB_SHIFT_HIGH(~(u32)0,start_index));
+			u32 start_mask = ~fb_shifted_pixels_mask_u32(start_index, bswapmask);
 			val = FB_READL(dst) & start_mask;
 			shift = start_index;
 		}
@@ -169,7 +172,7 @@
 		while (j--) {
 			l--;
 			color = (*s & (1 << l)) ? fgcolor : bgcolor;
-			val |= FB_SHIFT_HIGH(color, shift);
+			val |= FB_SHIFT_HIGH(color, shift ^ bswapmask);
 			
 			/* Did the bitshift spill bits to the next long? */
 			if (shift >= null_bits) {
@@ -184,7 +187,7 @@
 
 		/* write trailing bits */
  		if (shift) {
-			u32 end_mask = FB_SHIFT_HIGH(~(u32)0, shift);
+			u32 end_mask = fb_shifted_pixels_mask_u32(shift, bswapmask);
 
 			FB_WRITEL((FB_READL(dst) & end_mask) | val, dst);
 		}
diff --git a/drivers/video/fb_draw.h b/drivers/video/fb_draw.h
index c5c4520..816843f 100644
--- a/drivers/video/fb_draw.h
+++ b/drivers/video/fb_draw.h
@@ -2,6 +2,7 @@
 #define _FB_DRAW_H
 
 #include <asm/types.h>
+#include <linux/fb.h>
 
     /*
      *  Compose two values, using a bitmask as decision value
@@ -69,4 +70,71 @@
     }
 }
 #endif
+
+#ifdef CONFIG_FB_CFB_REV_PIXELS_IN_BYTE
+
+static inline u32 fb_shifted_pixels_mask_u32(u32 index, u32 bswapmask)
+{
+	u32 mask;
+
+	if (!bswapmask) {
+		mask = FB_SHIFT_HIGH(~(u32)0, index);
+	} else {
+		mask = 0xff << FB_LEFT_POS(8);
+		mask = FB_SHIFT_LOW(mask, index & (bswapmask)) & mask;
+		mask = FB_SHIFT_HIGH(mask, index & ~(bswapmask));
+#if defined(__i386__) || defined(__x86_64__)
+		/* Shift argument is limited to 0 - 31 on x86 based CPU's */
+		if(index + bswapmask < 32)
+#endif
+			mask |= FB_SHIFT_HIGH(~(u32)0,
+					(index + bswapmask) & ~(bswapmask));
+	}
+	return mask;
+}
+
+static inline unsigned long fb_shifted_pixels_mask_long(u32 index, u32 bswapmask)
+{
+	unsigned long mask;
+
+	if (!bswapmask) {
+		mask = FB_SHIFT_HIGH(~0UL, index);
+	} else {
+		mask = 0xff << FB_LEFT_POS(8);
+		mask = FB_SHIFT_LOW(mask, index & (bswapmask)) & mask;
+		mask = FB_SHIFT_HIGH(mask, index & ~(bswapmask));
+#if defined(__i386__) || defined(__x86_64__)
+		/* Shift argument is limited to 0 - 31 on x86 based CPU's */
+		if(index + bswapmask < BITS_PER_LONG)
+#endif
+			mask |= FB_SHIFT_HIGH(~0UL,
+					(index + bswapmask) & ~(bswapmask));
+	}
+	return mask;
+}
+
+
+static inline u32 fb_compute_bswapmask(struct fb_info *info)
+{
+	u32 bswapmask = 0;
+	unsigned bpp = info->var.bits_per_pixel;
+
+	if ((bpp < 8) && (info->var.nonstd & FB_NONSTD_REV_PIX_IN_B)) {
+		/*
+		 * Reversed order of pixel layout in bytes
+		 * works only for 1, 2 and 4 bpp
+		 */
+		bswapmask = 7 - bpp + 1;
+	}
+	return bswapmask;
+}
+
+#else /* CONFIG_FB_CFB_REV_PIXELS_IN_BYTE */
+
+#define fb_shifted_pixels_mask_u32(i, b) FB_SHIFT_HIGH(~(u32)0, (i))
+#define fb_shifted_pixels_mask_long(i, b) FB_SHIFT_HIGH(~0UL, (i))
+#define fb_compute_bswapmask(...) 0
+
+#endif  /* CONFIG_FB_CFB_REV_PIXELS_IN_BYTE */
+
 #endif /* FB_DRAW_H */