microblaze: Optimize CACHE_LOOP_LIMITS and CACHE_RANGE_LOOP macros

1. Remove CACHE_ALL_LOOP2 macro because it is identical to CACHE_ALL_LOOP
2. Change BUG_ON to WARN_ON
3. Remove end aligned from CACHE_LOOP_LIMITS.
C implementation do not need aligned end address and ASM code do aligned
in their macros
4. ASM optimized  CACHE_RANGE_LOOP_1/2 macros needs to get aligned end address.
Because end address is compound from start + size, end address is the first address
which is exclude.

Here is the corresponding code which describe it.
+       int align = ~(line_length - 1);
+       end = ((end & align) == end) ? end - line_length : end & align;

a) end is aligned:
it is necessary to subtruct line length because we don't want to work with
next cacheline
b) end address is not aligned:
Just align it to be ready for ASM code.

Signed-off-by: Michal Simek <monstr@monstr.eu>
diff --git a/arch/microblaze/kernel/cpu/cache.c b/arch/microblaze/kernel/cpu/cache.c
index 19dae71..21c3a92 100644
--- a/arch/microblaze/kernel/cpu/cache.c
+++ b/arch/microblaze/kernel/cpu/cache.c
@@ -96,13 +96,16 @@
 }
 
 
-/* Helper macro for computing the limits of cache range loops */
+/* Helper macro for computing the limits of cache range loops
+ *
+ * End address can be unaligned which is OK for C implementation.
+ * ASM implementation align it in ASM macros
+ */
 #define CACHE_LOOP_LIMITS(start, end, cache_line_length, cache_size)	\
 do {									\
 	int align = ~(cache_line_length - 1);				\
 	end = min(start + cache_size, end);				\
 	start &= align;							\
-	end = ((end & align) + cache_line_length);			\
 } while (0);
 
 /*
@@ -111,9 +114,9 @@
  */
 #define CACHE_ALL_LOOP(cache_size, line_length, op)			\
 do {									\
-	unsigned int len = cache_size;					\
+	unsigned int len = cache_size - line_length;			\
 	int step = -line_length;					\
-	BUG_ON(step >= 0);						\
+	WARN_ON(step >= 0);						\
 									\
 	__asm__ __volatile__ (" 1:      " #op " %0, r0;			\
 					bgtid   %0, 1b;			\
@@ -122,26 +125,21 @@
 					: "memory");			\
 } while (0);
 
-
-#define CACHE_ALL_LOOP2(cache_size, line_length, op)			\
-do {									\
-	unsigned int len = cache_size;					\
-	int step = -line_length;					\
-	BUG_ON(step >= 0);						\
-									\
-	__asm__ __volatile__ (" 1:	" #op "	r0, %0;			\
-					bgtid	%0, 1b;			\
-					addk	%0, %0, %1;		\
-					" : : "r" (len), "r" (step)	\
-					: "memory");			\
-} while (0);
-
-/* for wdc.flush/clear */
+/* Used for wdc.flush/clear which can use rB for offset which is not possible
+ * to use for simple wdc or wic.
+ *
+ * start address is cache aligned
+ * end address is not aligned, if end is aligned then I have to substract
+ * cacheline length because I can't flush/invalidate the next cacheline.
+ * If is not, I align it because I will flush/invalidate whole line.
+ */
 #define CACHE_RANGE_LOOP_2(start, end, line_length, op)			\
 do {									\
 	int step = -line_length;					\
+	int align = ~(line_length - 1);					\
+	end = ((end & align) == end) ? end - line_length : end & align;	\
 	int count = end - start;					\
-	BUG_ON(count <= 0);						\
+	WARN_ON(count < 0);						\
 									\
 	__asm__ __volatile__ (" 1:	" #op "	%0, %1;			\
 					bgtid	%1, 1b;			\
@@ -154,7 +152,9 @@
 #define CACHE_RANGE_LOOP_1(start, end, line_length, op)			\
 do {									\
 	int volatile temp;						\
-	BUG_ON(end - start <= 0);					\
+	int align = ~(line_length - 1);					\
+	end = ((end & align) == end) ? end - line_length : end & align;	\
+	WARN_ON(end - start < 0);					\
 									\
 	__asm__ __volatile__ (" 1:	" #op "	%1, r0;			\
 					cmpu	%0, %1, %2;		\
@@ -360,8 +360,12 @@
 #endif
 }
 
-/* FIXME this is weird - should be only wdc but not work
- * MS: I am getting bus errors and other weird things */
+/* FIXME It is blindly invalidation as is expected
+ * but can't be called on noMMU in microblaze_cache_init below
+ *
+ * MS: noMMU kernel won't boot if simple wdc is used
+ * The reason should be that there are discared data which kernel needs
+ */
 static void __invalidate_dcache_all_wb(void)
 {
 #ifndef ASM_LOOP
@@ -369,12 +373,12 @@
 #endif
 	pr_debug("%s\n", __func__);
 #ifdef ASM_LOOP
-	CACHE_ALL_LOOP2(cpuinfo.dcache_size, cpuinfo.dcache_line_length,
-					wdc.clear)
+	CACHE_ALL_LOOP(cpuinfo.dcache_size, cpuinfo.dcache_line_length,
+					wdc)
 #else
 	for (i = 0; i < cpuinfo.dcache_size;
 		 i += cpuinfo.dcache_line_length)
-			__asm__ __volatile__ ("wdc.clear	%0, r0;" \
+			__asm__ __volatile__ ("wdc	%0, r0;" \
 					: : "r" (i));
 #endif
 }
@@ -650,7 +654,11 @@
 			}
 		}
 	}
-	invalidate_dcache();
+/* FIXME Invalidation is done in U-BOOT
+ * WT cache: Data is already written to main memory
+ * WB cache: Discard data on noMMU which caused that kernel doesn't boot
+ */
+	/* invalidate_dcache(); */
 	enable_dcache();
 
 	invalidate_icache();