Score: Implement the function csum_ipv6_magic

Signed-off-by: Lennox Wu <lennox.wu@gmail.com>
diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h
index f909ac3..961bd64 100644
--- a/arch/score/include/asm/checksum.h
+++ b/arch/score/include/asm/checksum.h
@@ -184,48 +184,57 @@
 				__wsum sum)
 {
 	__asm__ __volatile__(
-		".set\tnoreorder\t\t\t# csum_ipv6_magic\n\t"
-		".set\tnoat\n\t"
-		"addu\t%0, %5\t\t\t# proto (long in network byte order)\n\t"
-		"sltu\t$1, %0, %5\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %6\t\t\t# csum\n\t"
-		"sltu\t$1, %0, %6\n\t"
-		"lw\t%1, 0(%2)\t\t\t# four words source address\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 4(%2)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 8(%2)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 12(%2)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 0(%3)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 4(%3)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 8(%3)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"lw\t%1, 12(%3)\n\t"
-		"addu\t%0, $1\n\t"
-		"addu\t%0, %1\n\t"
-		"sltu\t$1, %0, %1\n\t"
-		"addu\t%0, $1\t\t\t# Add final carry\n\t"
-		".set\tnoat\n\t"
-		".set\tnoreorder"
+		".set\tvolatile\t\t\t# csum_ipv6_magic\n\t"
+		"add\t%0, %0, %5\t\t\t# proto (long in network byte order)\n\t"
+		"cmp.c\t%5, %0\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %6\t\t\t# csum\n\t"
+		"cmp.c\t%6, %0\n\t"
+		"lw\t%1, [%2, 0]\t\t\t# four words source address\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"1:lw\t%1, [%2, 4]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%2,8]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%2, 12]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0,%1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%3, 0]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%3, 4]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%3, 8]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"lw\t%1, [%3, 12]\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:add\t%0, %0, %1\n\t"
+		"cmp.c\t%1, %0\n\t"
+		"bleu 1f\n\t"
+		"addi\t%0, 0x1\n\t"
+		"1:\n\t"
+		".set\toptimize"
 		: "=r" (sum), "=r" (proto)
 		: "r" (saddr), "r" (daddr),
 		  "0" (htonl(len)), "1" (htonl(proto)), "r" (sum));