Introduce little endian RISC support

Little endian RISC support was either missing or incorrectly implemented in AES.
This was previously avoided by always falling back to the CISC implementation
on LE machines. These additions and fixes add LE RISC support to AES and allow
configure to define CPU_RISC regardless of the endianness of the target.
diff --git a/configure b/configure
index 40396d9..49a05d0 100755
--- a/configure
+++ b/configure
@@ -4079,15 +4079,8 @@
 $as_echo "#define HAVE_X86 1" >>confdefs.h
 ;;
 	* )
-	# CPU_RISC is only supported for big endian machines.
-	if test "$ac_cv_c_bigendian" = "yes"; then
 
 $as_echo "#define CPU_RISC 1" >>confdefs.h
-
-	else
-	   $as_echo "#define CPU_CISC 1" >>confdefs.h
-
-	fi
 	;;
 esac
 
diff --git a/crypto/cipher/aes.c b/crypto/cipher/aes.c
index a279682..e91e525 100644
--- a/crypto/cipher/aes.c
+++ b/crypto/cipher/aes.c
@@ -1528,6 +1528,7 @@
 #ifdef CPU_RISC
     uint32_t tmp;
 
+#ifdef WORDS_BIGENDIAN
     tmp = expanded_key->round[i].v32[0];
     expanded_key->round[i].v32[0] = 
       U0[T4[(tmp >> 24)       ] & 0xff] ^ 
@@ -1555,6 +1556,36 @@
       U1[T4[(tmp >> 16) & 0xff] & 0xff] ^ 
       U2[T4[(tmp >> 8)  & 0xff] & 0xff] ^ 
       U3[T4[(tmp)       & 0xff] & 0xff];
+#else
+    tmp = expanded_key->round[i].v32[0];
+    expanded_key->round[i].v32[0] = 
+      U3[T4[(tmp >> 24)       ] & 0xff] ^ 
+      U2[T4[(tmp >> 16) & 0xff] & 0xff] ^ 
+      U1[T4[(tmp >> 8)  & 0xff] & 0xff] ^ 
+      U0[T4[(tmp)       & 0xff] & 0xff];
+
+    tmp = expanded_key->round[i].v32[1];
+    expanded_key->round[i].v32[1] = 
+      U3[T4[(tmp >> 24)       ] & 0xff] ^ 
+      U2[T4[(tmp >> 16) & 0xff] & 0xff] ^ 
+      U1[T4[(tmp >> 8)  & 0xff] & 0xff] ^ 
+      U0[T4[(tmp)       & 0xff] & 0xff];
+
+    tmp = expanded_key->round[i].v32[2];
+    expanded_key->round[i].v32[2] = 
+      U3[T4[(tmp >> 24)       ] & 0xff] ^ 
+      U2[T4[(tmp >> 16) & 0xff] & 0xff] ^ 
+      U1[T4[(tmp >> 8)  & 0xff] & 0xff] ^ 
+      U0[T4[(tmp)       & 0xff] & 0xff];
+
+    tmp = expanded_key->round[i].v32[3];
+    expanded_key->round[i].v32[3] = 
+      U3[T4[(tmp >> 24)       ] & 0xff] ^ 
+      U2[T4[(tmp >> 16) & 0xff] & 0xff] ^ 
+      U1[T4[(tmp >> 8)  & 0xff] & 0xff] ^ 
+      U0[T4[(tmp)       & 0xff] & 0xff];
+#endif /* WORDS_BIGENDIAN */
+
 #else /* assume CPU_CISC */
 
     uint32_t c0, c1, c2, c3;
@@ -1768,7 +1799,6 @@
      of state, using the tables U0, U1, U2, U3 */
 
 #ifdef WORDS_BIGENDIAN
-  /* FIX!  WRong indexes */
   column0 = U0[state->v32[0] >> 24] ^ U1[(state->v32[3] >> 16) & 0xff]
     ^ U2[(state->v32[2] >> 8) & 0xff] ^ U3[state->v32[1] & 0xff];
 
@@ -1781,17 +1811,17 @@
   column3 = U0[state->v32[3] >> 24] ^ U1[(state->v32[2] >> 16) & 0xff]
     ^ U2[(state->v32[1] >> 8) & 0xff] ^ U3[state->v32[0] & 0xff];
 #else
-  column0 = U0[state->v32[0] & 0xff] ^ U1[(state->v32[1] >> 8) & 0xff]
-	^ U2[(state->v32[2] >> 16) & 0xff] ^ U3[state->v32[3] >> 24];
+  column0 = U0[state->v32[0] & 0xff] ^ U1[(state->v32[3] >> 8) & 0xff]
+    ^ U2[(state->v32[2] >> 16) & 0xff] ^ U3[(state->v32[1] >> 24) & 0xff];
 
-  column1 = U0[state->v32[1] & 0xff] ^ U1[(state->v32[2] >> 8) & 0xff]
-	^ U2[(state->v32[3] >> 16) & 0xff] ^ U3[state->v32[0] >> 24];
+  column1 = U0[state->v32[1] & 0xff] ^ U1[(state->v32[0] >> 8) & 0xff]
+    ^ U2[(state->v32[3] >> 16) & 0xff] ^ U3[(state->v32[2] >> 24) & 0xff];
 
-  column2 = U0[state->v32[2] & 0xff] ^ U1[(state->v32[3] >> 8) & 0xff]
-	^ U2[(state->v32[0] >> 16) & 0xff] ^ U3[state->v32[1] >> 24];
+  column2 = U0[state->v32[2] & 0xff] ^ U1[(state->v32[1] >> 8) & 0xff]
+    ^ U2[(state->v32[0] >> 16) & 0xff] ^ U3[(state->v32[3] >> 24) & 0xff];
 
-  column3 = U0[state->v32[3] & 0xff] ^ U1[(state->v32[0] >> 8) & 0xff]
-	^ U2[(state->v32[1] >> 16) & 0xff] ^ U3[state->v32[2] >> 24];
+  column3 = U0[state->v32[3] & 0xff] ^ U1[(state->v32[2] >> 8) & 0xff]
+    ^ U2[(state->v32[1] >> 16) & 0xff] ^ U3[(state->v32[0] >> 24) & 0xff];
 #endif /* WORDS_BIGENDIAN */
 
   state->v32[0] = column0 ^ round_key->v32[0];
@@ -1805,6 +1835,7 @@
 aes_final_round(v128_t *state, const v128_t *round_key) {
   uint32_t tmp0, tmp1, tmp2, tmp3;
 
+#ifdef WORDS_BIGENDIAN
   tmp0 = (T4[(state->v32[0] >> 24)]        & 0xff000000) 
        ^ (T4[(state->v32[1] >> 16) & 0xff] & 0x00ff0000) 
        ^ (T4[(state->v32[2] >>  8) & 0xff] & 0x0000ff00) 
@@ -1828,6 +1859,31 @@
        ^ (T4[(state->v32[1] >>  8) & 0xff] & 0x0000ff00)
        ^ (T4[(state->v32[2]      ) & 0xff] & 0x000000ff)
        ^ round_key->v32[3];
+#else
+  tmp0 = (T4[(state->v32[3] >> 24)]        & 0xff000000) 
+       ^ (T4[(state->v32[2] >> 16) & 0xff] & 0x00ff0000) 
+       ^ (T4[(state->v32[1] >>  8) & 0xff] & 0x0000ff00) 
+       ^ (T4[(state->v32[0]      ) & 0xff] & 0x000000ff) 
+       ^ round_key->v32[0];
+
+  tmp1 = (T4[(state->v32[0] >> 24)]        & 0xff000000)
+       ^ (T4[(state->v32[3] >> 16) & 0xff] & 0x00ff0000)
+       ^ (T4[(state->v32[2] >>  8) & 0xff] & 0x0000ff00)
+       ^ (T4[(state->v32[1]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[1];
+
+  tmp2 = (T4[(state->v32[1] >> 24)]        & 0xff000000)
+       ^ (T4[(state->v32[0] >> 16) & 0xff] & 0x00ff0000)
+       ^ (T4[(state->v32[3] >>  8) & 0xff] & 0x0000ff00)
+       ^ (T4[(state->v32[2]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[2];
+
+  tmp3 = (T4[(state->v32[2] >> 24)]        & 0xff000000)
+       ^ (T4[(state->v32[1] >> 16) & 0xff] & 0x00ff0000)
+       ^ (T4[(state->v32[0] >>  8) & 0xff] & 0x0000ff00)
+       ^ (T4[(state->v32[3]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[3];
+#endif /* WORDS_BIGENDIAN */
 
   state->v32[0] = tmp0;
   state->v32[1] = tmp1;
@@ -1840,6 +1896,7 @@
 aes_inv_final_round(v128_t *state, const v128_t *round_key) {
   uint32_t tmp0, tmp1, tmp2, tmp3;
 
+#ifdef WORDS_BIGENDIAN
   tmp0 = (U4[(state->v32[0] >> 24)]        & 0xff000000) 
        ^ (U4[(state->v32[3] >> 16) & 0xff] & 0x00ff0000) 
        ^ (U4[(state->v32[2] >>  8) & 0xff] & 0x0000ff00) 
@@ -1863,6 +1920,31 @@
        ^ (U4[(state->v32[1] >>  8) & 0xff] & 0x0000ff00)
        ^ (U4[(state->v32[0]      ) & 0xff] & 0x000000ff)
        ^ round_key->v32[3];
+#else
+  tmp0 = (U4[(state->v32[1] >> 24)]        & 0xff000000) 
+       ^ (U4[(state->v32[2] >> 16) & 0xff] & 0x00ff0000) 
+       ^ (U4[(state->v32[3] >>  8) & 0xff] & 0x0000ff00) 
+       ^ (U4[(state->v32[0]      ) & 0xff] & 0x000000ff) 
+       ^ round_key->v32[0];
+
+  tmp1 = (U4[(state->v32[2] >> 24)]        & 0xff000000)
+       ^ (U4[(state->v32[3] >> 16) & 0xff] & 0x00ff0000)
+       ^ (U4[(state->v32[0] >>  8) & 0xff] & 0x0000ff00)
+       ^ (U4[(state->v32[1]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[1];
+
+  tmp2 = (U4[(state->v32[3] >> 24)]        & 0xff000000)
+       ^ (U4[(state->v32[0] >> 16) & 0xff] & 0x00ff0000)
+       ^ (U4[(state->v32[1] >>  8) & 0xff] & 0x0000ff00)
+       ^ (U4[(state->v32[2]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[2];
+
+  tmp3 = (U4[(state->v32[0] >> 24)]        & 0xff000000)
+       ^ (U4[(state->v32[1] >> 16) & 0xff] & 0x00ff0000)
+       ^ (U4[(state->v32[2] >>  8) & 0xff] & 0x0000ff00)
+       ^ (U4[(state->v32[3]      ) & 0xff] & 0x000000ff)
+       ^ round_key->v32[3];
+#endif /* WORDS_BIGENDIAN */
 
   state->v32[0] = tmp0;
   state->v32[1] = tmp1;