libbb: disable a second md5 implementation which managed to creep in :)

function                                             old     new   delta
sha512_end                                           239     237      -2
sha256_end                                           162     160      -2
sha1_end                                             191     189      -2
md5_end                                              168     166      -2
__md5__magic                                           4       -      -4
md5_crypt                                            627     621      -6
static.S                                              16       -     -16
__md5_Init                                            42       -     -42
static.P                                              64       -     -64
__md5_Final                                          131       -    -131
__md5_Update                                         153       -    -153
static.C                                             268      12    -256
__md5_Transform                                      293       -    -293
------------------------------------------------------------------------------
(add/remove: 0/7 grow/shrink: 0/6 up/down: 0/-973)           Total: -973 bytes

diff --git a/include/libbb.h b/include/libbb.h
index 839a0de..85a915e 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1309,7 +1309,7 @@
 } sha1_ctx_t;
 void sha1_begin(sha1_ctx_t *ctx) FAST_FUNC;
 void sha1_hash(const void *data, size_t length, sha1_ctx_t *ctx) FAST_FUNC;
-void *sha1_end(void *resbuf, sha1_ctx_t *ctx) FAST_FUNC;
+void sha1_end(void *resbuf, sha1_ctx_t *ctx) FAST_FUNC;
 typedef struct sha256_ctx_t {
 	uint32_t H[8];
 	uint32_t total[2]; /* rename to "count"? */
@@ -1318,7 +1318,7 @@
 } sha256_ctx_t;
 void sha256_begin(sha256_ctx_t *ctx) FAST_FUNC;
 void sha256_hash(const void *buffer, size_t len, sha256_ctx_t *ctx) FAST_FUNC;
-void* sha256_end(void *resbuf, sha256_ctx_t *ctx) FAST_FUNC;
+void sha256_end(void *resbuf, sha256_ctx_t *ctx) FAST_FUNC;
 typedef struct sha512_ctx_t {
 	uint64_t H[8];
 	uint64_t total[2];
@@ -1327,7 +1327,8 @@
 } sha512_ctx_t;
 void sha512_begin(sha512_ctx_t *ctx) FAST_FUNC;
 void sha512_hash(const void *buffer, size_t len, sha512_ctx_t *ctx) FAST_FUNC;
-void* sha512_end(void *resbuf, sha512_ctx_t *ctx) FAST_FUNC;
+void sha512_end(void *resbuf, sha512_ctx_t *ctx) FAST_FUNC;
+#if 1
 typedef struct md5_ctx_t {
 	uint32_t A;
 	uint32_t B;
@@ -1337,9 +1338,18 @@
 	uint32_t buflen;
 	char buffer[128];
 } md5_ctx_t;
+#else
+/* libbb/md5prime.c uses a slightly different context layout: */
+typedef struct md5_ctx_t {
+	uint32_t state[4];	/* state (ABCD) */
+	uint32_t count[2];	/* number of bits, modulo 2^64 (lsb first) */
+	unsigned char buffer[64];	/* input buffer */
+} md5_ctx_t;
+#endif
 void md5_begin(md5_ctx_t *ctx) FAST_FUNC;
 void md5_hash(const void *data, size_t length, md5_ctx_t *ctx) FAST_FUNC;
-void *md5_end(void *resbuf, md5_ctx_t *ctx) FAST_FUNC;
+void md5_end(void *resbuf, md5_ctx_t *ctx) FAST_FUNC;
+
 
 uint32_t *crc32_filltable(uint32_t *tbl256, int endian) FAST_FUNC;
 
diff --git a/libbb/Kbuild b/libbb/Kbuild
index 786cbee..b82f03c 100644
--- a/libbb/Kbuild
+++ b/libbb/Kbuild
@@ -58,6 +58,8 @@
 lib-y += makedev.o
 lib-y += match_fstype.o
 lib-y += md5.o
+# Alternative (disabled) implementation
+#lib-y += md5prime.o
 lib-y += messages.o
 lib-y += mode_string.o
 lib-y += mtab_file.o
diff --git a/libbb/md5.c b/libbb/md5.c
index 4ab06eb..eb15d75 100644
--- a/libbb/md5.c
+++ b/libbb/md5.c
@@ -15,8 +15,11 @@
 
 #include "libbb.h"
 
-#if CONFIG_MD5_SIZE_VS_SPEED < 0 || CONFIG_MD5_SIZE_VS_SPEED > 3
-# define MD5_SIZE_VS_SPEED 2
+/* 0: fastest, 3: smallest */
+#if CONFIG_MD5_SIZE_VS_SPEED < 0
+# define MD5_SIZE_VS_SPEED 0
+#elif CONFIG_MD5_SIZE_VS_SPEED > 3
+# define MD5_SIZE_VS_SPEED 3
 #else
 # define MD5_SIZE_VS_SPEED CONFIG_MD5_SIZE_VS_SPEED
 #endif
@@ -30,7 +33,6 @@
 	ctx->B = 0xefcdab89;
 	ctx->C = 0x98badcfe;
 	ctx->D = 0x10325476;
-
 	ctx->total = 0;
 	ctx->buflen = 0;
 }
@@ -40,10 +42,12 @@
  * (as found in Colin Plumbs public domain implementation).
  * #define FF(b, c, d) ((b & c) | (~b & d))
  */
-# define FF(b, c, d) (d ^ (b & (c ^ d)))
-# define FG(b, c, d) FF (d, b, c)
-# define FH(b, c, d) (b ^ c ^ d)
-# define FI(b, c, d) (c ^ (b | ~d))
+#define FF(b, c, d) (d ^ (b & (c ^ d)))
+#define FG(b, c, d) FF(d, b, c)
+#define FH(b, c, d) (b ^ c ^ d)
+#define FI(b, c, d) (c ^ (b | ~d))
+
+#define rotl32(w, s) (((w) << (s)) | ((w) >> (32 - (s))))
 
 /* Hash a single block, 64 bytes long and 4-byte aligned. */
 static void md5_hash_block(const void *buffer, md5_ctx_t *ctx)
@@ -51,7 +55,7 @@
 	uint32_t correct_words[16];
 	const uint32_t *words = buffer;
 
-# if MD5_SIZE_VS_SPEED > 0
+#if MD5_SIZE_VS_SPEED > 0
 	static const uint32_t C_array[] = {
 		/* round 1 */
 		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
@@ -74,26 +78,23 @@
 		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
 		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 	};
-
 	static const char P_array[] ALIGN1 = {
-#  if MD5_SIZE_VS_SPEED > 1
+# if MD5_SIZE_VS_SPEED > 1
 		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,	/* 1 */
-#  endif	/* MD5_SIZE_VS_SPEED > 1 */
+# endif
 		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12,	/* 2 */
 		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2,	/* 3 */
 		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9	/* 4 */
 	};
-
-#  if MD5_SIZE_VS_SPEED > 1
+# if MD5_SIZE_VS_SPEED > 1
 	static const char S_array[] ALIGN1 = {
 		7, 12, 17, 22,
 		5, 9, 14, 20,
 		4, 11, 16, 23,
 		6, 10, 15, 21
 	};
-#  endif	/* MD5_SIZE_VS_SPEED > 1 */
-# endif
-
+# endif	/* MD5_SIZE_VS_SPEED > 1 */
+#endif
 	uint32_t A = ctx->A;
 	uint32_t B = ctx->B;
 	uint32_t C = ctx->C;
@@ -101,263 +102,252 @@
 
 	/* Process all bytes in the buffer with 64 bytes in each round of
 	   the loop.  */
-		uint32_t *cwp = correct_words;
-		uint32_t A_save = A;
-		uint32_t B_save = B;
-		uint32_t C_save = C;
-		uint32_t D_save = D;
+	uint32_t *cwp = correct_words;
+	uint32_t A_save = A;
+	uint32_t B_save = B;
+	uint32_t C_save = C;
+	uint32_t D_save = D;
 
-# if MD5_SIZE_VS_SPEED > 1
-#  define CYCLIC(w, s) (w = (w << s) | (w >> (32 - s)))
+#if MD5_SIZE_VS_SPEED > 1
+	const uint32_t *pc;
+	const char *pp;
+	const char *ps;
+	int i;
+	uint32_t temp;
 
-		const uint32_t *pc;
-		const char *pp;
-		const char *ps;
-		int i;
-		uint32_t temp;
+	for (i = 0; i < 16; i++)
+		cwp[i] = SWAP_LE32(words[i]);
+	words += 16;
 
-		for (i = 0; i < 16; i++) {
-			cwp[i] = SWAP_LE32(words[i]);
+# if MD5_SIZE_VS_SPEED > 2
+	pc = C_array;
+	pp = P_array;
+	ps = S_array - 4;
+
+	for (i = 0; i < 64; i++) {
+		if ((i & 0x0f) == 0)
+			ps += 4;
+		temp = A;
+		switch (i >> 4) {
+		case 0:
+			temp += FF(B, C, D);
+			break;
+		case 1:
+			temp += FG(B, C, D);
+			break;
+		case 2:
+			temp += FH(B, C, D);
+			break;
+		case 3:
+			temp += FI(B, C, D);
 		}
-		words += 16;
-
-#  if MD5_SIZE_VS_SPEED > 2
-		pc = C_array;
-		pp = P_array;
-		ps = S_array - 4;
-
-		for (i = 0; i < 64; i++) {
-			if ((i & 0x0f) == 0)
-				ps += 4;
-			temp = A;
-			switch (i >> 4) {
-			case 0:
-				temp += FF(B, C, D);
-				break;
-			case 1:
-				temp += FG(B, C, D);
-				break;
-			case 2:
-				temp += FH(B, C, D);
-				break;
-			case 3:
-				temp += FI(B, C, D);
-			}
-			temp += cwp[(int) (*pp++)] + *pc++;
-			CYCLIC(temp, ps[i & 3]);
-			temp += B;
-			A = D;
-			D = C;
-			C = B;
-			B = temp;
-		}
-#  else
-		pc = C_array;
-		pp = P_array;
-		ps = S_array;
-
-		for (i = 0; i < 16; i++) {
-			temp = A + FF(B, C, D) + cwp[(int) (*pp++)] + *pc++;
-			CYCLIC(temp, ps[i & 3]);
-			temp += B;
-			A = D;
-			D = C;
-			C = B;
-			B = temp;
-		}
-
-		ps += 4;
-		for (i = 0; i < 16; i++) {
-			temp = A + FG(B, C, D) + cwp[(int) (*pp++)] + *pc++;
-			CYCLIC(temp, ps[i & 3]);
-			temp += B;
-			A = D;
-			D = C;
-			C = B;
-			B = temp;
-		}
-		ps += 4;
-		for (i = 0; i < 16; i++) {
-			temp = A + FH(B, C, D) + cwp[(int) (*pp++)] + *pc++;
-			CYCLIC(temp, ps[i & 3]);
-			temp += B;
-			A = D;
-			D = C;
-			C = B;
-			B = temp;
-		}
-		ps += 4;
-		for (i = 0; i < 16; i++) {
-			temp = A + FI(B, C, D) + cwp[(int) (*pp++)] + *pc++;
-			CYCLIC(temp, ps[i & 3]);
-			temp += B;
-			A = D;
-			D = C;
-			C = B;
-			B = temp;
-		}
-
-#  endif	/* MD5_SIZE_VS_SPEED > 2 */
+		temp += cwp[(int) (*pp++)] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += B;
+		A = D;
+		D = C;
+		C = B;
+		B = temp;
+	}
 # else
-		/* First round: using the given function, the context and a constant
-		   the next context is computed.  Because the algorithms processing
-		   unit is a 32-bit word and it is determined to work on words in
-		   little endian byte order we perhaps have to change the byte order
-		   before the computation.  To reduce the work for the next steps
-		   we store the swapped words in the array CORRECT_WORDS.  */
+	pc = C_array;
+	pp = P_array;
+	ps = S_array;
 
-#  define OP(a, b, c, d, s, T) \
+	for (i = 0; i < 16; i++) {
+		temp = A + FF(B, C, D) + cwp[(int) (*pp++)] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += B;
+		A = D;
+		D = C;
+		C = B;
+		B = temp;
+	}
+	ps += 4;
+	for (i = 0; i < 16; i++) {
+		temp = A + FG(B, C, D) + cwp[(int) (*pp++)] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += B;
+		A = D;
+		D = C;
+		C = B;
+		B = temp;
+	}
+	ps += 4;
+	for (i = 0; i < 16; i++) {
+		temp = A + FH(B, C, D) + cwp[(int) (*pp++)] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += B;
+		A = D;
+		D = C;
+		C = B;
+		B = temp;
+	}
+	ps += 4;
+	for (i = 0; i < 16; i++) {
+		temp = A + FI(B, C, D) + cwp[(int) (*pp++)] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += B;
+		A = D;
+		D = C;
+		C = B;
+		B = temp;
+	}
+
+# endif /* MD5_SIZE_VS_SPEED > 2 */
+#else
+	/* First round: using the given function, the context and a constant
+	   the next context is computed.  Because the algorithm's processing
+	   unit is a 32-bit word and it is determined to work on words in
+	   little endian byte order we perhaps have to change the byte order
+	   before the computation.  To reduce the work for the next steps
+	   we store the swapped words in the array CORRECT_WORDS.  */
+# define OP(a, b, c, d, s, T) \
 	do { \
-		a += FF (b, c, d) + (*cwp++ = SWAP_LE32(*words)) + T; \
+		a += FF(b, c, d) + (*cwp++ = SWAP_LE32(*words)) + T; \
 		++words; \
-		CYCLIC (a, s); \
+		a = rotl32(a, s); \
 		a += b; \
 	} while (0)
 
-		/* It is unfortunate that C does not provide an operator for
-		   cyclic rotation.  Hope the C compiler is smart enough.  */
-		/* gcc 2.95.4 seems to be --aaronl */
-#  define CYCLIC(w, s) (w = (w << s) | (w >> (32 - s)))
+	/* Before we start, a word about the strange constants.
+	   They are defined in RFC 1321 as
+	   T[i] = (int)(4294967296.0 * fabs(sin(i))), i=1..64
+	 */
 
-		/* Before we start, one word to the strange constants.
-		   They are defined in RFC 1321 as
+# if MD5_SIZE_VS_SPEED == 1
+	const uint32_t *pc;
+	const char *pp;
+	int i;
+# endif	/* MD5_SIZE_VS_SPEED == 1 */
 
-		   T[i] = (int) (4294967296.0 * fabs (sin (i))), i=1..64
-		 */
+	/* Round 1.  */
+# if MD5_SIZE_VS_SPEED == 1
+	pc = C_array;
+	for (i = 0; i < 4; i++) {
+		OP(A, B, C, D, 7, *pc++);
+		OP(D, A, B, C, 12, *pc++);
+		OP(C, D, A, B, 17, *pc++);
+		OP(B, C, D, A, 22, *pc++);
+	}
+# else
+	OP(A, B, C, D, 7, 0xd76aa478);
+	OP(D, A, B, C, 12, 0xe8c7b756);
+	OP(C, D, A, B, 17, 0x242070db);
+	OP(B, C, D, A, 22, 0xc1bdceee);
+	OP(A, B, C, D, 7, 0xf57c0faf);
+	OP(D, A, B, C, 12, 0x4787c62a);
+	OP(C, D, A, B, 17, 0xa8304613);
+	OP(B, C, D, A, 22, 0xfd469501);
+	OP(A, B, C, D, 7, 0x698098d8);
+	OP(D, A, B, C, 12, 0x8b44f7af);
+	OP(C, D, A, B, 17, 0xffff5bb1);
+	OP(B, C, D, A, 22, 0x895cd7be);
+	OP(A, B, C, D, 7, 0x6b901122);
+	OP(D, A, B, C, 12, 0xfd987193);
+	OP(C, D, A, B, 17, 0xa679438e);
+	OP(B, C, D, A, 22, 0x49b40821);
+# endif/* MD5_SIZE_VS_SPEED == 1 */
 
-#  if MD5_SIZE_VS_SPEED == 1
-		const uint32_t *pc;
-		const char *pp;
-		int i;
-#  endif	/* MD5_SIZE_VS_SPEED */
-
-		/* Round 1.  */
-#  if MD5_SIZE_VS_SPEED == 1
-		pc = C_array;
-		for (i = 0; i < 4; i++) {
-			OP(A, B, C, D, 7, *pc++);
-			OP(D, A, B, C, 12, *pc++);
-			OP(C, D, A, B, 17, *pc++);
-			OP(B, C, D, A, 22, *pc++);
-		}
-#  else
-		OP(A, B, C, D, 7, 0xd76aa478);
-		OP(D, A, B, C, 12, 0xe8c7b756);
-		OP(C, D, A, B, 17, 0x242070db);
-		OP(B, C, D, A, 22, 0xc1bdceee);
-		OP(A, B, C, D, 7, 0xf57c0faf);
-		OP(D, A, B, C, 12, 0x4787c62a);
-		OP(C, D, A, B, 17, 0xa8304613);
-		OP(B, C, D, A, 22, 0xfd469501);
-		OP(A, B, C, D, 7, 0x698098d8);
-		OP(D, A, B, C, 12, 0x8b44f7af);
-		OP(C, D, A, B, 17, 0xffff5bb1);
-		OP(B, C, D, A, 22, 0x895cd7be);
-		OP(A, B, C, D, 7, 0x6b901122);
-		OP(D, A, B, C, 12, 0xfd987193);
-		OP(C, D, A, B, 17, 0xa679438e);
-		OP(B, C, D, A, 22, 0x49b40821);
-#  endif	/* MD5_SIZE_VS_SPEED == 1 */
-
-		/* For the second to fourth round we have the possibly swapped words
-		   in CORRECT_WORDS.  Redefine the macro to take an additional first
-		   argument specifying the function to use.  */
-#  undef OP
-#  define OP(f, a, b, c, d, k, s, T) \
+	/* For the second to fourth round we have the possibly swapped words
+	   in CORRECT_WORDS.  Redefine the macro to take an additional first
+	   argument specifying the function to use.  */
+# undef OP
+# define OP(f, a, b, c, d, k, s, T) \
 	do { \
-		a += f (b, c, d) + correct_words[k] + T; \
-		CYCLIC (a, s); \
+		a += f(b, c, d) + correct_words[k] + T; \
+		a = rotl32(a, s); \
 		a += b; \
 	} while (0)
 
-		/* Round 2.  */
-#  if MD5_SIZE_VS_SPEED == 1
-		pp = P_array;
-		for (i = 0; i < 4; i++) {
-			OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
-			OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
-			OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
-			OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
-		}
-#  else
-		OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
-		OP(FG, D, A, B, C, 6, 9, 0xc040b340);
-		OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
-		OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
-		OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
-		OP(FG, D, A, B, C, 10, 9, 0x02441453);
-		OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
-		OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
-		OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
-		OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
-		OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
-		OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
-		OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
-		OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
-		OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
-		OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
-#  endif	/* MD5_SIZE_VS_SPEED == 1 */
+	/* Round 2.  */
+# if MD5_SIZE_VS_SPEED == 1
+	pp = P_array;
+	for (i = 0; i < 4; i++) {
+		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
+		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
+		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
+		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
+	}
+# else
+	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
+	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
+	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
+	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
+	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
+	OP(FG, D, A, B, C, 10, 9, 0x02441453);
+	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
+	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
+	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
+	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
+	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
+	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
+	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
+	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
+	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
+	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
+# endif/* MD5_SIZE_VS_SPEED == 1 */
 
-		/* Round 3.  */
-#  if MD5_SIZE_VS_SPEED == 1
-		for (i = 0; i < 4; i++) {
-			OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
-			OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
-			OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
-			OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
-		}
-#  else
-		OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
-		OP(FH, D, A, B, C, 8, 11, 0x8771f681);
-		OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
-		OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
-		OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
-		OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
-		OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
-		OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
-		OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
-		OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
-		OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
-		OP(FH, B, C, D, A, 6, 23, 0x04881d05);
-		OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
-		OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
-		OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
-		OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
-#  endif	/* MD5_SIZE_VS_SPEED == 1 */
+	/* Round 3.  */
+# if MD5_SIZE_VS_SPEED == 1
+	for (i = 0; i < 4; i++) {
+		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
+		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
+		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
+		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
+	}
+# else
+	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
+	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
+	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
+	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
+	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
+	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
+	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
+	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
+	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
+	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
+	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
+	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
+	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
+	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
+	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
+	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
+# endif/* MD5_SIZE_VS_SPEED == 1 */
 
-		/* Round 4.  */
-#  if MD5_SIZE_VS_SPEED == 1
-		for (i = 0; i < 4; i++) {
-			OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
-			OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
-			OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
-			OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
-		}
-#  else
-		OP(FI, A, B, C, D, 0, 6, 0xf4292244);
-		OP(FI, D, A, B, C, 7, 10, 0x432aff97);
-		OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
-		OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
-		OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
-		OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
-		OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
-		OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
-		OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
-		OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
-		OP(FI, C, D, A, B, 6, 15, 0xa3014314);
-		OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
-		OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
-		OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
-		OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
-		OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-#  endif	/* MD5_SIZE_VS_SPEED == 1 */
-# endif	/* MD5_SIZE_VS_SPEED > 1 */
+	/* Round 4.  */
+# if MD5_SIZE_VS_SPEED == 1
+	for (i = 0; i < 4; i++) {
+		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
+		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
+		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
+		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
+	}
+# else
+	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
+	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
+	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
+	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
+	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
+	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
+	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
+	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
+	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
+	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
+	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
+	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
+	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
+	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
+	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
+	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
+# endif	/* MD5_SIZE_VS_SPEED == 1 */
+#endif	/* MD5_SIZE_VS_SPEED > 1 */
 
-		/* Add the starting values of the context.  */
-		A += A_save;
-		B += B_save;
-		C += C_save;
-		D += D_save;
+	/* Add the starting values of the context.  */
+	A += A_save;
+	B += B_save;
+	C += C_save;
+	D += D_save;
 
 	/* Put checksum in context given as argument.  */
 	ctx->A = A;
@@ -370,31 +360,26 @@
  * with chunks of data that are 4-byte aligned and a multiple of 64 bytes.
  * This function's internal buffer remembers previous data until it has 64
  * bytes worth to pass on.  Call md5_end() to flush this buffer. */
-
 void FAST_FUNC md5_hash(const void *buffer, size_t len, md5_ctx_t *ctx)
 {
-	char *buf=(char *)buffer;
+	char *buf = (char *)buffer;
 
 	/* RFC 1321 specifies the possible length of the file up to 2^64 bits,
 	 * Here we only track the number of bytes.  */
-
 	ctx->total += len;
 
-	// Process all input.
-
+	/* Process all input. */
 	while (len) {
 		unsigned i = 64 - ctx->buflen;
 
-		// Copy data into aligned buffer.
-
+		/* Copy data into aligned buffer. */
 		if (i > len) i = len;
 		memcpy(ctx->buffer + ctx->buflen, buf, i);
 		len -= i;
 		ctx->buflen += i;
 		buf += i;
 
-		// When buffer fills up, process it.
-
+		/* When buffer fills up, process it. */
 		if (ctx->buflen == 64) {
 			md5_hash_block(ctx->buffer, ctx);
 			ctx->buflen = 0;
@@ -410,23 +395,25 @@
  * IMPORTANT: On some systems it is required that RESBUF is correctly
  * aligned for a 32 bits value.
  */
-void* FAST_FUNC md5_end(void *resbuf, md5_ctx_t *ctx)
+void FAST_FUNC md5_end(void *resbuf, md5_ctx_t *ctx)
 {
 	char *buf = ctx->buffer;
 	int i;
 
 	/* Pad data to block size.  */
-
 	buf[ctx->buflen++] = 0x80;
 	memset(buf + ctx->buflen, 0, 128 - ctx->buflen);
 
 	/* Put the 64-bit file length in *bits* at the end of the buffer.  */
 	ctx->total <<= 3;
-	if (ctx->buflen > 56) buf += 64;
-	for (i = 0; i < 8; i++)  buf[56 + i] = ctx->total >> (i*8);
+	if (ctx->buflen > 56)
+		buf += 64;
+	for (i = 0; i < 8; i++)
+		buf[56 + i] = ctx->total >> (i*8);
 
 	/* Process last bytes.  */
-	if (buf != ctx->buffer) md5_hash_block(ctx->buffer, ctx);
+	if (buf != ctx->buffer)
+		md5_hash_block(ctx->buffer, ctx);
 	md5_hash_block(buf, ctx);
 
 	/* Put result from CTX in first 16 bytes following RESBUF.  The result is
@@ -440,7 +427,4 @@
 	((uint32_t *) resbuf)[1] = SWAP_LE32(ctx->B);
 	((uint32_t *) resbuf)[2] = SWAP_LE32(ctx->C);
 	((uint32_t *) resbuf)[3] = SWAP_LE32(ctx->D);
-
-	return resbuf;
 }
-
diff --git a/libbb/md5prime.c b/libbb/md5prime.c
new file mode 100644
index 0000000..7986f4d
--- /dev/null
+++ b/libbb/md5prime.c
@@ -0,0 +1,460 @@
+/* This file is not used by busybox right now.
+ * However, the code here seems to be a tiny bit smaller
+ * than the one in md5.c. Need to investigate which one
+ * is better overall...
+ * Hint: grep for md5prime to find places where you can switch
+ * md5.c/md5prime.c
+ */
+
+/*
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * $FreeBSD: src/lib/libmd/md5c.c,v 1.9.2.1 1999/08/29 14:57:12 peter Exp $
+ *
+ * This code is the same as the code published by RSA Inc.  It has been
+ * edited for clarity and style only.
+ *
+ * ----------------------------------------------------------------------------
+ * The md5_crypt() function was taken from freeBSD's libcrypt and contains
+ * this license:
+ *    "THE BEER-WARE LICENSE" (Revision 42):
+ *     <phk@login.dknet.dk> wrote this file.  As long as you retain this notice you
+ *     can do whatever you want with this stuff. If we meet some day, and you think
+ *     this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
+ *
+ * $FreeBSD: src/lib/libcrypt/crypt.c,v 1.7.2.1 1999/08/29 14:56:33 peter Exp $
+ *
+ * ----------------------------------------------------------------------------
+ * On April 19th, 2001 md5_crypt() was modified to make it reentrant
+ * by Erik Andersen <andersen@uclibc.org>
+ *
+ * June 28, 2001             Manuel Novoa III
+ *
+ * "Un-inlined" code using loops and static const tables in order to
+ * reduce generated code size (on i386 from approx 4k to approx 2.5k).
+ *
+ * June 29, 2001             Manuel Novoa III
+ *
+ * Completely removed static PADDING array.
+ *
+ * Reintroduced the loop unrolling in md5_transform and added the
+ * MD5_SIZE_VS_SPEED option for configurability.  Define below as:
+ *       0    fully unrolled loops
+ *       1    partially unrolled (4 ops per loop)
+ *       2    no unrolling -- introduces the need to swap 4 variables (slow)
+ *       3    no unrolling and all 4 loops merged into one with switch
+ *               in each loop (glacial)
+ * On i386, sizes are roughly (-Os -fno-builtin):
+ *     0: 3k     1: 2.5k     2: 2.2k     3: 2k
+ *
+ * Since SuSv3 does not require crypt_r, modified again August 7, 2002
+ * by Erik Andersen to remove reentrance stuff...
+ */
+
+#include "libbb.h"
+
+/* 1: fastest, 3: smallest */
+#if CONFIG_MD5_SIZE_VS_SPEED < 1
+# define MD5_SIZE_VS_SPEED 1
+#elif CONFIG_MD5_SIZE_VS_SPEED > 3
+# define MD5_SIZE_VS_SPEED 3
+#else
+# define MD5_SIZE_VS_SPEED CONFIG_MD5_SIZE_VS_SPEED
+#endif
+
+#if BB_LITTLE_ENDIAN
+#define memcpy32_cpu2le memcpy
+#define memcpy32_le2cpu memcpy
+#else
+/* Stores each uint32_t of input into output as 4 bytes, least significant
+ * byte first. len is the output size in bytes; assumed a multiple of 4. */
+static void
+memcpy32_cpu2le(unsigned char *output, uint32_t *input, unsigned len)
+{
+	unsigned i, j;
+	for (i = 0, j = 0; j < len; i++, j += 4) {
+		output[j] = input[i];
+		output[j+1] = (input[i] >> 8);
+		output[j+2] = (input[i] >> 16);
+		output[j+3] = (input[i] >> 24);
+	}
+}
+/* Assembles each 4-byte little-endian group of input into one uint32_t of
+ * output. len is the input size in bytes; assumed a multiple of 4. */
+static void
+memcpy32_le2cpu(uint32_t *output, const unsigned char *input, unsigned len)
+{
+	unsigned i, j;
+	for (i = 0, j = 0; j < len; i++, j += 4)
+		output[i] = ((uint32_t)input[j])
+			| (((uint32_t)input[j+1]) << 8)
+			| (((uint32_t)input[j+2]) << 16)
+			| (((uint32_t)input[j+3]) << 24);
+}
+#endif /* BB_LITTLE_ENDIAN */
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+/* rotl32 rotates x left n bits. */
+#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+	(a) += F((b), (c), (d)) + (x) + (uint32_t)(ac); \
+	(a) = rotl32((a), (s)); \
+	(a) += (b); \
+	}
+#define GG(a, b, c, d, x, s, ac) { \
+	(a) += G((b), (c), (d)) + (x) + (uint32_t)(ac); \
+	(a) = rotl32((a), (s)); \
+	(a) += (b); \
+	}
+#define HH(a, b, c, d, x, s, ac) { \
+	(a) += H((b), (c), (d)) + (x) + (uint32_t)(ac); \
+	(a) = rotl32((a), (s)); \
+	(a) += (b); \
+	}
+#define II(a, b, c, d, x, s, ac) { \
+	(a) += I((b), (c), (d)) + (x) + (uint32_t)(ac); \
+	(a) = rotl32((a), (s)); \
+	(a) += (b); \
+	}
+
+/* MD5 basic transformation. Transforms state based on block. */
+static void md5_transform(uint32_t state[4], const unsigned char block[64])
+{
+	uint32_t a, b, c, d, x[16];
+#if MD5_SIZE_VS_SPEED > 1
+	uint32_t temp;
+	const unsigned char *ps;
+
+	static const unsigned char S[] = {
+		7, 12, 17, 22,
+		5, 9, 14, 20,
+		4, 11, 16, 23,
+		6, 10, 15, 21
+	};
+#endif /* MD5_SIZE_VS_SPEED > 1 */
+
+#if MD5_SIZE_VS_SPEED > 0
+	const uint32_t *pc;
+	const unsigned char *pp;
+	int i;
+
+	static const uint32_t C[] = {
+		/* round 1 */
+		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
+		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+		/* round 2 */
+		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
+		0xd62f105d, 0x2441453,  0xd8a1e681, 0xe7d3fbc8,
+		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+		/* round 3 */
+		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
+		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+		/* round 4 */
+		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
+		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+	};
+	static const unsigned char P[] = {
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
+		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
+		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
+		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
+	};
+
+#endif /* MD5_SIZE_VS_SPEED > 0 */
+
+	memcpy32_le2cpu(x, block, 64);
+
+	a = state[0];
+	b = state[1];
+	c = state[2];
+	d = state[3];
+
+#if MD5_SIZE_VS_SPEED > 2
+	pc = C;
+	pp = P;
+	ps = S - 4;
+	for (i = 0; i < 64; i++) {
+		if ((i & 0x0f) == 0) ps += 4;
+		temp = a;
+		switch (i>>4) {
+			case 0:
+				temp += F(b, c, d);
+				break;
+			case 1:
+				temp += G(b, c, d);
+				break;
+			case 2:
+				temp += H(b, c, d);
+				break;
+			case 3:
+				temp += I(b, c, d);
+				break;
+		}
+		temp += x[*pp++] + *pc++;
+		temp = rotl32(temp, ps[i & 3]);
+		temp += b;
+		a = d; d = c; c = b; b = temp;
+	}
+#elif MD5_SIZE_VS_SPEED > 1
+	pc = C;
+	pp = P;
+	ps = S;
+	/* Round 1 */
+	for (i = 0; i < 16; i++) {
+		FF(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
+		temp = d; d = c; c = b; b = a; a = temp;
+	}
+	/* Round 2 */
+	ps += 4;
+	for (; i < 32; i++) {
+		GG(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
+		temp = d; d = c; c = b; b = a; a = temp;
+	}
+	/* Round 3 */
+	ps += 4;
+	for (; i < 48; i++) {
+		HH(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
+		temp = d; d = c; c = b; b = a; a = temp;
+	}
+	/* Round 4 */
+	ps += 4;
+	for (; i < 64; i++) {
+		II(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
+		temp = d; d = c; c = b; b = a; a = temp;
+	}
+#elif MD5_SIZE_VS_SPEED > 0
+	pc = C;
+	pp = P;
+	/* Round 1 */
+	for (i = 0; i < 4; i++) {
+		FF(a, b, c, d, x[*pp],  7, *pc); pp++; pc++;
+		FF(d, a, b, c, x[*pp], 12, *pc); pp++; pc++;
+		FF(c, d, a, b, x[*pp], 17, *pc); pp++; pc++;
+		FF(b, c, d, a, x[*pp], 22, *pc); pp++; pc++;
+	}
+	/* Round 2 */
+	for (i = 0; i < 4; i++) {
+		GG(a, b, c, d, x[*pp],  5, *pc); pp++; pc++;
+		GG(d, a, b, c, x[*pp],  9, *pc); pp++; pc++;
+		GG(c, d, a, b, x[*pp], 14, *pc); pp++; pc++;
+		GG(b, c, d, a, x[*pp], 20, *pc); pp++; pc++;
+	}
+	/* Round 3 */
+	for (i = 0; i < 4; i++) {
+		HH(a, b, c, d, x[*pp],  4, *pc); pp++; pc++;
+		HH(d, a, b, c, x[*pp], 11, *pc); pp++; pc++;
+		HH(c, d, a, b, x[*pp], 16, *pc); pp++; pc++;
+		HH(b, c, d, a, x[*pp], 23, *pc); pp++; pc++;
+	}
+	/* Round 4 */
+	for (i = 0; i < 4; i++) {
+		II(a, b, c, d, x[*pp],  6, *pc); pp++; pc++;
+		II(d, a, b, c, x[*pp], 10, *pc); pp++; pc++;
+		II(c, d, a, b, x[*pp], 15, *pc); pp++; pc++;
+		II(b, c, d, a, x[*pp], 21, *pc); pp++; pc++;
+	}
+#else
+	/* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+	FF(a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+	FF(d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+	FF(c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+	FF(b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+	FF(a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+	FF(d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+	FF(c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+	FF(b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+	FF(a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+	FF(d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+	FF(c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+	FF(b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+	FF(a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+	FF(d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+	FF(c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+	FF(b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+	/* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+	GG(a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+	GG(d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+	GG(c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+	GG(b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+	GG(a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+	GG(d, a, b, c, x[10], S22,  0x2441453); /* 22 */
+	GG(c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+	GG(b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+	GG(a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+	GG(d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+	GG(c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+	GG(b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+	GG(a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+	GG(d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+	GG(c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+	GG(b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+	/* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+	HH(a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+	HH(d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+	HH(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+	HH(b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+	HH(a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+	HH(d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+	HH(c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+	HH(b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+	HH(a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+	HH(d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+	HH(c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+	HH(b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
+	HH(a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+	HH(d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+	HH(c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+	HH(b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+	/* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+	II(a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+	II(d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+	II(c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+	II(b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+	II(a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+	II(d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+	II(c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+	II(b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+	II(a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+	II(d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+	II(c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+	II(b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+	II(a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+	II(d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+	II(c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+	II(b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+#endif
+
+	state[0] += a;
+	state[1] += b;
+	state[2] += c;
+	state[3] += d;
+
+	/* Zeroize sensitive information. */
+	memset(x, 0, sizeof(x));
+}
+
+
+/* Start a new MD5 computation: load the initial state, zero the bit counter. */
+void FAST_FUNC md5_begin(md5_ctx_t *context)
+{
+	/* Magic initialization constants (RFC 1321, section 3.3). */
+	context->state[0] = 0x67452301;
+	context->state[1] = 0xefcdab89;
+	context->state[2] = 0x98badcfe;
+	context->state[3] = 0x10325476;
+	context->count[0] = context->count[1] = 0;
+}
+
+/*
+ * Feed inputLen bytes from buffer into a running MD5 computation.
+ * Complete 64-byte blocks are transformed immediately; a trailing
+ * partial block is kept in context->buffer for the next call.
+ */
+void FAST_FUNC md5_hash(const void *buffer, size_t inputLen, md5_ctx_t *context)
+{
+	size_t i = 0; /* size_t: an unsigned index could wrap before reaching inputLen */
+	unsigned idx, partLen;
+	const unsigned char *input = buffer;
+
+	/* Compute number of bytes mod 64 */
+	idx = (context->count[0] >> 3) & 0x3F;
+
+	/* Update number of bits; detect carry using only the low 32 bits added */
+	context->count[0] += (inputLen << 3);
+	if (context->count[0] < (uint32_t)(inputLen << 3))
+		context->count[1]++;
+	context->count[1] += (inputLen >> 29);
+
+	/* Transform as many times as possible. */
+	partLen = 64 - idx;
+	if (inputLen >= partLen) {
+		memcpy(&context->buffer[idx], input, partLen);
+		md5_transform(context->state, context->buffer);
+		for (i = partLen; i + 63 < inputLen; i += 64)
+			md5_transform(context->state, &input[i]);
+		idx = 0;
+	}
+
+	/* Buffer remaining input */
+	memcpy(&context->buffer[idx], &input[i], inputLen - i);
+}
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation,
+ * writing the message digest.
+ */
+void FAST_FUNC md5_end(void *digest, md5_ctx_t *context)
+{
+	unsigned char pad[64];
+	unsigned char len_bytes[8];
+	unsigned used, fill;
+
+	/* Padding is one 0x80 byte followed by zero bytes. */
+	memset(pad, 0, sizeof(pad));
+	pad[0] = 0x80;
+	/* Serialize the bit count now: the md5_hash() calls below modify it. */
+	memcpy32_cpu2le(len_bytes, context->count, 8);
+	/* Pad so that exactly 8 bytes remain free in the current 64-byte block. */
+	used = (context->count[0] >> 3) & 0x3f;
+	fill = (used >= 56) ? (120 - used) : (56 - used);
+	md5_hash(pad, fill, context);
+	/* The saved pre-padding length completes the final block. */
+	md5_hash(len_bytes, 8, context);
+
+	/* Write the 16 bytes of state out as the digest. */
+	memcpy32_cpu2le(digest, context->state, 16);
+}
diff --git a/libbb/pw_encrypt_md5.c b/libbb/pw_encrypt_md5.c
index b7478aa..b02cbec 100644
--- a/libbb/pw_encrypt_md5.c
+++ b/libbb/pw_encrypt_md5.c
@@ -61,439 +61,14 @@
  * On i386, sizes are roughly (-Os -fno-builtin):
  *     0: 3k     1: 2.5k     2: 2.2k     3: 2k
  *
- *
  * Since SuSv3 does not require crypt_r, modified again August 7, 2002
  * by Erik Andersen to remove reentrance stuff...
  */
 
-/*
- * Valid values are  1 (fastest/largest) to 3 (smallest/slowest).
- */
-#define MD5_SIZE_OVER_SPEED 3
-
-/**********************************************************************/
-
-/* MD5 context. */
-struct MD5Context {
-	uint32_t state[4];	/* state (ABCD) */
-	uint32_t count[2];	/* number of bits, modulo 2^64 (lsb first) */
-	unsigned char buffer[64];	/* input buffer */
-};
-
-static void __md5_Init(struct MD5Context *);
-static void __md5_Update(struct MD5Context *, const unsigned char *, unsigned int);
-static void __md5_Pad(struct MD5Context *);
-static void __md5_Final(unsigned char [16], struct MD5Context *);
-static void __md5_Transform(uint32_t [4], const unsigned char [64]);
-
-
 #define MD5_MAGIC_STR "$1$"
 #define MD5_MAGIC_LEN (sizeof(MD5_MAGIC_STR) - 1)
 static const unsigned char __md5__magic[] = MD5_MAGIC_STR;
 
-
-#ifdef i386
-#define __md5_Encode memcpy
-#define __md5_Decode memcpy
-#else /* i386 */
-
-/*
- * __md5_Encodes input (uint32_t) into output (unsigned char). Assumes len is
- * a multiple of 4.
- */
-static void
-__md5_Encode(unsigned char *output, uint32_t *input, unsigned int len)
-{
-	unsigned int i, j;
-
-	for (i = 0, j = 0; j < len; i++, j += 4) {
-		output[j] = input[i];
-		output[j+1] = (input[i] >> 8);
-		output[j+2] = (input[i] >> 16);
-		output[j+3] = (input[i] >> 24);
-	}
-}
-
-/*
- * __md5_Decodes input (unsigned char) into output (uint32_t). Assumes len is
- * a multiple of 4.
- */
-static void
-__md5_Decode(uint32_t *output, const unsigned char *input, unsigned int len)
-{
-	unsigned int i, j;
-
-	for (i = 0, j = 0; j < len; i++, j += 4)
-		output[i] = ((uint32_t)input[j]) | (((uint32_t)input[j+1]) << 8) |
-		    (((uint32_t)input[j+2]) << 16) | (((uint32_t)input[j+3]) << 24);
-}
-#endif /* i386 */
-
-/* F, G, H and I are basic MD5 functions. */
-#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
-#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-#define I(x, y, z) ((y) ^ ((x) | ~(z)))
-
-/* ROTATE_LEFT rotates x left n bits. */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/*
- * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
- * Rotation is separate from addition to prevent recomputation.
- */
-#define FF(a, b, c, d, x, s, ac) { \
-	(a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \
-	(a) = ROTATE_LEFT((a), (s)); \
-	(a) += (b); \
-	}
-#define GG(a, b, c, d, x, s, ac) { \
-	(a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \
-	(a) = ROTATE_LEFT((a), (s)); \
-	(a) += (b); \
-	}
-#define HH(a, b, c, d, x, s, ac) { \
-	(a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \
-	(a) = ROTATE_LEFT((a), (s)); \
-	(a) += (b); \
-	}
-#define II(a, b, c, d, x, s, ac) { \
-	(a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \
-	(a) = ROTATE_LEFT((a), (s)); \
-	(a) += (b); \
-	}
-
-/* MD5 initialization. Begins an MD5 operation, writing a new context. */
-static void __md5_Init(struct MD5Context *context)
-{
-	context->count[0] = context->count[1] = 0;
-
-	/* Load magic initialization constants.  */
-	context->state[0] = 0x67452301;
-	context->state[1] = 0xefcdab89;
-	context->state[2] = 0x98badcfe;
-	context->state[3] = 0x10325476;
-}
-
-/*
- * MD5 block update operation. Continues an MD5 message-digest
- * operation, processing another message block, and updating the
- * context.
- */
-static void __md5_Update(struct MD5Context *context, const unsigned char *input, unsigned int inputLen)
-{
-	unsigned int i, idx, partLen;
-
-	/* Compute number of bytes mod 64 */
-	idx = (context->count[0] >> 3) & 0x3F;
-
-	/* Update number of bits */
-	context->count[0] += (inputLen << 3);
-	if (context->count[0] < (inputLen << 3))
-		context->count[1]++;
-	context->count[1] += (inputLen >> 29);
-
-	partLen = 64 - idx;
-
-	/* Transform as many times as possible. */
-	if (inputLen >= partLen) {
-		memcpy(&context->buffer[idx], input, partLen);
-		__md5_Transform(context->state, context->buffer);
-
-		for (i = partLen; i + 63 < inputLen; i += 64)
-			__md5_Transform(context->state, &input[i]);
-
-		idx = 0;
-	} else
-		i = 0;
-
-	/* Buffer remaining input */
-	memcpy(&context->buffer[idx], &input[i], inputLen - i);
-}
-
-/*
- * MD5 padding. Adds padding followed by original length.
- */
-static void __md5_Pad(struct MD5Context *context)
-{
-	unsigned char bits[8];
-	unsigned int idx, padLen;
-	unsigned char PADDING[64];
-
-	memset(PADDING, 0, sizeof(PADDING));
-	PADDING[0] = 0x80;
-
-	/* Save number of bits */
-	__md5_Encode(bits, context->count, 8);
-
-	/* Pad out to 56 mod 64. */
-	idx = (context->count[0] >> 3) & 0x3f;
-	padLen = (idx < 56) ? (56 - idx) : (120 - idx);
-	__md5_Update(context, PADDING, padLen);
-
-	/* Append length (before padding) */
-	__md5_Update(context, bits, 8);
-}
-
-/*
- * MD5 finalization. Ends an MD5 message-digest operation, writing the
- * the message digest and zeroizing the context.
- */
-static void __md5_Final(unsigned char digest[16], struct MD5Context *context)
-{
-	/* Do padding. */
-	__md5_Pad(context);
-
-	/* Store state in digest */
-	__md5_Encode(digest, context->state, 16);
-
-	/* Zeroize sensitive information. */
-	memset(context, 0, sizeof(*context));
-}
-
-/* MD5 basic transformation. Transforms state based on block. */
-static void __md5_Transform(uint32_t state[4], const unsigned char block[64])
-{
-	uint32_t a, b, c, d, x[16];
-#if MD5_SIZE_OVER_SPEED > 1
-	uint32_t temp;
-	const unsigned char *ps;
-
-	static const unsigned char S[] = {
-		7, 12, 17, 22,
-		5, 9, 14, 20,
-		4, 11, 16, 23,
-		6, 10, 15, 21
-	};
-#endif /* MD5_SIZE_OVER_SPEED > 1 */
-
-#if MD5_SIZE_OVER_SPEED > 0
-	const uint32_t *pc;
-	const unsigned char *pp;
-	int i;
-
-	static const uint32_t C[] = {
-								/* round 1 */
-		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
-		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
-		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
-		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
-								/* round 2 */
-		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
-		0xd62f105d, 0x2441453,  0xd8a1e681, 0xe7d3fbc8,
-		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
-		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
-								/* round 3 */
-		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
-		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
-		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
-		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
-								/* round 4 */
-		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
-		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
-		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
-		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
-	};
-
-	static const unsigned char P[] = {
-		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
-		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
-		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
-		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
-	};
-
-#endif /* MD5_SIZE_OVER_SPEED > 0 */
-
-	__md5_Decode(x, block, 64);
-
-	a = state[0]; b = state[1]; c = state[2]; d = state[3];
-
-#if MD5_SIZE_OVER_SPEED > 2
-	pc = C; pp = P; ps = S - 4;
-
-	for (i = 0; i < 64; i++) {
-		if ((i & 0x0f) == 0) ps += 4;
-		temp = a;
-		switch (i>>4) {
-			case 0:
-				temp += F(b, c, d);
-				break;
-			case 1:
-				temp += G(b, c, d);
-				break;
-			case 2:
-				temp += H(b, c, d);
-				break;
-			case 3:
-				temp += I(b, c, d);
-				break;
-		}
-		temp += x[*pp++] + *pc++;
-		temp = ROTATE_LEFT(temp, ps[i & 3]);
-		temp += b;
-		a = d; d = c; c = b; b = temp;
-	}
-#elif MD5_SIZE_OVER_SPEED > 1
-	pc = C; pp = P; ps = S;
-
-	/* Round 1 */
-	for (i = 0; i < 16; i++) {
-		FF(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
-		temp = d; d = c; c = b; b = a; a = temp;
-	}
-
-	/* Round 2 */
-	ps += 4;
-	for (; i < 32; i++) {
-		GG(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
-		temp = d; d = c; c = b; b = a; a = temp;
-	}
-	/* Round 3 */
-	ps += 4;
-	for (; i < 48; i++) {
-		HH(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
-		temp = d; d = c; c = b; b = a; a = temp;
-	}
-
-	/* Round 4 */
-	ps += 4;
-	for (; i < 64; i++) {
-		II(a, b, c, d, x[*pp], ps[i & 0x3], *pc); pp++; pc++;
-		temp = d; d = c; c = b; b = a; a = temp;
-	}
-#elif MD5_SIZE_OVER_SPEED > 0
-	pc = C; pp = P;
-
-	/* Round 1 */
-	for (i = 0; i < 4; i++) {
-		FF(a, b, c, d, x[*pp],  7, *pc); pp++; pc++;
-		FF(d, a, b, c, x[*pp], 12, *pc); pp++; pc++;
-		FF(c, d, a, b, x[*pp], 17, *pc); pp++; pc++;
-		FF(b, c, d, a, x[*pp], 22, *pc); pp++; pc++;
-	}
-
-	/* Round 2 */
-	for (i = 0; i < 4; i++) {
-		GG(a, b, c, d, x[*pp],  5, *pc); pp++; pc++;
-		GG(d, a, b, c, x[*pp],  9, *pc); pp++; pc++;
-		GG(c, d, a, b, x[*pp], 14, *pc); pp++; pc++;
-		GG(b, c, d, a, x[*pp], 20, *pc); pp++; pc++;
-	}
-	/* Round 3 */
-	for (i = 0; i < 4; i++) {
-		HH(a, b, c, d, x[*pp],  4, *pc); pp++; pc++;
-		HH(d, a, b, c, x[*pp], 11, *pc); pp++; pc++;
-		HH(c, d, a, b, x[*pp], 16, *pc); pp++; pc++;
-		HH(b, c, d, a, x[*pp], 23, *pc); pp++; pc++;
-	}
-
-	/* Round 4 */
-	for (i = 0; i < 4; i++) {
-		II(a, b, c, d, x[*pp],  6, *pc); pp++; pc++;
-		II(d, a, b, c, x[*pp], 10, *pc); pp++; pc++;
-		II(c, d, a, b, x[*pp], 15, *pc); pp++; pc++;
-		II(b, c, d, a, x[*pp], 21, *pc); pp++; pc++;
-	}
-#else
-	/* Round 1 */
-#define S11 7
-#define S12 12
-#define S13 17
-#define S14 22
-	FF(a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
-	FF(d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
-	FF(c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
-	FF(b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
-	FF(a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
-	FF(d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
-	FF(c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
-	FF(b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
-	FF(a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
-	FF(d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
-	FF(c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
-	FF(b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
-	FF(a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
-	FF(d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
-	FF(c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
-	FF(b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
-
-	/* Round 2 */
-#define S21 5
-#define S22 9
-#define S23 14
-#define S24 20
-	GG(a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
-	GG(d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
-	GG(c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
-	GG(b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
-	GG(a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
-	GG(d, a, b, c, x[10], S22,  0x2441453); /* 22 */
-	GG(c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
-	GG(b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
-	GG(a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
-	GG(d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
-	GG(c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
-	GG(b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
-	GG(a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
-	GG(d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
-	GG(c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
-	GG(b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
-
-	/* Round 3 */
-#define S31 4
-#define S32 11
-#define S33 16
-#define S34 23
-	HH(a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
-	HH(d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
-	HH(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
-	HH(b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
-	HH(a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
-	HH(d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
-	HH(c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
-	HH(b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
-	HH(a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
-	HH(d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
-	HH(c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
-	HH(b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
-	HH(a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
-	HH(d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
-	HH(c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
-	HH(b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
-
-	/* Round 4 */
-#define S41 6
-#define S42 10
-#define S43 15
-#define S44 21
-	II(a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
-	II(d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
-	II(c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
-	II(b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
-	II(a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
-	II(d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
-	II(c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
-	II(b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
-	II(a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
-	II(d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
-	II(c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
-	II(b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
-	II(a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
-	II(d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
-	II(c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
-	II(b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
-#endif
-
-	state[0] += a;
-	state[1] += b;
-	state[2] += c;
-	state[3] += d;
-
-	/* Zeroize sensitive information. */
-	memset(x, 0, sizeof(x));
-}
-
-
 /*
  * UNIX password
  *
@@ -502,94 +77,77 @@
 #define MD5_OUT_BUFSIZE 36
 static char *
 NOINLINE
-md5_crypt(char passwd[MD5_OUT_BUFSIZE], const unsigned char *pw, const unsigned char *salt)
+md5_crypt(char result[MD5_OUT_BUFSIZE], const unsigned char *pw, const unsigned char *salt)
 {
-	const unsigned char *sp, *ep;
 	char *p;
-	unsigned char final[17];	/* final[16] exists only to aid in looping */
+	unsigned char final[17]; /* final[16] exists only to aid in looping */
 	int sl, pl, i, pw_len;
-	struct MD5Context ctx, ctx1;
+	md5_ctx_t ctx, ctx1;
+
+	/* NB: in busybox, "$1$" in salt is always present */
 
 	/* Refine the Salt first */
-	sp = salt;
 
-// always true for bbox
-//	/* If it starts with the magic string, then skip that */
-//	if (!strncmp(sp, __md5__magic, MD5_MAGIC_LEN))
-		sp += MD5_MAGIC_LEN;
+	/* Get the length of the salt including "$1$" */
+	sl = 3;
+	while (salt[sl] && salt[sl] != '$' && sl < (3 + 8))
+		sl++;
 
-	/* It stops at the first '$', max 8 chars */
-	for (ep = sp; *ep && *ep != '$' && ep < (sp+8); ep++)
-		continue;
-
-	/* get the length of the true salt */
-	sl = ep - sp;
-
-	__md5_Init(&ctx);
-
-	/* The password first, since that is what is most unknown */
+	/* Hash the password first, since that is what is most unknown */
+	md5_begin(&ctx);
 	pw_len = strlen((char*)pw);
-	__md5_Update(&ctx, pw, pw_len);
+	md5_hash(pw, pw_len, &ctx);
 
-	/* Then our magic string */
-	__md5_Update(&ctx, __md5__magic, MD5_MAGIC_LEN);
+	/* Then the salt including "$1$" */
+	md5_hash(salt, sl, &ctx);
 
-	/* Then the raw salt */
-	__md5_Update(&ctx, sp, sl);
+	/* Copy salt to result; skip "$1$" */
+	memcpy(result, salt, sl);
+	result[sl] = '$';
+	salt += 3;
+	sl -= 3;
 
 	/* Then just as many characters of the MD5(pw, salt, pw) */
-	__md5_Init(&ctx1);
-	__md5_Update(&ctx1, pw, pw_len);
-	__md5_Update(&ctx1, sp, sl);
-	__md5_Update(&ctx1, pw, pw_len);
-	__md5_Final(final, &ctx1);
+	md5_begin(&ctx1);
+	md5_hash(pw, pw_len, &ctx1);
+	md5_hash(salt, sl, &ctx1);
+	md5_hash(pw, pw_len, &ctx1);
+	md5_end(final, &ctx1);
 	for (pl = pw_len; pl > 0; pl -= 16)
-		__md5_Update(&ctx, final, pl > 16 ? 16 : pl);
-
-	/* Don't leave anything around in vm they could use. */
-//TODO: the above comment seems to be wrong. final is used later.
-	memset(final, 0, sizeof(final));
+		md5_hash(final, pl > 16 ? 16 : pl, &ctx);
 
 	/* Then something really weird... */
+	memset(final, 0, sizeof(final));
 	for (i = pw_len; i; i >>= 1) {
-		__md5_Update(&ctx, ((i & 1) ? final : (const unsigned char *) pw), 1);
+		md5_hash(((i & 1) ? final : (const unsigned char *) pw), 1, &ctx);
 	}
+	md5_end(final, &ctx);
 
-	/* Now make the output string */
-	passwd[0] = '$';
-	passwd[1] = '1';
-	passwd[2] = '$';
-	strncpy(passwd + 3, (char*)sp, sl);
-	passwd[sl + 3] = '$';
-
-	__md5_Final(final, &ctx);
-
-	/*
-	 * and now, just to make sure things don't run too fast
+	/* And now, just to make sure things don't run too fast.
 	 * On a 60 Mhz Pentium this takes 34 msec, so you would
 	 * need 30 seconds to build a 1000 entry dictionary...
 	 */
 	for (i = 0; i < 1000; i++) {
-		__md5_Init(&ctx1);
+		md5_begin(&ctx1);
 		if (i & 1)
-			__md5_Update(&ctx1, pw, pw_len);
+			md5_hash(pw, pw_len, &ctx1);
 		else
-			__md5_Update(&ctx1, final, 16);
+			md5_hash(final, 16, &ctx1);
 
 		if (i % 3)
-			__md5_Update(&ctx1, sp, sl);
+			md5_hash(salt, sl, &ctx1);
 
 		if (i % 7)
-			__md5_Update(&ctx1, pw, pw_len);
+			md5_hash(pw, pw_len, &ctx1);
 
 		if (i & 1)
-			__md5_Update(&ctx1, final, 16);
+			md5_hash(final, 16, &ctx1);
 		else
-			__md5_Update(&ctx1, pw, pw_len);
-		__md5_Final(final, &ctx1);
+			md5_hash(pw, pw_len, &ctx1);
+		md5_end(final, &ctx1);
 	}
 
-	p = passwd + sl + 4; /* 12 bytes max (sl is up to 8 bytes) */
+	p = result + sl + 4; /* 12 bytes max (sl is up to 8 bytes) */
 
 	/* Add 5*4+2 = 22 bytes of hash, + NUL byte. */
 	final[16] = final[5];
@@ -603,36 +161,7 @@
 	/* Don't leave anything around in vm they could use. */
 	memset(final, 0, sizeof(final));
 
-	return passwd;
+	return result;
 }
-
-#undef MD5_SIZE_OVER_SPEED
 #undef MD5_MAGIC_STR
 #undef MD5_MAGIC_LEN
-#undef __md5_Encode
-#undef __md5_Decode
-#undef F
-#undef G
-#undef H
-#undef I
-#undef ROTATE_LEFT
-#undef FF
-#undef GG
-#undef HH
-#undef II
-#undef S11
-#undef S12
-#undef S13
-#undef S14
-#undef S21
-#undef S22
-#undef S23
-#undef S24
-#undef S31
-#undef S32
-#undef S33
-#undef S34
-#undef S41
-#undef S42
-#undef S43
-#undef S44
diff --git a/libbb/sha1.c b/libbb/sha1.c
index fa468a2..76d5c8f 100644
--- a/libbb/sha1.c
+++ b/libbb/sha1.c
@@ -546,7 +546,7 @@
 }
 
 
-void* FAST_FUNC sha1_end(void *resbuf, sha1_ctx_t *ctx)
+void FAST_FUNC sha1_end(void *resbuf, sha1_ctx_t *ctx)
 {
 	/* SHA1 Final padding and digest calculation  */
 #if BB_BIG_ENDIAN
@@ -593,8 +593,6 @@
 	/* misaligned for 32-bit words                                  */
 	for (i = 0; i < SHA1_DIGEST_SIZE; ++i)
 		hval[i] = (unsigned char) (ctx->hash[i >> 2] >> 8 * (~i & 3));
-
-	return resbuf;
 }
 
 
@@ -603,7 +601,7 @@
 
    IMPORTANT: On some systems it is required that RESBUF is correctly
    aligned for a 32 bits value.  */
-void* FAST_FUNC sha256_end(void *resbuf, sha256_ctx_t *ctx)
+void FAST_FUNC sha256_end(void *resbuf, sha256_ctx_t *ctx)
 {
 	/* Take yet unprocessed bytes into account.  */
 	uint32_t bytes = ctx->buflen;
@@ -630,8 +628,6 @@
 	/* Put result from CTX in first 32 bytes following RESBUF.  */
 	for (unsigned i = 0; i < 8; ++i)
 		((uint32_t *) resbuf)[i] = ntohl(ctx->H[i]);
-
-	return resbuf;
 }
 
 /* Process the remaining bytes in the internal buffer and the usual
@@ -639,7 +635,7 @@
 
    IMPORTANT: On some systems it is required that RESBUF is correctly
    aligned for a 64 bits value.  */
-void* FAST_FUNC sha512_end(void *resbuf, sha512_ctx_t *ctx)
+void FAST_FUNC sha512_end(void *resbuf, sha512_ctx_t *ctx)
 {
 	/* Take yet unprocessed bytes into account.  */
 	uint64_t bytes = ctx->buflen;
@@ -666,6 +662,4 @@
 	/* Put result from CTX in first 64 bytes following RESBUF.  */
 	for (unsigned i = 0; i < 8; ++i)
 		((uint64_t *) resbuf)[i] = hton64(ctx->H[i]);
-
-	return resbuf;
 }