Here's a quick cleanup of md5sum. Executive summary: smaller and faster.

On my machine, for a 2.2 GB file of random bytes, the timings with warm
cache are:

  toybox before:      11.4 seconds
  toybox after:        8.3 seconds
  GNU md5sum:          3.9 seconds
  openssl dgst -md5:   3.5 seconds

This is clearly better than before (3x openssl), but still slow (2x
openssl). I suspect there is more low-hanging fruit to be had by
eliminating the memcpy in hash_update (maybe not too much - hash_update
accounts for about 4% of total runtime versus 92% for md5_transform
according to perf - but this would also help sha1sum).

make bloatcheck on x86_64 gcc 4.8.2 -Os:

  name                 old     new   delta
  -----------------------------------------------------------------------
  md5rot                 0      64      64
  md5_transform        365     223    -142
  -----------------------------------------------------------------------
                                       -78 total

Rationale for the changes:

Move definition of 'rol' up so it can be used in md5_transform. This is
purely cosmetic; it expands to exactly the same code.

Put rotation counts in a lookup table instead of calculating them on the
fly. This is mostly a wash size-wise, +5 bytes total, but worthwhile for
readability and speed.

Instead of accessing the state array using a rotating index (the variable
formerly known as 'a'), access the state with constant offsets and rotate
the contents of the array instead. This is the big win - it eliminates all
the crazy memory addressing math inside the loop.

commit: afe951b19c0f8313731e28a5f347304d82e7f7af [log] [tgz]
author: Daniel Verkamp <daniel@drv.nu> Thu May 15 19:05:16 2014 -0500
committer: Daniel Verkamp <daniel@drv.nu> Thu May 15 19:05:16 2014 -0500
tree: bd4251ec4a504a2aeb2c8736dc8907e4aa16767b
parent: 97641f459a3e74045305b4ef9b80bb00bc56a29f [diff]
diff --git a/toys/lsb/md5sum.c b/toys/lsb/md5sum.c
index 3b5571b..ab43e7b 100644
--- a/toys/lsb/md5sum.c
+++ b/toys/lsb/md5sum.c

@@ -46,6 +46,8 @@
   } buffer;
 )
 
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
 // for(i=0; i<64; i++) md5table[i] = abs(sin(i+1))*(1<<32);  But calculating
 // that involves not just floating point but pulling in -lm (and arguing with
 // C about whether 1<<32 is a valid thing to do on 32 bit platforms) so:
@@ -64,44 +66,45 @@
   0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 };
 
+static const uint8_t md5rot[64] = {
+  7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+  5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
+  4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+  6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
+};
+
 // Mix next 64 bytes of data into md5 hash
 
 static void md5_transform(void)
 {
-  unsigned x[4], *b = (unsigned *)TT.buffer.c;
+  unsigned x[4], *b = TT.buffer.i;
   int i;
 
   memcpy(x, TT.state, sizeof(x));
 
   for (i=0; i<64; i++) {
-    unsigned int in, a, rot, temp;
-
-    a = (-i)&3;
+    unsigned int in, temp, swap;
     if (i<16) {
       in = i;
-      rot = 7+(5*(i&3));
-      temp = x[(a+1)&3];
-      temp = (temp & x[(a+2)&3]) | ((~temp) & x[(a+3)&3]);
+      temp = x[1];
+      temp = (temp & x[2]) | ((~temp) & x[3]);
     } else if (i<32) {
       in = (1+(5*i))&15;
-      temp = (i&3)+1;
-      rot = temp*5;
-      if (temp&2) rot--;
-      temp = x[(a+3)&3];
-      temp = (x[(a+1)&3] & temp) | (x[(a+2)&3] & ~temp);
+      temp = x[3];
+      temp = (x[1] & temp) | (x[2] & ~temp);
     } else if (i<48) {
-      in = (5+(3*(i&15)))&15;
-      rot = i&3;
-      rot = 4+(5*rot)+((rot+1)&6);
-      temp = x[(a+1)&3] ^ x[(a+2)&3] ^ x[(a+3)&3];
+      in = (3*i+5)&15;
+      temp = x[1] ^ x[2] ^ x[3];
     } else {
-      in = (7*(i&15))&15;
-      rot = (i&3)+1;
-      rot = (5*rot)+(((rot+2)&2)>>1);
-      temp = x[(a+2)&3] ^ (x[(a+1)&3] | ~x[(a+3)&3]);
+      in = (7*i)&15;
+      temp = x[2] ^ (x[1] | ~x[3]);
     }
-    temp += x[a] + b[in] + md5table[i];
-    x[a] = x[(a+1)&3] + ((temp<<rot) | (temp>>(32-rot)));
+    temp += x[0] + b[in] + md5table[i];
+    swap = x[3];
+    x[3] = x[2];
+    x[2] = x[1];
+    x[1] += rol(temp, md5rot[i]);
+    x[0] = swap;
   }
   for (i=0; i<4; i++) TT.state[i] += x[i];
 }
@@ -109,7 +112,6 @@
 // Mix next 64 bytes of data into sha1 hash.
 
 static const unsigned rconsts[]={0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6};
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
 static void sha1_transform(void)
 {
commit	afe951b19c0f8313731e28a5f347304d82e7f7af	[log] [tgz]
author	Daniel Verkamp <daniel@drv.nu>	Thu May 15 19:05:16 2014 -0500
committer	Daniel Verkamp <daniel@drv.nu>	Thu May 15 19:05:16 2014 -0500
tree	bd4251ec4a504a2aeb2c8736dc8907e4aa16767b
parent	97641f459a3e74045305b4ef9b80bb00bc56a29f [diff]