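Vectorise the memcmp template implementation: when both areas are
word-aligned, skip over the equal prefix a word at a time before falling
back to the byte-by-byte loop.  For scenarios requiring a lot of
memcmp-ing (Firefox reftests) this is measurably faster.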


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14901 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/shared/vg_replace_strmem.c b/shared/vg_replace_strmem.c
index 5203306..ab1b1a8 100644
--- a/shared/vg_replace_strmem.c
+++ b/shared/vg_replace_strmem.c
@@ -959,18 +959,33 @@
    int VG_REPLACE_FUNCTION_EZU(20190,soname,fnname)       \
           ( const void *s1V, const void *s2V, SizeT n )  \
    { \
-      int res; \
-      UChar a0; \
-      UChar b0; \
-      const UChar* s1 = s1V; \
-      const UChar* s2 = s2V; \
+      const Addr WS = sizeof(UWord); /* 8 or 4 */ \
+      const Addr WM = WS - 1;        /* 7 or 3 */ \
+      Addr s1A = (Addr)s1V; \
+      Addr s2A = (Addr)s2V; \
+      \
+      if (((s1A | s2A) & WM) == 0) { \
+         /* Both areas are word aligned.  Skip over the */ \
+         /* equal prefix as fast as possible. */ \
+         while (n >= WS) { \
+            UWord w1 = *(UWord*)s1A; \
+            UWord w2 = *(UWord*)s2A; \
+            if (w1 != w2) break; \
+            s1A += WS; \
+            s2A += WS; \
+            n -= WS; \
+         } \
+      } \
+      \
+      const UChar* s1 = (const UChar*) s1A; \
+      const UChar* s2 = (const UChar*) s2A; \
       \
       while (n != 0) { \
-         a0 = s1[0]; \
-         b0 = s2[0]; \
+         UChar a0 = s1[0]; \
+         UChar b0 = s2[0]; \
          s1 += 1; \
          s2 += 1; \
-         res = ((int)a0) - ((int)b0); \
+         int res = ((int)a0) - ((int)b0); \
          if (res != 0) \
             return res; \
          n -= 1; \