needforspeed: added Py_MEMCPY macro (currently tuned for Visual C only),
and used it for string copy operations.  This gives a 20% speedup on some
string benchmarks.
diff --git a/Include/pyport.h b/Include/pyport.h
index 74ce993..47b9f70 100644
--- a/Include/pyport.h
+++ b/Include/pyport.h
@@ -174,6 +174,27 @@
 #define Py_LOCAL_INLINE(type) static type
 #endif
 
+/* Py_MEMCPY(target, source, length) copies length bytes like memcpy; use it
+ * where copied blocks are often very short.  Most platforms optimize large
+ * transfers well, but memcpy's setup cost is often high, so the Visual C
+ * variant copies blocks under 16 bytes "in line" (and yields no value).
+ */
+
+#if defined(_MSC_VER)
+#define Py_MEMCPY(target, source, length) do {				\
+		size_t i_, n_ = (length); /* length evaluated once */	\
+		char *t_ = (void*) (target);				\
+		const char *s_ = (void*) (source);			\
+		if (n_ >= 16) /* 16 bytes and up: call memcpy */	\
+			memcpy(t_, s_, n_);				\
+		else /* under 16 bytes: byte-by-byte loop */		\
+			for (i_ = 0; i_ < n_; i_++)			\
+				t_[i_] = s_[i_];			\
+	} while (0) /* a statement, not an expression */
+#else /* any other compiler: Py_MEMCPY is plain memcpy */
+#define Py_MEMCPY memcpy
+#endif
+
 #include <stdlib.h>
 
 #include <math.h> /* Moved here from the math section, before extern "C" */
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 0531aed..8c39cfe 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -357,15 +357,8 @@
         Py_UNICODE_ISDIGIT(ch) || \
         Py_UNICODE_ISNUMERIC(ch))
 
-/* memcpy has a considerable setup overhead on many platforms; use a
-   loop for short strings (the "16" below is pretty arbitary) */
-#define Py_UNICODE_COPY(target, source, length) do\
-    {Py_ssize_t i_; Py_UNICODE *t_ = (target); const Py_UNICODE *s_ = (source);\
-      if (length > 16)\
-        memcpy(t_, s_, (length)*sizeof(Py_UNICODE));\
-      else\
-        for (i_ = 0; i_ < (length); i_++) t_[i_] = s_[i_];\
-    } while (0)
+#define Py_UNICODE_COPY(target, source, length) /* length in Py_UNICODE units; scaled to bytes below */ \
+	Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 
 #define Py_UNICODE_FILL(target, value, length) do\
     {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\