Improvements in zstd decode performance

Summary: The idea behind wildcopy is that it can be cheaper to copy more bytes (say, 8) than it is to copy fewer (say, 3). This change takes that further by exploiting two properties (a sketch follows the list):
1. It's almost always OK to copy 16 bytes instead of 8, which means fewer copy instructions and fewer branches.
2. A 16-byte chunk size means that ~90% of wildcopy invocations have a trip count of 1 (see the length distributions below), so branch prediction improves.
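
A minimal standalone sketch of the idea (simplified names, not the
actual zstd code): the loop always moves fixed-size chunks and may
write past dst+length, so callers must leave slack at the end of the
destination buffer.

    #include <stddef.h>
    #include <string.h>

    /* Sketch only: copies in 16-byte chunks and may write up to 15
     * bytes past dst+length (16 if length==0). The caller must
     * guarantee that much slack after the destination. */
    static void wildcopy16(void* dst, const void* src, ptrdiff_t length)
    {
        char* op = (char*)dst;
        const char* ip = (const char*)src;
        char* const oend = op + length;
        do {
            memcpy(op, ip, 16);  /* one fixed-size chunk: 1-2 vector moves */
            op += 16; ip += 16;
        } while (op < oend);     /* trip count is 1 ~90% of the time */
    }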

Measured decode speedup on a Xeon E5-2680v4 is in the 3-5% range.

Measured wildcopy length distributions on silesia.tar:

level   <=8      <=16     <=24     >24
1       78.05%   11.49%    3.52%    6.94%
3       82.14%    8.99%    2.44%    6.43%
6       85.81%    6.51%    2.92%    4.76%
8       83.02%    7.31%    3.64%    6.03%
10      84.13%    6.67%    3.29%    5.91%
15      77.58%    7.55%    5.21%    9.66%
16      80.07%    7.20%    3.98%    8.75%
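
(Buckets like these can be gathered by instrumenting the copy routine
with simple counters; a hypothetical sketch, not part of the patch:)

    #include <stddef.h>

    /* Hypothetical instrumentation for bucketing wildcopy lengths. */
    static unsigned long wildcopy_buckets[4];  /* <=8, <=16, <=24, >24 */

    static void count_wildcopy_length(ptrdiff_t length)
    {
        if (length <= 8)       wildcopy_buckets[0]++;
        else if (length <= 16) wildcopy_buckets[1]++;
        else if (length <= 24) wildcopy_buckets[2]++;
        else                   wildcopy_buckets[3]++;
    }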

Test Plan: benchmark silesia, make check
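
The essential change, condensed from the diff below (a sketch, not the
verbatim patch; it relies on the COPY8/COPY16 macros and BYTE type from
zstd_internal.h): take the 16-byte path only when the copy is
guaranteed to stay at least 16 bytes short of the true end of the
output buffer (oend_g), otherwise fall back to 8-byte chunks.

    static void wildcopy_sketch(BYTE* op, const BYTE* ip,
                                BYTE* const oend, BYTE* const oend_g)
    {
        if (oend < oend_g - 16) {
            /* common case: enough slack before the global output end,
             * so over-copying in 16-byte chunks is safe */
            do { COPY16(op, ip); } while (op < oend);
        } else {
            /* near the end of the output buffer: 8-byte chunks only */
            do { COPY8(op, ip); } while (op < oend);
        }
    }
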
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 585fd6b..74769d1 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -191,9 +191,11 @@
 /*-*******************************************
 *  Shared functions to include for inlining
 *********************************************/
+FORCE_INLINE_ATTR
 static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
 
 #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+FORCE_INLINE_ATTR
 static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
 
@@ -209,7 +211,7 @@
 /*! ZSTD_wildcopy() :
  *  custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */
 MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
-void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
+void ZSTD_wildcopy(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype)
 {
     ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
     const BYTE* ip = (const BYTE*)src;
@@ -217,25 +219,33 @@
     BYTE* const oend = op + length;
 
     assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
+
     if (length < VECLEN || (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN)) {
       do
           COPY8(op, ip)
       while (op < oend);
     }
     else {
-      if ((length & 8) == 0)
-        COPY8(op, ip);
-      do {
-        COPY16(op, ip);
+      if (oend < oend_g-16) {
+        /* common case */
+        do {
+          COPY16(op, ip);
+        }
+        while (op < oend);
       }
-      while (op < oend);
+      else {
+        do {
+            COPY8(op, ip);
+        }
+        while (op < oend);
+      }
     }
 }
 
 /*! ZSTD_wildcopy_16min() :
- *  same semantics as ZSTD_wilcopy() except guaranteed to be able to copy 16 bytes at the start */
+ *  same semantics as ZSTD_wildcopy() except guaranteed to be able to copy 16 bytes at the start */
 MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
-void ZSTD_wildcopy_16min(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
+void ZSTD_wildcopy_16min(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype)
 {
     ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
     const BYTE* ip = (const BYTE*)src;
@@ -246,17 +256,25 @@
     assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
 
     if (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN) {
-      do
-          COPY8(op, ip)
+      do {
+          COPY8(op, ip);
+      }
       while (op < oend);
     }
     else {
-      if ((length & 8) == 0)
-        COPY8(op, ip);
-      do {
-        COPY16(op, ip);
+      if (oend < oend_g-16) {
+        /* common case */
+        do {
+          COPY16(op, ip);
+        }
+        while (op < oend);
       }
-      while (op < oend);
+      else {
+        do {
+            COPY8(op, ip);
+        }
+        while (op < oend);
+      }
     }
 }
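
For context, a hypothetical call site (the real callers live elsewhere
in the decoder and are not part of this hunk; names like dstBuffer and
sequence.matchLength are illustrative only). The key point is that
oend_g is the true end of the writable output buffer, not the end of
the current copy:

    /* Hypothetical call site (illustrative only). */
    BYTE* const ostart = (BYTE*)dstBuffer;
    BYTE* const oend_g = ostart + dstCapacity;  /* end of writable output */
    ZSTD_wildcopy(op, match, oend_g, sequence.matchLength,
                  ZSTD_overlap_src_before_dst);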