[scudo] PRNG makeover

Summary:
This follows the addition of `GetRandom` with D34412. We remove our
`/dev/urandom` code and use the new function. Additionally, we switch the PRNG
to a slightly faster one. One of the issues with the old code is that it
produced 64 full bits of randomness per "next", but used only 8 of those for
the Salt and discarded the rest. So we add a cached u64 in the PRNG that can
serve up to 8 u8 values before having to call the "next" function again.

During some integration work, I also realized that some very early processes
(like `init`) do not have access to `/dev/urandom` yet. So if the `getrandom`
syscall is not available either, we have to fall back to some other way of
initializing the PRNG.

Now a few words on why XoRoShiRo and not something else. I spent a while
experimenting with various PRNGs on 32- & 64-bit platforms. Some results are
below. LCG 32 & 64 are usually faster but produce only 15 & 31 bits of entropy
respectively, meaning that to get a full 64-bit value, you would need to call
them several times. The simple XorShift is fast and produces 32 bits, but is
mediocre with regard to PRNG test suites; PCG is slower overall; and XoRoShiRo
is faster than XorShift128+ and produces a full 64 bits.

%%%
root@tulip-chiphd:/data # ./randtest.arm
[+] starting xs32...
[?] xs32 duration: 22431833053ns
[+] starting lcg32...
[?] lcg32 duration: 14941402090ns
[+] starting pcg32...
[?] pcg32 duration: 44941973771ns
[+] starting xs128p...
[?] xs128p duration: 48889786981ns
[+] starting lcg64...
[?] lcg64 duration: 33831042391ns
[+] starting xos128p...
[?] xos128p duration: 44850878605ns

root@tulip-chiphd:/data # ./randtest.aarch64
[+] starting xs32...
[?] xs32 duration: 22425151678ns
[+] starting lcg32...
[?] lcg32 duration: 14954255257ns
[+] starting pcg32...
[?] pcg32 duration: 37346265726ns
[+] starting xs128p...
[?] xs128p duration: 22523807219ns
[+] starting lcg64...
[?] lcg64 duration: 26141304679ns
[+] starting xos128p...
[?] xos128p duration: 14937033215ns
%%%

Reviewers: alekseyshl

Reviewed By: alekseyshl

Subscribers: aemerson, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D35221

llvm-svn: 307798
diff --git a/compiler-rt/lib/scudo/scudo_utils.h b/compiler-rt/lib/scudo/scudo_utils.h
index 7198476..6c6c9d8 100644
--- a/compiler-rt/lib/scudo/scudo_utils.h
+++ b/compiler-rt/lib/scudo/scudo_utils.h
@@ -36,23 +36,58 @@
 };
 bool testCPUFeature(CPUFeature feature);
 
-// Tiny PRNG based on https://en.wikipedia.org/wiki/Xorshift#xorshift.2B
-// The state (128 bits) will be stored in thread local storage.
-struct Xorshift128Plus {
+INLINE u64 rotl(const u64 X, int K) {
+  return (X << K) | (X >> (64 - K));
+}
+
+// XoRoShiRo128+ PRNG (http://xoroshiro.di.unimi.it/).
+struct XoRoShiRo128Plus {
  public:
-  void initFromURandom();
-  u64 getNext() {
-    u64 x = State[0];
-    const u64 y = State[1];
-    State[0] = y;
-    x ^= x << 23;
-    State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
-    return State[1] + y;
+  void init() {
+    if (UNLIKELY(!GetRandom(reinterpret_cast<void *>(State), sizeof(State)))) {
+      // Early processes (eg: init) do not have /dev/urandom yet, but we still
+      // have to provide them with some degree of entropy. Not having a secure
+      // seed is not as problematic for them, as they are less likely to be
+      // the target of heap based vulnerabilities exploitation attempts.
+      State[0] = NanoTime();
+      State[1] = 0;
+    }
+    fillCache();
   }
+  u8 getU8() {
+    if (UNLIKELY(isCacheEmpty()))
+      fillCache();
+    const u8 Result = static_cast<u8>(CachedBytes & 0xff);
+    CachedBytes >>= 8;
+    CachedBytesAvailable--;
+    return Result;
+  }
+  u64 getU64() { return next(); }
+
  private:
+  u8 CachedBytesAvailable;
+  u64 CachedBytes;
   u64 State[2];
+  u64 next() {
+    const u64 S0 = State[0];
+    u64 S1 = State[1];
+    const u64 Result = S0 + S1;
+    S1 ^= S0;
+    State[0] = rotl(S0, 55) ^ S1 ^ (S1 << 14);
+    State[1] = rotl(S1, 36);
+    return Result;
+  }
+  bool isCacheEmpty() {
+    return CachedBytesAvailable == 0;
+  }
+  void fillCache() {
+    CachedBytes = next();
+    CachedBytesAvailable = sizeof(CachedBytes);
+  }
 };
 
+typedef XoRoShiRo128Plus ScudoPrng;
+
 }  // namespace __scudo
 
 #endif  // SCUDO_UTILS_H_