Correct ctype(3) functions with NLS on NetBSD

Summary:
The setlocale(3) function reloads the ctype(3) arrays from
external files. This happens behind the scenes in the internals
of libc (citrus library, runes functions etc).

ctype(3) functions like isspace(3) can be provided in two
variations on NetBSD: inlined or via a global symbol in libc:

```
#if defined(_NETBSD_SOURCE) && !defined(_CTYPE_NOINLINE) && \
    !defined(__cplusplus)
#include <sys/ctype_inline.h>
#else
#include <sys/ctype_bits.h>
#endif
```

The in-lined versions are de-facto array lookup operations.

```
#define isspace(c)      ((int)((_ctype_tab_ + 1)[(c)] & _CTYPE_S))
```

After calling setlocale(3) the ctype(3) arrays (_ctype_tab_,
_toupper_tab_, _tolower_tab_) are reloaded behind the scenes
and they are required to be marked as initialized.

Mark them as initialized inside the common setlocale(3) interceptor.

Each array has 257 elements: 0..255 + 1 (EOF).

This corrects errors on NetBSD/amd64 in applications
prebuilt with MSan.

Sponsored by <The NetBSD Foundation>

Reviewers: vitalybuka, dvyukov, joerg

Reviewed By: vitalybuka

Subscribers: llvm-commits, kubamracek, #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D42020

llvm-svn: 326008
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 82e4d52..7768041 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -110,6 +110,9 @@
 #define times __times13
 #define wait3 __wait350
 #define wait4 __wait450
+extern const unsigned short *_ctype_tab_;  // ctype(3) class-bit table
+extern const short *_toupper_tab_;  // presumably the toupper(3) table
+extern const short *_tolower_tab_;  // presumably the tolower(3) table
 #endif
 
 // Platform-specific options.
@@ -3186,13 +3189,30 @@
 #endif
 
 #if SANITIZER_INTERCEPT_SETLOCALE
+// Reports the ctype(3) lookup tables to the tool as written memory, so that
+// they count as initialized once setlocale(3) has reloaded them. Compiles to
+// an empty function unless SANITIZER_NETBSD is set.
+static void unpoison_ctype_arrays(void *ctx) {
+#if SANITIZER_NETBSD
+  // Each table has 257 entries: the 256 unsigned char values plus one for EOF.
+  enum { kCtypeTableBytes = 257 * sizeof(short) };
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, _ctype_tab_, kCtypeTableBytes);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, _toupper_tab_, kCtypeTableBytes);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, _tolower_tab_, kCtypeTableBytes);
+#endif
+}
+
 INTERCEPTOR(char *, setlocale, int category, char *locale) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, setlocale, category, locale);
   if (locale)
     COMMON_INTERCEPTOR_READ_RANGE(ctx, locale, REAL(strlen)(locale) + 1);
   char *res = REAL(setlocale)(category, locale);
-  if (res) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, res, REAL(strlen)(res) + 1);
+  if (res) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, res, REAL(strlen)(res) + 1);
+    // The ctype(3) tables may have been swapped out by the call above.
+    unpoison_ctype_arrays(ctx);
+  }
   return res;
 }
 
diff --git a/compiler-rt/test/sanitizer_common/TestCases/ctype.c b/compiler-rt/test/sanitizer_common/TestCases/ctype.c
new file mode 100644
index 0000000..37e0af8
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/ctype.c
@@ -0,0 +1,70 @@
+// RUN: %clang %s -o %t && %run %t 2>&1 | FileCheck %s
+
+#include <ctype.h>
+#include <limits.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Passes c to every ctype(3) classification and conversion routine and
+// returns how many of the calls produced a non-zero result.
+static size_t count_ctype_hits(int c) {
+  size_t hits = 0;
+
+  hits += !!isalpha(c);
+  hits += !!isascii(c);
+  hits += !!isblank(c);
+  hits += !!iscntrl(c);
+  hits += !!isdigit(c);
+  hits += !!isgraph(c);
+  hits += !!islower(c);
+  hits += !!isprint(c);
+  hits += !!ispunct(c);
+  hits += !!isspace(c);
+  hits += !!isupper(c);
+  hits += !!isxdigit(c);
+  hits += !!isalnum(c);
+
+  hits += !!tolower(c);
+  hits += !!toupper(c);
+
+  return hits;
+}
+
+// Exercises the ctype(3) routines on every unsigned char value (0 through
+// UCHAR_MAX inclusive) and on EOF. The results are discarded; the calls exist
+// so that a sanitizer gets to observe the lookups they perform.
+void check_ctype(void) {
+  volatile size_t i = 0; /* a dummy variable to prevent optimizing code out */
+  int c; /* int, not unsigned char, so the loop can reach UCHAR_MAX */
+
+  for (c = 0; c <= UCHAR_MAX; c++)
+    i += count_ctype_hits(c);
+
+  i += count_ctype_hits(EOF);
+}
+
+// Runs check_ctype() in the startup locale and again after each setlocale(3)
+// call. The setlocale(3) return values are not inspected.
+int main(int argc, char **argv) {
+  check_ctype();
+
+  setlocale(LC_ALL, "");
+
+  check_ctype();
+
+  setlocale(LC_ALL, "en_US.UTF-8");
+
+  check_ctype();
+
+  setlocale(LC_CTYPE, "pl_PL.UTF-8");
+
+  check_ctype();
+
+  printf("OK\n");
+
+  // CHECK: OK
+
+  return 0;
+}