liblog: display valid utf8 characters with 'printable' log format

This started as a change to use mbrtowc() instead of
utf8_character_length() as mbrtowc() does everything that
utf8_character_length() intends to do, but is a libc function. The
change was further intended to add unit tests to ensure that these
functions operate as intended.

It turns out that utf8_character_length() returned an error for the
utf8 characters that I tested, so this also has the side effect of
allowing valid utf8 characters to be printed in the 'printable' log
format, which was the original intention.

Also, print the binary data as hex instead of octal, since it is a
more suitable choice.

Test: new unit tests, existing unit tests, logcat -v printable
Change-Id: I4cc95aee81519411ef47892ca74eb31117c972d2
diff --git a/liblog/logprint.cpp b/liblog/logprint.cpp
index cec5d96..6b5ea4c 100644
--- a/liblog/logprint.cpp
+++ b/liblog/logprint.cpp
@@ -33,6 +33,7 @@
 #include <string.h>
 #include <sys/param.h>
 #include <sys/types.h>
+#include <wchar.h>
 
 #include <cutils/list.h>
 #include <log/log.h>
@@ -1134,66 +1135,13 @@
 }
 
 /*
- * One utf8 character at a time
- *
- * Returns the length of the utf8 character in the buffer,
- * or -1 if illegal or truncated
- *
- * Open coded from libutils/Unicode.cpp, borrowed from utf8_length(),
- * can not remove from here because of library circular dependencies.
- * Expect one-day utf8_character_length with the same signature could
- * _also_ be part of libutils/Unicode.cpp if its usefullness needs to
- * propagate globally.
- */
-static ssize_t utf8_character_length(const char* src, size_t len) {
-  const char* cur = src;
-  const char first_char = *cur++;
-  static const uint32_t kUnicodeMaxCodepoint = 0x0010FFFF;
-  int32_t mask, to_ignore_mask;
-  size_t num_to_read;
-  uint32_t utf32;
-
-  if ((first_char & 0x80) == 0) { /* ASCII */
-    return first_char ? 1 : -1;
-  }
-
-  /*
-   * (UTF-8's character must not be like 10xxxxxx,
-   *  but 110xxxxx, 1110xxxx, ... or 1111110x)
-   */
-  if ((first_char & 0x40) == 0) {
-    return -1;
-  }
-
-  for (utf32 = 1, num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
-       num_to_read < 5 && (first_char & mask); num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
-    if (num_to_read > len) {
-      return -1;
-    }
-    if ((*cur & 0xC0) != 0x80) { /* can not be 10xxxxxx? */
-      return -1;
-    }
-    utf32 = (utf32 << 6) + (*cur++ & 0b00111111);
-  }
-  /* "first_char" must be (110xxxxx - 11110xxx) */
-  if (num_to_read >= 5) {
-    return -1;
-  }
-  to_ignore_mask |= mask;
-  utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
-  if (utf32 > kUnicodeMaxCodepoint) {
-    return -1;
-  }
-  return num_to_read;
-}
-
-/*
  * Convert to printable from message to p buffer, return string length. If p is
  * NULL, do not copy, but still return the expected string length.
  */
-static size_t convertPrintable(char* p, const char* message, size_t messageLen) {
+size_t convertPrintable(char* p, const char* message, size_t messageLen) {
   char* begin = p;
   bool print = p != NULL;
+  mbstate_t mb_state = {};
 
   while (messageLen) {
     char buf[6];
@@ -1201,11 +1149,10 @@
     if ((size_t)len > messageLen) {
       len = messageLen;
     }
-    len = utf8_character_length(message, len);
+    len = mbrtowc(nullptr, message, len, &mb_state);
 
     if (len < 0) {
-      snprintf(buf, sizeof(buf), ((messageLen > 1) && isdigit(message[1])) ? "\\%03o" : "\\%o",
-               *message & 0377);
+      snprintf(buf, sizeof(buf), "\\x%02X", static_cast<unsigned char>(*message));
       len = 1;
     } else {
       buf[0] = '\0';
@@ -1225,7 +1172,7 @@
         } else if (*message == '\\') {
           strcpy(buf, "\\\\");
         } else if ((*message < ' ') || (*message & 0x80)) {
-          snprintf(buf, sizeof(buf), "\\%o", *message & 0377);
+          snprintf(buf, sizeof(buf), "\\x%02X", static_cast<unsigned char>(*message));
         }
       }
       if (!buf[0]) {
diff --git a/liblog/tests/Android.bp b/liblog/tests/Android.bp
index 50755ce..d9d1a21 100644
--- a/liblog/tests/Android.bp
+++ b/liblog/tests/Android.bp
@@ -62,6 +62,7 @@
         "log_system_test.cpp",
         "log_time_test.cpp",
         "log_wrap_test.cpp",
+        "logprint_test.cpp",
     ],
     shared_libs: [
         "libcutils",
diff --git a/liblog/tests/logprint_test.cpp b/liblog/tests/logprint_test.cpp
new file mode 100644
index 0000000..7ca02ac
--- /dev/null
+++ b/liblog/tests/logprint_test.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+size_t convertPrintable(char* p, const char* message, size_t messageLen);
+
+TEST(liblog, convertPrintable_ascii) {
+  auto input = "easy string, output same";
+  auto output_size = convertPrintable(nullptr, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(input));
+
+  char output[output_size];
+
+  output_size = convertPrintable(output, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(input));
+  EXPECT_STREQ(input, output);
+}
+
+TEST(liblog, convertPrintable_escapes) {
+  // Note that \t is not escaped.
+  auto input = "escape\a\b\t\v\f\r\\";
+  auto expected_output = "escape\\a\\b\t\\v\\f\\r\\\\";
+  auto output_size = convertPrintable(nullptr, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+
+  char output[output_size];
+
+  output_size = convertPrintable(output, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+  EXPECT_STREQ(expected_output, output);
+}
+
+TEST(liblog, convertPrintable_validutf8) {
+  auto input = u8"¢ह€𐍈";
+  auto output_size = convertPrintable(nullptr, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(input));
+
+  char output[output_size];
+
+  output_size = convertPrintable(output, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(input));
+  EXPECT_STREQ(input, output);
+}
+
+TEST(liblog, convertPrintable_invalidutf8) {
+  auto input = "\x80\xC2\x01\xE0\xA4\x06\xE0\x06\xF0\x90\x8D\x06\xF0\x90\x06\xF0\x0E";
+  auto expected_output =
+      "\\x80\\xC2\\x01\\xE0\\xA4\\x06\\xE0\\x06\\xF0\\x90\\x8D\\x06\\xF0\\x90\\x06\\xF0\\x0E";
+  auto output_size = convertPrintable(nullptr, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+
+  char output[output_size];
+
+  output_size = convertPrintable(output, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+  EXPECT_STREQ(expected_output, output);
+}
+
+TEST(liblog, convertPrintable_mixed) {
+  auto input =
+      u8"\x80\xC2¢ह€𐍈\x01\xE0\xA4\x06¢ह€𐍈\xE0\x06\a\b\xF0\x90¢ह€𐍈\x8D\x06\xF0\t\t\x90\x06\xF0\x0E";
+  auto expected_output =
+      u8"\\x80\\xC2¢ह€𐍈\\x01\\xE0\\xA4\\x06¢ह€𐍈\\xE0\\x06\\a\\b\\xF0\\x90¢ह€𐍈\\x8D\\x06\\xF0\t\t"
+      u8"\\x90\\x06\\xF0\\x0E";
+  auto output_size = convertPrintable(nullptr, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+
+  char output[output_size];
+
+  output_size = convertPrintable(output, input, strlen(input));
+  EXPECT_EQ(output_size, strlen(expected_output));
+  EXPECT_STREQ(expected_output, output);
+}