Pass through non-character codepoints in the UTF-8, UTF-16, UTF-32, and Wide conversion functions.

Unlike malformed byte sequences and unpaired surrogates, non-character codepoints are structurally valid,
so I believe it's better to leave them alone in the conversion functions.
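
For reference, a minimal sketch of the distinction this CL draws. IsValidCodepoint below mirrors
the helper added to string_util_icu.cc; IsNonCharacter is a hypothetical helper shown only to
illustrate what ICU's U_IS_UNICODE_CHAR rejects on top of structural validity (it is not part of
this CL):

  // Structural validity only: excludes surrogates ([0xD800, 0xDFFF]) and
  // values above 0x10FFFF, but lets non-characters pass through.
  inline bool IsValidCodepoint(uint32 code_point) {
    return code_point < 0xD800u ||
           (code_point >= 0xE000u && code_point <= 0x10FFFFu);
  }

  // The 66 non-characters: U+FDD0..U+FDEF plus U+nFFFE/U+nFFFF on every
  // plane. U_IS_UNICODE_CHAR rejects these in addition to the above.
  inline bool IsNonCharacter(uint32 code_point) {
    return (code_point >= 0xFDD0u && code_point <= 0xFDEFu) ||
           (code_point & 0xFFFEu) == 0xFFFEu;
  }

With this CL the conversion functions use only the structural check, so (as exercised by the
updated string_util_unittest cases):

  std::string output;
  // Previously this returned false and dropped U+FFFF from the output;
  // now U+FFFF is passed through and the conversion succeeds.
  bool ok = WideToUTF8(L"\xffffHello", 6, &output);
  // ok == true, output == "\xEF\xBF\xBFHello"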

This CL was triggered by a file_util_unittest failure on Linux/Mac with my upcoming change
to file_util::ReplaceIllegalCharacters (part of http://codereview.chromium.org/126223 ).

In addition, the upper bound for the output length in CodepageToWide was tightened. 
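
To illustrate the tightened bound (a sketch of the reasoning; the actual code is in
CodepageToWide in string_util_icu.cc):

  // Old bound: assumed every input byte could decode to a non-BMP
  // character taking two 2-byte UChars.
  //   size_t uchar_max_length = encoded.length() * 2 + 1;
  // New bound: a non-BMP character needs at least 2 input bytes in any
  // legacy multibyte encoding (BOCU/SCSU aside), so N input bytes can
  // never produce more than N UChars. The +1 presumably leaves room
  // for a terminating NUL.
  size_t uchar_max_length = encoded.length() + 1;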

TEST=pass string_util and file_util unittests
BUG=NONE


Review URL: http://codereview.chromium.org/147038

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@19132 0039d316-1c4b-4281-b951-d872f2087c98


CrOS-Libchrome-Original-Commit: 8df44a01ec210a3e0c04191fb34b392727017a2c
diff --git a/base/file_util_unittest.cc b/base/file_util_unittest.cc
index e0884f7..738bac8 100644
--- a/base/file_util_unittest.cc
+++ b/base/file_util_unittest.cc
@@ -839,11 +839,9 @@
 #if defined(OS_WIN)
   {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
   {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
-  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
 #elif defined(OS_POSIX)
   {L"bad*file?name.jpg", L"bad-file-name.jpg"},
   {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
-  {L"bad\uFFFFfile-name.jpg ", L"bad-file-name.jpg"},
 #endif
   {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
   {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
@@ -851,6 +849,9 @@
   {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
   // Unassigned codepoints are ok.
   {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
+  // Non-characters are not allowed.
+  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
+  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
 };
 
 TEST_F(FileUtilTest, ReplaceIllegalCharactersTest) {
diff --git a/base/string_util.h b/base/string_util.h
index d17e7d7..9a033b4 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -186,6 +186,13 @@
 // do the best it can and put the result in the output buffer. The versions that
 // return strings ignore this error and just return the best conversion
 // possible.
+//
+// Note that only structural validity is checked; non-character codepoints
+// and unassigned codepoints are regarded as valid.
+// TODO(jungshik): Consider replacing an invalid input sequence with
+// the Unicode replacement character or adding a |replacement_char|
+// parameter. Currently, an invalid sequence is skipped in the output,
+// which could be problematic in some situations.
 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output);
 std::string WideToUTF8(const std::wstring& wide);
 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output);
@@ -250,6 +257,13 @@
 // string be 8-bit or UTF8? It contains only characters that are < 256 (in the
 // first case) or characters that use only 8-bits and whose 8-bit
 // representation looks like a UTF-8 string (the second case).
+//
+// Note that IsStringUTF8 checks not only that the input is structurally
+// valid but also that it doesn't contain any non-character codepoint
+// (e.g. U+FFFE). This is done on purpose because all the existing callers
+// want the maximum 'discriminating' power against other encodings. If
+// there's a use case for checking only the structural validity, we'd have
+// to add a new function for that.
 bool IsString8Bit(const std::wstring& str);
 bool IsStringUTF8(const std::string& str);
 bool IsStringWideUTF8(const std::wstring& str);
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index df3ee4f..87731de 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -16,6 +16,14 @@
 
 namespace {
 
+inline bool IsValidCodepoint(uint32 code_point) {
+  // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
+  // codepoints larger than 0x10FFFF (the highest codepoint allowed).
+  // Non-characters and unassigned codepoints are allowed.
+  return code_point < 0xD800u ||
+         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
 // ReadUnicodeCharacter --------------------------------------------------------
 
 // Reads a UTF-8 stream, placing the next code point into the given output
@@ -39,7 +47,7 @@
   (*char_index)--;
 
   // Validate the decoded value.
-  return U_IS_UNICODE_CHAR(code_point);
+  return IsValidCodepoint(code_point);
 }
 
 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.
@@ -62,7 +70,7 @@
     *code_point = src[*char_index];
   }
 
-  return U_IS_UNICODE_CHAR(*code_point);
+  return IsValidCodepoint(*code_point);
 }
 
 #if defined(WCHAR_T_IS_UTF32)
@@ -73,7 +81,7 @@
   *code_point = src[*char_index];
 
   // Validate the value.
-  return U_IS_UNICODE_CHAR(*code_point);
+  return IsValidCodepoint(*code_point);
 }
 #endif  // defined(WCHAR_T_IS_UTF32)
 
@@ -134,10 +142,13 @@
   int32 src_len32 = static_cast<int32>(src_len);
   for (int32 i = 0; i < src_len32; i++) {
     uint32 code_point;
-    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point))
+    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
       WriteUnicodeCharacter(code_point, output);
-    else
+    } else {
+      // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
+      // in place of an invalid codepoint.
       success = false;
+    }
   }
   return success;
 }
@@ -428,8 +439,15 @@
   if (!U_SUCCESS(status))
     return false;
 
-  // The worst case is all the input characters are non-BMP (32-bit) ones.
-  size_t uchar_max_length = encoded.length() * 2 + 1;
+  // Even in the worst case, the length of the UTF-16 output in 2-byte
+  // units is at most the number of bytes in the input: there is no
+  // single-byte encoding in which a character is mapped to a non-BMP
+  // character requiring two 2-byte units.
+  //
+  // Moreover, non-BMP characters in legacy multibyte encodings
+  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
+  // BOCU and SCSU, but we don't care about them.
+  size_t uchar_max_length = encoded.length() + 1;
 
   UChar* uchar_dst;
 #if defined(WCHAR_T_IS_UTF16)
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 1087aea..6f196cc 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -309,8 +309,8 @@
   } convert_cases[] = {
     // Regular UTF-8 input.
     {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
-    // Invalid Unicode code point.
-    {"\xef\xbf\xbfHello", L"Hello", false},
+    // Non-character is passed through.
+    {"\xef\xbf\xbfHello", L"\xffffHello", true},
     // Truncated UTF-8 sequence.
     {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
     // Truncated off the end.
@@ -319,11 +319,14 @@
     {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
     {"\xed\xb0\x80", L"", false},
-    // Non-BMP character. The result will either be in UTF-16 or UTF-32.
+    // Non-BMP characters. The second is a non-character regarded as valid.
+    // The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
 #elif defined(WCHAR_T_IS_UTF32)
     {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
 #endif
   };
 
@@ -367,8 +370,9 @@
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
-    // Invalid Unicode code point.
-    {L"\xffffHello", "Hello", false},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
     // Truncated at the end.
@@ -389,7 +393,7 @@
 #elif defined(WCHAR_T_IS_UTF32)
 // This test is only valid when wchar_t == UTF-32.
 TEST(StringUtilTest, ConvertUTF32ToUTF8) {
-  struct UTF8ToWideCase {
+  struct WideToUTF8Case {
     const wchar_t* utf32;
     const char* utf8;
     bool success;
@@ -398,11 +402,14 @@
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // Invalid Unicode code points.
-    {L"\xffffHello", "Hello", false},
     {L"\xfffffffHello", "Hello", false},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    {L"\xdc01Hello", "Hello", false},
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {