// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
brettw@google.comfed55ab2008-08-08 00:29:49 +09004
initial.commit3f4a7322008-07-27 06:49:38 +09005#include "base/string_util.h"
6
7#include <string.h>
8#include <vector>
9
10#include "base/basictypes.h"
11#include "base/logging.h"
12#include "base/singleton.h"
13#include "unicode/ucnv.h"
14#include "unicode/numfmt.h"
15#include "unicode/ustring.h"
16
brettw@google.comfed55ab2008-08-08 00:29:49 +090017namespace {
18
19// ReadUnicodeCharacter --------------------------------------------------------
20
21// Reads a UTF-8 stream, placing the next code point into the given output
22// |*code_point|. |src| represents the entire string to read, and |*char_index|
23// is the character offset within the string to start reading at. |*char_index|
24// will be updated to index the last character read, such that incrementing it
25// (as in a for loop) will take the reader to the next character.
26//
27// Returns true on success. On false, |*code_point| will be invalid.
28bool ReadUnicodeCharacter(const char* src, int32 src_len,
evanm@google.come1581aa2008-08-19 09:31:24 +090029 int32* char_index, uint32* code_point_out) {
30 // U8_NEXT expects to be able to use -1 to signal an error, so we must
31 // use a signed type for code_point. But this function returns false
32 // on error anyway, so code_point_out is unsigned.
33 int32 code_point;
34 U8_NEXT(src, *char_index, src_len, code_point);
35 *code_point_out = static_cast<uint32>(code_point);
brettw@google.comfed55ab2008-08-08 00:29:49 +090036
37 // The ICU macro above moves to the next char, we want to point to the last
38 // char consumed.
39 (*char_index)--;
40
41 // Validate the decoded value.
evanm@google.come1581aa2008-08-19 09:31:24 +090042 return U_IS_UNICODE_CHAR(code_point);
brettw@google.comfed55ab2008-08-08 00:29:49 +090043}
44
brettw@google.come3c034a2008-08-08 03:31:40 +090045// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
brettw@google.comfa499052008-08-08 05:27:57 +090046bool ReadUnicodeCharacter(const char16* src, int32 src_len,
brettw@google.comfed55ab2008-08-08 00:29:49 +090047 int32* char_index, uint32* code_point) {
48 if (U16_IS_SURROGATE(src[*char_index])) {
49 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
50 *char_index + 1 >= src_len ||
51 !U16_IS_TRAIL(src[*char_index + 1])) {
52 // Invalid surrogate pair.
53 return false;
54 }
55
56 // Valid surrogate pair.
57 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],
58 src[*char_index + 1]);
59 (*char_index)++;
60 } else {
61 // Not a surrogate, just one 16-bit word.
62 *code_point = src[*char_index];
63 }
64
65 return U_IS_UNICODE_CHAR(*code_point);
66}
brettw@google.comfa499052008-08-08 05:27:57 +090067
68#if defined(WCHAR_T_IS_UTF32)
brettw@google.come3c034a2008-08-08 03:31:40 +090069// Reads UTF-32 character. The usage is the same as the 8-bit version above.
pinkerton@google.comc5c7a722008-08-08 07:36:01 +090070bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
brettw@google.comfa499052008-08-08 05:27:57 +090071 int32* char_index, uint32* code_point) {
brettw@google.comfed55ab2008-08-08 00:29:49 +090072 // Conversion is easy since the source is 32-bit.
73 *code_point = src[*char_index];
74
75 // Validate the value.
76 return U_IS_UNICODE_CHAR(*code_point);
77}
brettw@google.come3c034a2008-08-08 03:31:40 +090078#endif // defined(WCHAR_T_IS_UTF32)
brettw@google.comfed55ab2008-08-08 00:29:49 +090079
80// WriteUnicodeCharacter -------------------------------------------------------
81
82// Appends a UTF-8 character to the given 8-bit string.
83void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) {
84 if (code_point <= 0x7f) {
85 // Fast path the common case of one byte.
86 output->push_back(code_point);
87 return;
88 }
89
90 // U8_APPEND_UNSAFE can append up to 4 bytes.
91 int32 char_offset = static_cast<int32>(output->length());
92 output->resize(char_offset + U8_MAX_LENGTH);
93
94 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
95
96 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
97 // it will represent the new length of the string.
98 output->resize(char_offset);
99}
100
brettw@google.come3c034a2008-08-08 03:31:40 +0900101// Appends the given code point as a UTF-16 character to the STL string.
brettw@google.comfed55ab2008-08-08 00:29:49 +0900102void WriteUnicodeCharacter(uint32 code_point,
brettw@google.comfa499052008-08-08 05:27:57 +0900103 std::basic_string<char16>* output) {
brettw@google.comfed55ab2008-08-08 00:29:49 +0900104 if (U16_LENGTH(code_point) == 1) {
105 // Thie code point is in the Basic Multilingual Plane (BMP).
brettw@google.comfa499052008-08-08 05:27:57 +0900106 output->push_back(static_cast<char16>(code_point));
brettw@google.comfed55ab2008-08-08 00:29:49 +0900107 } else {
108 // Non-BMP characters use a double-character encoding.
109 int32 char_offset = static_cast<int32>(output->length());
110 output->resize(char_offset + U16_MAX_LENGTH);
111 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
112 }
113}
brettw@google.comfa499052008-08-08 05:27:57 +0900114
115#if defined(WCHAR_T_IS_UTF32)
brettw@google.come3c034a2008-08-08 03:31:40 +0900116// Appends the given UTF-32 character to the given 32-bit string.
brettw@google.comfed55ab2008-08-08 00:29:49 +0900117inline void WriteUnicodeCharacter(uint32 code_point,
118 std::basic_string<wchar_t>* output) {
119 // This is the easy case, just append the character.
120 output->push_back(code_point);
121}
brettw@google.come3c034a2008-08-08 03:31:40 +0900122#endif // defined(WCHAR_T_IS_UTF32)
brettw@google.comfed55ab2008-08-08 00:29:49 +0900123
124// Generalized Unicode converter -----------------------------------------------
125
126// Converts the given source Unicode character type to the given destination
127// Unicode character type as a STL string. The given input buffer and size
128// determine the source, and the given output STL string will be replaced by
129// the result.
130template<typename SRC_CHAR, typename DEST_CHAR>
131bool ConvertUnicode(const SRC_CHAR* src, size_t src_len,
132 std::basic_string<DEST_CHAR>* output) {
133 output->clear();
134
135 // ICU requires 32-bit numbers.
136 bool success = true;
137 int32 src_len32 = static_cast<int32>(src_len);
138 for (int32 i = 0; i < src_len32; i++) {
139 uint32 code_point;
140 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point))
141 WriteUnicodeCharacter(code_point, output);
142 else
143 success = false;
144 }
145 return success;
146}
147
148} // namespace
149
brettw@google.comfa499052008-08-08 05:27:57 +0900150// UTF-8 <-> Wide --------------------------------------------------------------
brettw@google.comfed55ab2008-08-08 00:29:49 +0900151
152std::string WideToUTF8(const std::wstring& wide) {
153 std::string ret;
154 if (wide.empty())
155 return ret;
156
157 // Ignore the success flag of this call, it will do the best it can for
158 // invalid input, which is what we want here.
159 WideToUTF8(wide.data(), wide.length(), &ret);
160 return ret;
161}
162
163bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
164 if (src_len == 0) {
165 output->clear();
166 return true;
167 }
168
169 // Intelligently guess the size of the output string. When it's an ASCII
170 // character, assume the rest will be ASCII and use a buffer size the same as
171 // the input. When it's not ASCII, assume 3-bytes per character as the
172 // starting point. This will be resized internally later if it's too small.
mmentovai@google.com38cabad2008-08-13 10:17:18 +0900173 if (static_cast<uint32>(src[0]) < 0x80)
brettw@google.comfed55ab2008-08-08 00:29:49 +0900174 output->reserve(src_len);
175 else
176 output->reserve(src_len * 3);
177 return ConvertUnicode<wchar_t, char>(src, src_len, output);
178}
179
180std::wstring UTF8ToWide(const std::string& utf8) {
181 std::wstring ret;
182 if (utf8.empty())
183 return ret;
184
185 UTF8ToWide(utf8.data(), utf8.length(), &ret);
186 return ret;
187}
188
189bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
190 if (src_len == 0) {
191 output->clear();
192 return true;
193 }
194
195 // Intelligently guess the size of the output string. When it's an ASCII
196 // character, assume the rest will be ASCII and use a buffer size the same as
197 // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per
198 // character (this is more conservative than 3 which we use above when
199 // converting the other way).
mmentovai@google.com38cabad2008-08-13 10:17:18 +0900200 if (static_cast<unsigned char>(src[0]) < 0x80)
brettw@google.comfed55ab2008-08-08 00:29:49 +0900201 output->reserve(src_len);
202 else
203 output->reserve(src_len / 2);
204 return ConvertUnicode<char, wchar_t>(src, src_len, output);
205}
206
brettw@google.comfa499052008-08-08 05:27:57 +0900207// UTF-16 <-> Wide -------------------------------------------------------------
208
209#if defined(WCHAR_T_IS_UTF16)
210
211// When wide == UTF-16, then conversions are a NOP.
212std::string16 WideToUTF16(const std::wstring& wide) {
213 return wide;
214}
215
216bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) {
217 output->assign(src, src_len);
218 return true;
219}
220
221std::wstring UTF16ToWide(const std::string16& utf16) {
222 return utf16;
223}
224
225bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
226 output->assign(src, src_len);
227 return true;
228}
229
230#elif defined(WCHAR_T_IS_UTF32)
231
232std::string16 WideToUTF16(const std::wstring& wide) {
233 std::string16 ret;
234 if (wide.empty())
235 return ret;
236
pinkerton@google.comc5c7a722008-08-08 07:36:01 +0900237 WideToUTF16(wide.data(), wide.length(), &ret);
brettw@google.comfa499052008-08-08 05:27:57 +0900238 return ret;
239}
240
241bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) {
242 if (src_len == 0) {
243 output->clear();
244 return true;
245 }
246
247 // Assume that normally we won't have any non-BMP characters so the counts
248 // will be the same.
249 output->reserve(src_len);
250 return ConvertUnicode<wchar_t, char16>(src, src_len, output);
251}
252
253std::wstring UTF16ToWide(const std::string16& utf16) {
254 std::wstring ret;
255 if (utf16.empty())
256 return ret;
257
pinkerton@google.comc5c7a722008-08-08 07:36:01 +0900258 UTF16ToWide(utf16.data(), utf16.length(), &ret);
brettw@google.comfa499052008-08-08 05:27:57 +0900259 return ret;
260}
261
262bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
263 if (src_len == 0) {
264 output->clear();
265 return true;
266 }
267
268 // Assume that normally we won't have any non-BMP characters so the counts
269 // will be the same.
270 output->reserve(src_len);
271 return ConvertUnicode<char16, wchar_t>(src, src_len, output);
272}
273
274#endif // defined(WCHAR_T_IS_UTF32)
275
initial.commit3f4a7322008-07-27 06:49:38 +0900276// Codepage <-> Wide -----------------------------------------------------------
277
278// Convert a unicode string into the specified codepage_name. If the codepage
279// isn't found, return false.
280bool WideToCodepage(const std::wstring& wide,
281 const char* codepage_name,
282 OnStringUtilConversionError::Type on_error,
283 std::string* encoded) {
284 encoded->clear();
285
286 UErrorCode status = U_ZERO_ERROR;
287 UConverter* converter = ucnv_open(codepage_name, &status);
288 if (!U_SUCCESS(status))
289 return false;
290
291 const UChar* uchar_src;
292 int uchar_len;
brettw@google.come3c034a2008-08-08 03:31:40 +0900293#if defined(WCHAR_T_IS_UTF16)
initial.commit3f4a7322008-07-27 06:49:38 +0900294 uchar_src = wide.c_str();
295 uchar_len = static_cast<int>(wide.length());
brettw@google.come3c034a2008-08-08 03:31:40 +0900296#elif defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900297 // When wchar_t is wider than UChar (16 bits), transform |wide| into a
298 // UChar* string. Size the UChar* buffer to be large enough to hold twice
mmentovai@google.com38cabad2008-08-13 10:17:18 +0900299 // as many UTF-16 code points as there are UTF-16 characters, in case each
initial.commit3f4a7322008-07-27 06:49:38 +0900300 // character translates to a UTF-16 surrogate pair, and leave room for a NUL
301 // terminator.
302 std::vector<UChar> wide_uchar(wide.length() * 2 + 1);
303 u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len,
304 wide.c_str(), wide.length(), &status);
305 uchar_src = &wide_uchar[0];
306 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
brettw@google.come3c034a2008-08-08 03:31:40 +0900307#endif // defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900308
309 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
310 ucnv_getMaxCharSize(converter));
311 encoded->resize(encoded_max_length);
312
313 // Setup our error handler.
314 switch (on_error) {
315 case OnStringUtilConversionError::FAIL:
316 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
317 NULL, NULL, &status);
318 break;
319 case OnStringUtilConversionError::SKIP:
320 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
321 NULL, NULL, &status);
322 break;
323 default:
324 NOTREACHED();
325 }
326
327 // ucnv_fromUChars returns size not including terminating null
328 int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
329 encoded_max_length, uchar_src, uchar_len, &status);
330 encoded->resize(actual_size);
331 ucnv_close(converter);
332 if (U_SUCCESS(status))
333 return true;
334 encoded->clear(); // Make sure the output is empty on error.
335 return false;
336}
337
338// Converts a string of the given codepage into unicode.
339// If the codepage isn't found, return false.
340bool CodepageToWide(const std::string& encoded,
341 const char* codepage_name,
342 OnStringUtilConversionError::Type on_error,
343 std::wstring* wide) {
344 wide->clear();
345
346 UErrorCode status = U_ZERO_ERROR;
347 UConverter* converter = ucnv_open(codepage_name, &status);
348 if (!U_SUCCESS(status))
349 return false;
350
351 // The worst case is all the input characters are non-BMP (32-bit) ones.
352 size_t uchar_max_length = encoded.length() * 2 + 1;
353
354 UChar* uchar_dst;
brettw@google.come3c034a2008-08-08 03:31:40 +0900355#if defined(WCHAR_T_IS_UTF16)
initial.commit3f4a7322008-07-27 06:49:38 +0900356 uchar_dst = WriteInto(wide, uchar_max_length);
brettw@google.come3c034a2008-08-08 03:31:40 +0900357#elif defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900358 // When wchar_t is wider than UChar (16 bits), convert into a temporary
359 // UChar* buffer.
360 std::vector<UChar> wide_uchar(uchar_max_length);
361 uchar_dst = &wide_uchar[0];
brettw@google.come3c034a2008-08-08 03:31:40 +0900362#endif // defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900363
364 // Setup our error handler.
365 switch (on_error) {
366 case OnStringUtilConversionError::FAIL:
367 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
368 NULL, NULL, &status);
369 break;
370 case OnStringUtilConversionError::SKIP:
371 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
372 NULL, NULL, &status);
373 break;
374 default:
375 NOTREACHED();
376 }
377
378 int actual_size = ucnv_toUChars(converter,
379 uchar_dst,
380 static_cast<int>(uchar_max_length),
381 encoded.data(),
382 static_cast<int>(encoded.length()),
383 &status);
384 ucnv_close(converter);
385 if (!U_SUCCESS(status)) {
386 wide->clear(); // Make sure the output is empty on error.
387 return false;
388 }
389
brettw@google.come3c034a2008-08-08 03:31:40 +0900390#ifdef WCHAR_T_IS_UTF32
initial.commit3f4a7322008-07-27 06:49:38 +0900391 // When wchar_t is wider than UChar (16 bits), it's not possible to wind up
392 // with any more wchar_t elements than UChar elements. ucnv_toUChars
393 // returns the number of UChar elements not including the NUL terminator, so
394 // leave extra room for that.
395 u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size,
396 uchar_dst, actual_size, &status);
397 DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring";
brettw@google.come3c034a2008-08-08 03:31:40 +0900398#endif // WCHAR_T_IS_UTF32
initial.commit3f4a7322008-07-27 06:49:38 +0900399
400 wide->resize(actual_size);
401 return true;
402}
403
404// Number formatting -----------------------------------------------------------
405
mmentovai@google.com50a04142008-08-14 00:32:27 +0900406namespace {
407
408struct NumberFormatSingletonTraits
409 : public DefaultSingletonTraits<NumberFormat> {
410 static NumberFormat* New() {
411 UErrorCode status = U_ZERO_ERROR;
412 NumberFormat* formatter = NumberFormat::createInstance(status);
413 DCHECK(U_SUCCESS(status));
414 return formatter;
415 }
416 // There's no ICU call to destroy a NumberFormat object other than
417 // operator delete, so use the default Delete, which calls operator delete.
418 // This can cause problems if a different allocator is used by this file than
419 // by ICU.
420};
421
422} // namespace
initial.commit3f4a7322008-07-27 06:49:38 +0900423
424std::wstring FormatNumber(int64 number) {
mmentovai@google.com50a04142008-08-14 00:32:27 +0900425 NumberFormat* number_format =
426 Singleton<NumberFormat, NumberFormatSingletonTraits>::get();
427
initial.commit3f4a7322008-07-27 06:49:38 +0900428 if (!number_format) {
429 // As a fallback, just return the raw number in a string.
430 return StringPrintf(L"%lld", number);
431 }
432 UnicodeString ustr;
433 number_format->format(number, ustr);
434
brettw@google.come3c034a2008-08-08 03:31:40 +0900435#if defined(WCHAR_T_IS_UTF16)
initial.commit3f4a7322008-07-27 06:49:38 +0900436 return std::wstring(ustr.getBuffer(),
437 static_cast<std::wstring::size_type>(ustr.length()));
brettw@google.come3c034a2008-08-08 03:31:40 +0900438#elif defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900439 wchar_t buffer[64]; // A int64 is less than 20 chars long, so 64 chars
440 // leaves plenty of room for formating stuff.
441 int length = 0;
442 UErrorCode error = U_ZERO_ERROR;
443 u_strToWCS(buffer, 64, &length, ustr.getBuffer(), ustr.length() , &error);
444 if (U_FAILURE(error)) {
445 NOTREACHED();
446 // As a fallback, just return the raw number in a string.
447 return StringPrintf(L"%lld", number);
448 }
449 return std::wstring(buffer, static_cast<std::wstring::size_type>(length));
brettw@google.come3c034a2008-08-08 03:31:40 +0900450#endif // defined(WCHAR_T_IS_UTF32)
initial.commit3f4a7322008-07-27 06:49:38 +0900451}
license.botf003cfe2008-08-24 09:55:55 +0900452