mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 1 | // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "base/utf_offset_string_conversions.h" |
| 6 | |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 7 | #include <algorithm> |
| 8 | |
thestig@chromium.org | efd4aaf | 2011-06-15 13:14:23 +0900 | [diff] [blame] | 9 | #include "base/memory/scoped_ptr.h" |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 10 | #include "base/string_piece.h" |
| 11 | #include "base/utf_string_conversion_utils.h" |
| 12 | |
| 13 | using base::PrepareForUTF16Or32Output; |
kinaba@chromium.org | fb4d529 | 2011-09-08 11:18:10 +0900 | [diff] [blame] | 14 | using base::PrepareForUTF8Output; |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 15 | using base::ReadUnicodeCharacter; |
| 16 | using base::WriteUnicodeCharacter; |
| 17 | |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 18 | // Converts the given source Unicode character type to the given destination |
| 19 | // Unicode character type as a STL string. The given input buffer and size |
| 20 | // determine the source, and the given output STL string will be replaced by |
| 21 | // the result. |
kinaba@chromium.org | fb4d529 | 2011-09-08 11:18:10 +0900 | [diff] [blame] | 22 | template<typename SrcChar, typename DestStdString> |
| 23 | bool ConvertUnicode(const SrcChar* src, |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 24 | size_t src_len, |
kinaba@chromium.org | fb4d529 | 2011-09-08 11:18:10 +0900 | [diff] [blame] | 25 | DestStdString* output, |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 26 | std::vector<size_t>* offsets_for_adjustment) { |
| 27 | if (offsets_for_adjustment) { |
| 28 | std::for_each(offsets_for_adjustment->begin(), |
| 29 | offsets_for_adjustment->end(), |
kinaba@chromium.org | fb4d529 | 2011-09-08 11:18:10 +0900 | [diff] [blame] | 30 | LimitOffset<DestStdString>(src_len)); |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 31 | } |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 32 | |
| 33 | // ICU requires 32-bit numbers. |
| 34 | bool success = true; |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 35 | OffsetAdjuster offset_adjuster(offsets_for_adjustment); |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 36 | int32 src_len32 = static_cast<int32>(src_len); |
| 37 | for (int32 i = 0; i < src_len32; i++) { |
| 38 | uint32 code_point; |
| 39 | size_t original_i = i; |
| 40 | size_t chars_written = 0; |
| 41 | if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| 42 | chars_written = WriteUnicodeCharacter(code_point, output); |
| 43 | } else { |
cevans@chromium.org | d0e46a4 | 2010-01-02 07:16:38 +0900 | [diff] [blame] | 44 | chars_written = WriteUnicodeCharacter(0xFFFD, output); |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 45 | success = false; |
| 46 | } |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 47 | if (offsets_for_adjustment) { |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 48 | // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
| 49 | // character read, not after it (so that incrementing it in the loop |
| 50 | // increment will place it at the right location), so we need to account |
| 51 | // for that in determining the amount that was read. |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 52 | offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i, |
| 53 | i - original_i + 1, chars_written)); |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 54 | } |
| 55 | } |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 56 | return success; |
| 57 | } |
| 58 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 59 | bool UTF8ToUTF16AndAdjustOffset(const char* src, |
| 60 | size_t src_len, |
| 61 | string16* output, |
| 62 | size_t* offset_for_adjustment) { |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 63 | std::vector<size_t> offsets; |
| 64 | if (offset_for_adjustment) |
| 65 | offsets.push_back(*offset_for_adjustment); |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 66 | PrepareForUTF16Or32Output(src, src_len, output); |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 67 | bool ret = ConvertUnicode(src, src_len, output, &offsets); |
| 68 | if (offset_for_adjustment) |
| 69 | *offset_for_adjustment = offsets[0]; |
| 70 | return ret; |
| 71 | } |
| 72 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 73 | bool UTF8ToUTF16AndAdjustOffsets(const char* src, |
| 74 | size_t src_len, |
| 75 | string16* output, |
| 76 | std::vector<size_t>* offsets_for_adjustment) { |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 77 | PrepareForUTF16Or32Output(src, src_len, output); |
| 78 | return ConvertUnicode(src, src_len, output, offsets_for_adjustment); |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 79 | } |
| 80 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 81 | string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8, |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 82 | size_t* offset_for_adjustment) { |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 83 | std::vector<size_t> offsets; |
| 84 | if (offset_for_adjustment) |
| 85 | offsets.push_back(*offset_for_adjustment); |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 86 | string16 result; |
| 87 | UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 88 | &offsets); |
| 89 | if (offset_for_adjustment) |
| 90 | *offset_for_adjustment = offsets[0]; |
| 91 | return result; |
| 92 | } |
| 93 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 94 | string16 UTF8ToUTF16AndAdjustOffsets( |
| 95 | const base::StringPiece& utf8, |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 96 | std::vector<size_t>* offsets_for_adjustment) { |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 97 | string16 result; |
| 98 | UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 99 | offsets_for_adjustment); |
| 100 | return result; |
pkasting@chromium.org | 046cd5a | 2009-11-14 04:27:48 +0900 | [diff] [blame] | 101 | } |
| 102 | |
kinaba@chromium.org | fb4d529 | 2011-09-08 11:18:10 +0900 | [diff] [blame] | 103 | std::string UTF16ToUTF8AndAdjustOffset( |
| 104 | const base::StringPiece16& utf16, |
| 105 | size_t* offset_for_adjustment) { |
| 106 | std::vector<size_t> offsets; |
| 107 | if (offset_for_adjustment) |
| 108 | offsets.push_back(*offset_for_adjustment); |
| 109 | std::string result = UTF16ToUTF8AndAdjustOffsets(utf16, &offsets); |
| 110 | if (offset_for_adjustment) |
| 111 | *offset_for_adjustment = offsets[0]; |
| 112 | return result; |
| 113 | } |
| 114 | |
| 115 | std::string UTF16ToUTF8AndAdjustOffsets( |
| 116 | const base::StringPiece16& utf16, |
| 117 | std::vector<size_t>* offsets_for_adjustment) { |
| 118 | std::string result; |
| 119 | PrepareForUTF8Output(utf16.data(), utf16.length(), &result); |
| 120 | ConvertUnicode(utf16.data(), utf16.length(), &result, offsets_for_adjustment); |
| 121 | return result; |
| 122 | } |
| 123 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 124 | OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| 125 | size_t original_length, |
| 126 | size_t output_length) |
| 127 | : original_offset(original_offset), |
| 128 | original_length(original_length), |
| 129 | output_length(output_length) { |
| 130 | } |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 131 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 132 | OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment) |
| 133 | : offsets_for_adjustment_(offsets_for_adjustment) { |
| 134 | } |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 135 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 136 | OffsetAdjuster::~OffsetAdjuster() { |
| 137 | if (!offsets_for_adjustment_ || adjustments_.empty()) |
| 138 | return; |
| 139 | for (std::vector<size_t>::iterator i(offsets_for_adjustment_->begin()); |
| 140 | i != offsets_for_adjustment_->end(); ++i) |
| 141 | AdjustOffset(i); |
| 142 | } |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 143 | |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 144 | void OffsetAdjuster::Add(const Adjustment& adjustment) { |
| 145 | adjustments_.push_back(adjustment); |
| 146 | } |
| 147 | |
| 148 | void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) { |
| 149 | if (*offset == string16::npos) |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 150 | return; |
| 151 | size_t adjustment = 0; |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 152 | for (std::vector<Adjustment>::const_iterator i = adjustments_.begin(); |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 153 | i != adjustments_.end(); ++i) { |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 154 | if (*offset == i->original_offset && i->output_length == 0) { |
| 155 | *offset = string16::npos; |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 156 | return; |
| 157 | } |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 158 | if (*offset <= i->original_offset) |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 159 | break; |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 160 | if (*offset < (i->original_offset + i->original_length)) { |
| 161 | *offset = string16::npos; |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 162 | return; |
| 163 | } |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 164 | adjustment += (i->original_length - i->output_length); |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 165 | } |
pkasting@chromium.org | e6b5c20 | 2011-05-04 05:03:50 +0900 | [diff] [blame] | 166 | *offset -= adjustment; |
mrossetti@chromium.org | 9422b22 | 2011-04-14 03:43:05 +0900 | [diff] [blame] | 167 | } |