philip.liard@gmail.com | b905691 | 2011-08-18 11:41:24 +0000 | [diff] [blame] | 1 | // Copyright (C) 2011 The Libphonenumber Authors |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | // Author: George Yakovlev |
| 16 | // Philippe Liard |
| 17 | |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 18 | // Note that we don't use features of ICU that depend on std::string (e.g. |
| 19 | // UnicodeString::toUTF8String()) to support clients that build ICU without |
| 20 | // -DU_HAVE_STD_STRING. |
| 21 | |
philip.liard@gmail.com | 384682a | 2011-07-12 15:41:29 +0000 | [diff] [blame] | 22 | #include "phonenumbers/regexp_adapter_icu.h" |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 23 | |
philip.liard@gmail.com | 603e7e5 | 2011-10-12 12:25:09 +0000 | [diff] [blame] | 24 | #include <stddef.h> |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 25 | #include <string> |
| 26 | |
| 27 | #include <unicode/regex.h> |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 28 | #include <unicode/stringpiece.h> |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 29 | #include <unicode/unistr.h> |
| 30 | |
philip.liard@gmail.com | af4a2ce | 2013-04-30 11:35:55 +0000 | [diff] [blame] | 31 | #include "phonenumbers/base/basictypes.h" |
| 32 | #include "phonenumbers/base/logging.h" |
| 33 | #include "phonenumbers/base/memory/scoped_ptr.h" |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 34 | #include "phonenumbers/default_logger.h" |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 35 | #include "phonenumbers/string_byte_sink.h" |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 36 | |
| 37 | namespace i18n { |
| 38 | namespace phonenumbers { |
| 39 | |
| 40 | using icu::RegexMatcher; |
| 41 | using icu::RegexPattern; |
| 42 | using icu::UnicodeString; |
| 43 | |
| 44 | namespace { |
| 45 | |
| 46 | // Converts UnicodeString 'source' to a UTF8-formatted std::string. |
| 47 | string UnicodeStringToUtf8String(const UnicodeString& source) { |
| 48 | string data; |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 49 | StringByteSink sink(&data); |
| 50 | source.toUTF8(sink); |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 51 | return data; |
| 52 | } |
| 53 | |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 54 | // Converts UTF8-formatted std::string 'source' to a UnicodeString. |
| 55 | UnicodeString Utf8StringToUnicodeString(const string& source) { |
| 56 | // Note that we don't use icu::StringPiece(const string&). |
| 57 | return UnicodeString::fromUTF8( |
| 58 | icu::StringPiece(source.c_str(), source.size())); |
| 59 | } |
| 60 | |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 61 | } // namespace |
| 62 | |
| 63 | // Implementation of the abstract classes RegExpInput and RegExp using ICU |
| 64 | // regular expression capabilities. |
| 65 | |
| 66 | // ICU implementation of the RegExpInput abstract class. |
| 67 | class IcuRegExpInput : public RegExpInput { |
| 68 | public: |
| 69 | explicit IcuRegExpInput(const string& utf8_input) |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 70 | : utf8_input_(Utf8StringToUnicodeString(utf8_input)), |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 71 | position_(0) {} |
| 72 | |
| 73 | virtual ~IcuRegExpInput() {} |
| 74 | |
| 75 | virtual string ToString() const { |
| 76 | return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); |
| 77 | } |
| 78 | |
| 79 | UnicodeString* Data() { |
| 80 | return &utf8_input_; |
| 81 | } |
| 82 | |
| 83 | // The current start position. For a newly created input, position is 0. Each |
| 84 | // call to ConsumeRegExp() or RegExp::Consume() advances the position in the |
| 85 | // case of the successful match to be after the match. |
| 86 | int position() const { |
| 87 | return position_; |
| 88 | } |
| 89 | |
| 90 | void set_position(int position) { |
| 91 | DCHECK(position >= 0 && position <= utf8_input_.length()); |
| 92 | position_ = position; |
| 93 | } |
| 94 | |
| 95 | private: |
| 96 | UnicodeString utf8_input_; |
| 97 | int position_; |
| 98 | |
| 99 | DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); |
| 100 | }; |
| 101 | |
| 102 | // ICU implementation of the RegExp abstract class. |
| 103 | class IcuRegExp : public RegExp { |
| 104 | public: |
| 105 | explicit IcuRegExp(const string& utf8_regexp) { |
| 106 | UParseError parse_error; |
| 107 | UErrorCode status = U_ZERO_ERROR; |
| 108 | utf8_regexp_.reset(RegexPattern::compile( |
philip.liard@gmail.com | af3adc4 | 2012-05-10 15:59:25 +0000 | [diff] [blame] | 109 | Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status)); |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 110 | if (U_FAILURE(status)) { |
| 111 | // The provided regular expressions should compile correctly. |
| 112 | LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; |
| 113 | utf8_regexp_.reset(NULL); |
| 114 | } |
| 115 | } |
| 116 | |
| 117 | virtual ~IcuRegExp() {} |
| 118 | |
| 119 | virtual bool Consume(RegExpInput* input_string, |
| 120 | bool anchor_at_start, |
| 121 | string* matched_string1, |
| 122 | string* matched_string2, |
| 123 | string* matched_string3) const { |
| 124 | DCHECK(input_string); |
| 125 | if (!utf8_regexp_.get()) { |
| 126 | return false; |
| 127 | } |
| 128 | IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); |
| 129 | UErrorCode status = U_ZERO_ERROR; |
| 130 | const scoped_ptr<RegexMatcher> matcher( |
| 131 | utf8_regexp_->matcher(*input->Data(), status)); |
| 132 | bool match_succeeded = anchor_at_start |
| 133 | ? matcher->lookingAt(input->position(), status) |
| 134 | : matcher->find(input->position(), status); |
| 135 | if (!match_succeeded || U_FAILURE(status)) { |
| 136 | return false; |
| 137 | } |
| 138 | string* const matched_strings[] = { |
| 139 | matched_string1, matched_string2, matched_string3 |
| 140 | }; |
| 141 | // If less matches than expected - fail. |
| 142 | for (size_t i = 0; i < arraysize(matched_strings); ++i) { |
| 143 | if (matched_strings[i]) { |
| 144 | // Groups are counted from 1 rather than 0. |
| 145 | const int group_index = i + 1; |
| 146 | if (group_index > matcher->groupCount()) { |
| 147 | return false; |
| 148 | } |
| 149 | *matched_strings[i] = |
| 150 | UnicodeStringToUtf8String(matcher->group(group_index, status)); |
| 151 | } |
| 152 | } |
| 153 | input->set_position(matcher->end(status)); |
| 154 | return !U_FAILURE(status); |
| 155 | } |
| 156 | |
| 157 | bool Match(const string& input_string, |
| 158 | bool full_match, |
| 159 | string* matched_string) const { |
| 160 | if (!utf8_regexp_.get()) { |
| 161 | return false; |
| 162 | } |
| 163 | IcuRegExpInput input(input_string); |
| 164 | UErrorCode status = U_ZERO_ERROR; |
| 165 | const scoped_ptr<RegexMatcher> matcher( |
| 166 | utf8_regexp_->matcher(*input.Data(), status)); |
| 167 | bool match_succeeded = full_match |
| 168 | ? matcher->matches(input.position(), status) |
| 169 | : matcher->find(input.position(), status); |
| 170 | if (!match_succeeded || U_FAILURE(status)) { |
| 171 | return false; |
| 172 | } |
| 173 | if (matcher->groupCount() > 0 && matched_string) { |
| 174 | *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); |
| 175 | } |
| 176 | return !U_FAILURE(status); |
| 177 | } |
| 178 | |
| 179 | bool Replace(string* string_to_process, |
| 180 | bool global, |
| 181 | const string& replacement_string) const { |
| 182 | DCHECK(string_to_process); |
| 183 | if (!utf8_regexp_.get()) { |
| 184 | return false; |
| 185 | } |
| 186 | IcuRegExpInput input(*string_to_process); |
| 187 | UErrorCode status = U_ZERO_ERROR; |
| 188 | const scoped_ptr<RegexMatcher> matcher( |
| 189 | utf8_regexp_->matcher(*input.Data(), status)); |
| 190 | if (U_FAILURE(status)) { |
| 191 | return false; |
| 192 | } |
lararennie@google.com | 35bd393 | 2012-09-06 09:48:57 +0000 | [diff] [blame] | 193 | |
| 194 | UnicodeString output; |
| 195 | // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is |
| 196 | // consistent with the RE2 reg-ex matcher. |
| 197 | if (!matcher->find()) { |
| 198 | return false; |
| 199 | } |
| 200 | matcher->appendReplacement(output, |
| 201 | Utf8StringToUnicodeString(replacement_string), |
| 202 | status); |
| 203 | if (global) { |
| 204 | // Continue and look for more matches. |
| 205 | while (matcher->find()) { |
| 206 | matcher->appendReplacement( |
| 207 | output, |
| 208 | Utf8StringToUnicodeString(replacement_string), |
| 209 | status); |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | matcher->appendTail(output); |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 214 | if (U_FAILURE(status)) { |
| 215 | return false; |
| 216 | } |
lararennie@google.com | 35bd393 | 2012-09-06 09:48:57 +0000 | [diff] [blame] | 217 | const string replaced_string = UnicodeStringToUtf8String(output); |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 218 | *string_to_process = replaced_string; |
| 219 | return true; |
| 220 | } |
| 221 | |
| 222 | private: |
| 223 | scoped_ptr<RegexPattern> utf8_regexp_; |
| 224 | |
| 225 | DISALLOW_COPY_AND_ASSIGN(IcuRegExp); |
| 226 | }; |
| 227 | |
philip.liard@gmail.com | 384682a | 2011-07-12 15:41:29 +0000 | [diff] [blame] | 228 | RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const { |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 229 | return new IcuRegExpInput(utf8_input); |
| 230 | } |
| 231 | |
philip.liard@gmail.com | 384682a | 2011-07-12 15:41:29 +0000 | [diff] [blame] | 232 | RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const { |
philip.liard@gmail.com | 1ad5e5b | 2011-07-01 08:22:06 +0000 | [diff] [blame] | 233 | return new IcuRegExp(utf8_regexp); |
| 234 | } |
| 235 | |
| 236 | } // namespace phonenumbers |
| 237 | } // namespace i18n |