blob: ba8215cc932d05eee3c3eb7ccc9149a56601bdf2 [file] [log] [blame]
// Copyright (C) 2011 The Libphonenumber Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
// Author: George Yakovlev
//         Philippe Liard
17
// Note that we don't use features of ICU that depend on std::string (e.g.
// UnicodeString::toUTF8String()) to support clients that build ICU without
// -DU_HAVE_STD_STRING.
21
philip.liard@gmail.com384682a2011-07-12 15:41:29 +000022#include "phonenumbers/regexp_adapter_icu.h"
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000023
philip.liard@gmail.com603e7e52011-10-12 12:25:09 +000024#include <stddef.h>
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000025#include <string>
26
27#include <unicode/regex.h>
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +000028#include <unicode/stringpiece.h>
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000029#include <unicode/unistr.h>
30
philip.liard@gmail.comaf4a2ce2013-04-30 11:35:55 +000031#include "phonenumbers/base/basictypes.h"
32#include "phonenumbers/base/logging.h"
33#include "phonenumbers/base/memory/scoped_ptr.h"
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000034#include "phonenumbers/default_logger.h"
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +000035#include "phonenumbers/string_byte_sink.h"
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000036
37namespace i18n {
38namespace phonenumbers {
39
40using icu::RegexMatcher;
41using icu::RegexPattern;
42using icu::UnicodeString;
43
44namespace {
45
46// Converts UnicodeString 'source' to a UTF8-formatted std::string.
47string UnicodeStringToUtf8String(const UnicodeString& source) {
48 string data;
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +000049 StringByteSink sink(&data);
50 source.toUTF8(sink);
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000051 return data;
52}
53
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +000054// Converts UTF8-formatted std::string 'source' to a UnicodeString.
55UnicodeString Utf8StringToUnicodeString(const string& source) {
56 // Note that we don't use icu::StringPiece(const string&).
57 return UnicodeString::fromUTF8(
58 icu::StringPiece(source.c_str(), source.size()));
59}
60
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000061} // namespace
62
// Implementation of the abstract classes RegExpInput and RegExp using ICU
// regular expression capabilities.
65
66// ICU implementation of the RegExpInput abstract class.
67class IcuRegExpInput : public RegExpInput {
68 public:
69 explicit IcuRegExpInput(const string& utf8_input)
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +000070 : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +000071 position_(0) {}
72
73 virtual ~IcuRegExpInput() {}
74
75 virtual string ToString() const {
76 return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
77 }
78
79 UnicodeString* Data() {
80 return &utf8_input_;
81 }
82
83 // The current start position. For a newly created input, position is 0. Each
84 // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
85 // case of the successful match to be after the match.
86 int position() const {
87 return position_;
88 }
89
90 void set_position(int position) {
91 DCHECK(position >= 0 && position <= utf8_input_.length());
92 position_ = position;
93 }
94
95 private:
96 UnicodeString utf8_input_;
97 int position_;
98
99 DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
100};
101
102// ICU implementation of the RegExp abstract class.
103class IcuRegExp : public RegExp {
104 public:
105 explicit IcuRegExp(const string& utf8_regexp) {
106 UParseError parse_error;
107 UErrorCode status = U_ZERO_ERROR;
108 utf8_regexp_.reset(RegexPattern::compile(
philip.liard@gmail.comaf3adc42012-05-10 15:59:25 +0000109 Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +0000110 if (U_FAILURE(status)) {
111 // The provided regular expressions should compile correctly.
112 LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
113 utf8_regexp_.reset(NULL);
114 }
115 }
116
117 virtual ~IcuRegExp() {}
118
119 virtual bool Consume(RegExpInput* input_string,
120 bool anchor_at_start,
121 string* matched_string1,
122 string* matched_string2,
123 string* matched_string3) const {
124 DCHECK(input_string);
125 if (!utf8_regexp_.get()) {
126 return false;
127 }
128 IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
129 UErrorCode status = U_ZERO_ERROR;
130 const scoped_ptr<RegexMatcher> matcher(
131 utf8_regexp_->matcher(*input->Data(), status));
132 bool match_succeeded = anchor_at_start
133 ? matcher->lookingAt(input->position(), status)
134 : matcher->find(input->position(), status);
135 if (!match_succeeded || U_FAILURE(status)) {
136 return false;
137 }
138 string* const matched_strings[] = {
139 matched_string1, matched_string2, matched_string3
140 };
141 // If less matches than expected - fail.
142 for (size_t i = 0; i < arraysize(matched_strings); ++i) {
143 if (matched_strings[i]) {
144 // Groups are counted from 1 rather than 0.
145 const int group_index = i + 1;
146 if (group_index > matcher->groupCount()) {
147 return false;
148 }
149 *matched_strings[i] =
150 UnicodeStringToUtf8String(matcher->group(group_index, status));
151 }
152 }
153 input->set_position(matcher->end(status));
154 return !U_FAILURE(status);
155 }
156
157 bool Match(const string& input_string,
158 bool full_match,
159 string* matched_string) const {
160 if (!utf8_regexp_.get()) {
161 return false;
162 }
163 IcuRegExpInput input(input_string);
164 UErrorCode status = U_ZERO_ERROR;
165 const scoped_ptr<RegexMatcher> matcher(
166 utf8_regexp_->matcher(*input.Data(), status));
167 bool match_succeeded = full_match
168 ? matcher->matches(input.position(), status)
169 : matcher->find(input.position(), status);
170 if (!match_succeeded || U_FAILURE(status)) {
171 return false;
172 }
173 if (matcher->groupCount() > 0 && matched_string) {
174 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
175 }
176 return !U_FAILURE(status);
177 }
178
179 bool Replace(string* string_to_process,
180 bool global,
181 const string& replacement_string) const {
182 DCHECK(string_to_process);
183 if (!utf8_regexp_.get()) {
184 return false;
185 }
186 IcuRegExpInput input(*string_to_process);
187 UErrorCode status = U_ZERO_ERROR;
188 const scoped_ptr<RegexMatcher> matcher(
189 utf8_regexp_->matcher(*input.Data(), status));
190 if (U_FAILURE(status)) {
191 return false;
192 }
lararennie@google.com35bd3932012-09-06 09:48:57 +0000193
194 UnicodeString output;
195 // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
196 // consistent with the RE2 reg-ex matcher.
197 if (!matcher->find()) {
198 return false;
199 }
200 matcher->appendReplacement(output,
201 Utf8StringToUnicodeString(replacement_string),
202 status);
203 if (global) {
204 // Continue and look for more matches.
205 while (matcher->find()) {
206 matcher->appendReplacement(
207 output,
208 Utf8StringToUnicodeString(replacement_string),
209 status);
210 }
211 }
212
213 matcher->appendTail(output);
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +0000214 if (U_FAILURE(status)) {
215 return false;
216 }
lararennie@google.com35bd3932012-09-06 09:48:57 +0000217 const string replaced_string = UnicodeStringToUtf8String(output);
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +0000218 *string_to_process = replaced_string;
219 return true;
220 }
221
222 private:
223 scoped_ptr<RegexPattern> utf8_regexp_;
224
225 DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
226};
227
philip.liard@gmail.com384682a2011-07-12 15:41:29 +0000228RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +0000229 return new IcuRegExpInput(utf8_input);
230}
231
philip.liard@gmail.com384682a2011-07-12 15:41:29 +0000232RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
philip.liard@gmail.com1ad5e5b2011-07-01 08:22:06 +0000233 return new IcuRegExp(utf8_regexp);
234}
235
236} // namespace phonenumbers
237} // namespace i18n