Blame - regexp_adapter_icu.cc - fp2-dev/platform/external/chromium_org/third_party/libphonenumber/src/phonenumbers

blob: ba8215cc932d05eee3c3eb7ccc9149a56601bdf2 [file] [log] [blame]

philip.liard@gmail.com	b905691	2011-08-18 11:41:24 +0000	[diff] [blame]	1	// Copyright (C) 2011 The Libphonenumber Authors
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	// Author: George Yakovlev
				16	// Philippe Liard
				17
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	18	// Note that we don't use features of ICU that depend on std::string (e.g.
				19	// UnicodeString::toUTF8String()) to support clients that build ICU without
				20	// -DU_HAVE_STD_STRING.
				21
philip.liard@gmail.com	384682a	2011-07-12 15:41:29 +0000	[diff] [blame]	22	#include "phonenumbers/regexp_adapter_icu.h"
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	23
philip.liard@gmail.com	603e7e5	2011-10-12 12:25:09 +0000	[diff] [blame]	24	#include <stddef.h>
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	25	#include <string>
				26
				27	#include <unicode/regex.h>
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	28	#include <unicode/stringpiece.h>
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	29	#include <unicode/unistr.h>
				30
philip.liard@gmail.com	af4a2ce	2013-04-30 11:35:55 +0000	[diff] [blame]	31	#include "phonenumbers/base/basictypes.h"
				32	#include "phonenumbers/base/logging.h"
				33	#include "phonenumbers/base/memory/scoped_ptr.h"
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	34	#include "phonenumbers/default_logger.h"
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	35	#include "phonenumbers/string_byte_sink.h"
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	36
				37	namespace i18n {
				38	namespace phonenumbers {
				39
				40	using icu::RegexMatcher;
				41	using icu::RegexPattern;
				42	using icu::UnicodeString;
				43
				44	namespace {
				45
				46	// Converts UnicodeString 'source' to a UTF8-formatted std::string.
				47	string UnicodeStringToUtf8String(const UnicodeString& source) {
				48	string data;
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	49	StringByteSink sink(&data);
				50	source.toUTF8(sink);
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	51	return data;
				52	}
				53
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	54	// Converts UTF8-formatted std::string 'source' to a UnicodeString.
				55	UnicodeString Utf8StringToUnicodeString(const string& source) {
				56	// Note that we don't use icu::StringPiece(const string&).
				57	return UnicodeString::fromUTF8(
				58	icu::StringPiece(source.c_str(), source.size()));
				59	}
				60
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	61	} // namespace
				62
				63	// Implementation of the abstract classes RegExpInput and RegExp using ICU
				64	// regular expression capabilities.
				65
				66	// ICU implementation of the RegExpInput abstract class.
				67	class IcuRegExpInput : public RegExpInput {
				68	public:
				69	explicit IcuRegExpInput(const string& utf8_input)
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	70	: utf8_input_(Utf8StringToUnicodeString(utf8_input)),
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	71	position_(0) {}
				72
				73	virtual ~IcuRegExpInput() {}
				74
				75	virtual string ToString() const {
				76	return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
				77	}
				78
				79	UnicodeString* Data() {
				80	return &utf8_input_;
				81	}
				82
				83	// The current start position. For a newly created input, position is 0. Each
				84	// call to ConsumeRegExp() or RegExp::Consume() advances the position in the
				85	// case of the successful match to be after the match.
				86	int position() const {
				87	return position_;
				88	}
				89
				90	void set_position(int position) {
				91	DCHECK(position >= 0 && position <= utf8_input_.length());
				92	position_ = position;
				93	}
				94
				95	private:
				96	UnicodeString utf8_input_;
				97	int position_;
				98
				99	DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
				100	};
				101
				102	// ICU implementation of the RegExp abstract class.
				103	class IcuRegExp : public RegExp {
				104	public:
				105	explicit IcuRegExp(const string& utf8_regexp) {
				106	UParseError parse_error;
				107	UErrorCode status = U_ZERO_ERROR;
				108	utf8_regexp_.reset(RegexPattern::compile(
philip.liard@gmail.com	af3adc4	2012-05-10 15:59:25 +0000	[diff] [blame]	109	Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	110	if (U_FAILURE(status)) {
				111	// The provided regular expressions should compile correctly.
				112	LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
				113	utf8_regexp_.reset(NULL);
				114	}
				115	}
				116
				117	virtual ~IcuRegExp() {}
				118
				119	virtual bool Consume(RegExpInput* input_string,
				120	bool anchor_at_start,
				121	string* matched_string1,
				122	string* matched_string2,
				123	string* matched_string3) const {
				124	DCHECK(input_string);
				125	if (!utf8_regexp_.get()) {
				126	return false;
				127	}
				128	IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
				129	UErrorCode status = U_ZERO_ERROR;
				130	const scoped_ptr<RegexMatcher> matcher(
				131	utf8_regexp_->matcher(*input->Data(), status));
				132	bool match_succeeded = anchor_at_start
				133	? matcher->lookingAt(input->position(), status)
				134	: matcher->find(input->position(), status);
				135	if (!match_succeeded \|\| U_FAILURE(status)) {
				136	return false;
				137	}
				138	string* const matched_strings[] = {
				139	matched_string1, matched_string2, matched_string3
				140	};
				141	// If less matches than expected - fail.
				142	for (size_t i = 0; i < arraysize(matched_strings); ++i) {
				143	if (matched_strings[i]) {
				144	// Groups are counted from 1 rather than 0.
				145	const int group_index = i + 1;
				146	if (group_index > matcher->groupCount()) {
				147	return false;
				148	}
				149	*matched_strings[i] =
				150	UnicodeStringToUtf8String(matcher->group(group_index, status));
				151	}
				152	}
				153	input->set_position(matcher->end(status));
				154	return !U_FAILURE(status);
				155	}
				156
				157	bool Match(const string& input_string,
				158	bool full_match,
				159	string* matched_string) const {
				160	if (!utf8_regexp_.get()) {
				161	return false;
				162	}
				163	IcuRegExpInput input(input_string);
				164	UErrorCode status = U_ZERO_ERROR;
				165	const scoped_ptr<RegexMatcher> matcher(
				166	utf8_regexp_->matcher(*input.Data(), status));
				167	bool match_succeeded = full_match
				168	? matcher->matches(input.position(), status)
				169	: matcher->find(input.position(), status);
				170	if (!match_succeeded \|\| U_FAILURE(status)) {
				171	return false;
				172	}
				173	if (matcher->groupCount() > 0 && matched_string) {
				174	*matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
				175	}
				176	return !U_FAILURE(status);
				177	}
				178
				179	bool Replace(string* string_to_process,
				180	bool global,
				181	const string& replacement_string) const {
				182	DCHECK(string_to_process);
				183	if (!utf8_regexp_.get()) {
				184	return false;
				185	}
				186	IcuRegExpInput input(*string_to_process);
				187	UErrorCode status = U_ZERO_ERROR;
				188	const scoped_ptr<RegexMatcher> matcher(
				189	utf8_regexp_->matcher(*input.Data(), status));
				190	if (U_FAILURE(status)) {
				191	return false;
				192	}
lararennie@google.com	35bd393	2012-09-06 09:48:57 +0000	[diff] [blame]	193
				194	UnicodeString output;
				195	// We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
				196	// consistent with the RE2 reg-ex matcher.
				197	if (!matcher->find()) {
				198	return false;
				199	}
				200	matcher->appendReplacement(output,
				201	Utf8StringToUnicodeString(replacement_string),
				202	status);
				203	if (global) {
				204	// Continue and look for more matches.
				205	while (matcher->find()) {
				206	matcher->appendReplacement(
				207	output,
				208	Utf8StringToUnicodeString(replacement_string),
				209	status);
				210	}
				211	}
				212
				213	matcher->appendTail(output);
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	214	if (U_FAILURE(status)) {
				215	return false;
				216	}
lararennie@google.com	35bd393	2012-09-06 09:48:57 +0000	[diff] [blame]	217	const string replaced_string = UnicodeStringToUtf8String(output);
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	218	*string_to_process = replaced_string;
				219	return true;
				220	}
				221
				222	private:
				223	scoped_ptr<RegexPattern> utf8_regexp_;
				224
				225	DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
				226	};
				227
philip.liard@gmail.com	384682a	2011-07-12 15:41:29 +0000	[diff] [blame]	228	RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	229	return new IcuRegExpInput(utf8_input);
				230	}
				231
philip.liard@gmail.com	384682a	2011-07-12 15:41:29 +0000	[diff] [blame]	232	RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
philip.liard@gmail.com	1ad5e5b	2011-07-01 08:22:06 +0000	[diff] [blame]	233	return new IcuRegExp(utf8_regexp);
				234	}
				235
				236	} // namespace phonenumbers
				237	} // namespace i18n