Blame - base/strings/string_tokenizer.h - platform/external/libchrome

blob: 8defbac3b8d0779fbceb6f9744c394428249d482 [file] [log] [blame]

scr@chromium.org	e3e4306	2011-06-28 02:23:28 +0900	[diff] [blame]	1	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
license.bot	f003cfe	2008-08-24 09:55:55 +0900	[diff] [blame]	2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	4
brettw@chromium.org	2cbc287	2013-02-02 09:21:39 +0900	[diff] [blame]	5	#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
				6	#define BASE_STRINGS_STRING_TOKENIZER_H_
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	7
erg@chromium.org	560daeb	2010-05-13 01:49:35 +0900	[diff] [blame]	8	#include <algorithm>
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	9	#include <string>
				10
tfarina@chromium.org	b6d4911	2013-03-30 23:29:00 +0900	[diff] [blame]	11	#include "base/strings/string_piece.h"
erg@chromium.org	560daeb	2010-05-13 01:49:35 +0900	[diff] [blame]	12
brettw@chromium.org	2cbc287	2013-02-02 09:21:39 +0900	[diff] [blame]	13	namespace base {
				14
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	15	// StringTokenizerT is a simple string tokenizer class. It works like an
				16	// iterator that with each step (see the Advance method) updates members that
				17	// refer to the next token in the input string. The user may optionally
				18	// configure the tokenizer to return delimiters.
				19	//
mattm@chromium.org	c10b705	2009-08-26 10:19:43 +0900	[diff] [blame]	20	// Warning: be careful not to pass a C string into the 2-arg constructor:
				21	// StringTokenizer t("this is a test", " "); // WRONG
				22	// This will create a temporary std::string, save the begin() and end()
				23	// iterators, and then the string will be freed before we actually start
				24	// tokenizing it.
				25	// Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
				26	//
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	27	//
				28	// EXAMPLE 1:
				29	//
mattm@chromium.org	c10b705	2009-08-26 10:19:43 +0900	[diff] [blame]	30	//   char input[] = "this is a test";
				31	//   CStringTokenizer t(input, input + strlen(input), " ");
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	32	//   while (t.GetNext()) {
				33	//     printf("%s\n", t.token().c_str());
				34	//   }
				35	//
				36	// Output:
				37	//
				38	// this
				39	// is
				40	// a
				41	// test
				42	//
				43	//
				44	// EXAMPLE 2:
				45	//
mattm@chromium.org	c10b705	2009-08-26 10:19:43 +0900	[diff] [blame]	46	//   std::string input = "no-cache=\"foo, bar\", private";
				47	//   StringTokenizer t(input, ", ");
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	48	//   t.set_quote_chars("\"");
				49	//   while (t.GetNext()) {
				50	//     printf("%s\n", t.token().c_str());
				51	//   }
				52	//
				53	// Output:
				54	//
				55	// no-cache="foo, bar"
				56	// private
				57	//
				58	//
				59	// EXAMPLE 3:
				60	//
				61	//   bool next_is_option = false, next_is_value = false;
				62	//   std::string input = "text/html; charset=UTF-8; foo=bar";
				63	//   StringTokenizer t(input, "; =");
				64	//   t.set_options(StringTokenizer::RETURN_DELIMS);
				65	//   while (t.GetNext()) {
				66	//     if (t.token_is_delim()) {
				67	//       switch (*t.token_begin()) {
				68	//         case ';':
				69	//           next_is_option = true;
				70	//           break;
				71	//         case '=':
				72	//           next_is_value = true;
				73	//           break;
				74	//       }
				75	//     } else {
				76	//       const char* label;
				77	//       if (next_is_option) {
				78	//         label = "option-name";
				79	//         next_is_option = false;
				80	//       } else if (next_is_value) {
				81	//         label = "option-value";
				82	//         next_is_value = false;
				83	//       } else {
				84	//         label = "mime-type";
				85	//       }
				86	//       printf("%s: %s\n", label, t.token().c_str());
				87	//     }
				88	//   }
				89	//
				90	//
ericroman@google.com	6dd1ab7	2008-09-06 10:00:53 +0900	[diff] [blame]	91	template <class str, class const_iterator>
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	92	class StringTokenizerT {
				93	public:
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	94	typedef typename str::value_type char_type;
				95
				96	// Options that may be pass to set_options()
				97	enum {
				98	// Specifies the delimiters should be returned as tokens
				99	RETURN_DELIMS = 1 << 0,
				100	};
				101
mattm@chromium.org	c10b705	2009-08-26 10:19:43 +0900	[diff] [blame]	102	// The string object must live longer than the tokenizer. (In particular this
				103	// should not be constructed with a temporary.)
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	104	StringTokenizerT(const str& string,
				105	const str& delims) {
				106	Init(string.begin(), string.end(), delims);
				107	}
				108
				109	StringTokenizerT(const_iterator string_begin,
				110	const_iterator string_end,
				111	const str& delims) {
				112	Init(string_begin, string_end, delims);
				113	}
				114
				115	// Set the options for this tokenizer. By default, this is 0.
				116	void set_options(int options) { options_ = options; }
				117
				118	// Set the characters to regard as quotes. By default, this is empty. When
				119	// a quote char is encountered, the tokenizer will switch into a mode where
				120	// it ignores delimiters that it finds. It switches out of this mode once it
				121	// finds another instance of the quote char. If a backslash is encountered
				122	// within a quoted string, then the next character is skipped.
robertshield@chromium.org	a6e762c	2009-02-05 03:55:43 +0900	[diff] [blame]	123	void set_quote_chars(const str& quotes) { quotes_ = quotes; }
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	124
				125	// Call this method to advance the tokenizer to the next delimiter. This
				126	// returns false if the tokenizer is complete. This method must be called
				127	// before calling any of the token* methods.
				128	bool GetNext() {
erg@chromium.org	560daeb	2010-05-13 01:49:35 +0900	[diff] [blame]	129	if (quotes_.empty() && options_ == 0)
				130	return QuickGetNext();
				131	else
				132	return FullGetNext();
				133	}
				134
				135	// Start iterating through tokens from the beginning of the string.
				136	void Reset() {
				137	token_end_ = start_pos_;
				138	}
				139
				140	// Returns true if token is a delimiter. When the tokenizer is constructed
				141	// with the RETURN_DELIMS option, this method can be used to check if the
				142	// returned token is actually a delimiter.
				143	bool token_is_delim() const { return token_is_delim_; }
				144
				145	// If GetNext() returned true, then these methods may be used to read the
				146	// value of the token.
				147	const_iterator token_begin() const { return token_begin_; }
				148	const_iterator token_end() const { return token_end_; }
				149	str token() const { return str(token_begin_, token_end_); }
				150	base::StringPiece token_piece() const {
				151	return base::StringPiece(&*token_begin_,
				152	std::distance(token_begin_, token_end_));
				153	}
				154
				155	private:
				156	void Init(const_iterator string_begin,
				157	const_iterator string_end,
				158	const str& delims) {
				159	start_pos_ = string_begin;
				160	token_begin_ = string_begin;
				161	token_end_ = string_begin;
				162	end_ = string_end;
				163	delims_ = delims;
				164	options_ = 0;
finnur@chromium.org	436ea09	2010-07-24 22:12:43 +0900	[diff] [blame]	165	token_is_delim_ = false;
erg@chromium.org	560daeb	2010-05-13 01:49:35 +0900	[diff] [blame]	166	}
				167
				168	// Implementation of GetNext() for when we have no quote characters. We have
				169	// two separate implementations because AdvanceOne() is a hot spot in large
				170	// text files with large tokens.
				171	bool QuickGetNext() {
				172	token_is_delim_ = false;
				173	for (;;) {
				174	token_begin_ = token_end_;
				175	if (token_end_ == end_)
				176	return false;
				177	++token_end_;
				178	if (delims_.find(*token_begin_) == str::npos)
				179	break;
finnur@chromium.org	436ea09	2010-07-24 22:12:43 +0900	[diff] [blame]	180	// else skip over delimiter.
erg@chromium.org	560daeb	2010-05-13 01:49:35 +0900	[diff] [blame]	181	}
				182	while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
				183	++token_end_;
				184	return true;
				185	}
				186
				187	// Implementation of GetNext() for when we have to take quotes into account.
				188	bool FullGetNext() {
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	189	AdvanceState state;
				190	token_is_delim_ = false;
				191	for (;;) {
				192	token_begin_ = token_end_;
				193	if (token_end_ == end_)
				194	return false;
				195	++token_end_;
				196	if (AdvanceOne(&state, *token_begin_))
				197	break;
				198	if (options_ & RETURN_DELIMS) {
				199	token_is_delim_ = true;
				200	return true;
				201	}
finnur@chromium.org	436ea09	2010-07-24 22:12:43 +0900	[diff] [blame]	202	// else skip over delimiter.
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	203	}
				204	while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
				205	++token_end_;
				206	return true;
				207	}
				208
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	209	bool IsDelim(char_type c) const {
				210	return delims_.find(c) != str::npos;
				211	}
				212
				213	bool IsQuote(char_type c) const {
				214	return quotes_.find(c) != str::npos;
				215	}
				216
				217	struct AdvanceState {
				218	bool in_quote;
				219	bool in_escape;
				220	char_type quote_char;
scr@chromium.org	e3e4306	2011-06-28 02:23:28 +0900	[diff] [blame]	221	AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	222	};
				223
				224	// Returns true if a delimiter was not hit.
				225	bool AdvanceOne(AdvanceState* state, char_type c) {
				226	if (state->in_quote) {
				227	if (state->in_escape) {
				228	state->in_escape = false;
				229	} else if (c == '\\') {
				230	state->in_escape = true;
				231	} else if (c == state->quote_char) {
				232	state->in_quote = false;
				233	}
				234	} else {
				235	if (IsDelim(c))
				236	return false;
				237	state->in_quote = IsQuote(state->quote_char = c);
				238	}
				239	return true;
				240	}
				241
tommi@chromium.org	1b78831	2009-10-17 06:41:09 +0900	[diff] [blame]	242	const_iterator start_pos_;
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	243	const_iterator token_begin_;
				244	const_iterator token_end_;
				245	const_iterator end_;
				246	str delims_;
				247	str quotes_;
				248	int options_;
				249	bool token_is_delim_;
				250	};
				251
ericroman@google.com	6dd1ab7	2008-09-06 10:00:53 +0900	[diff] [blame]	252	typedef StringTokenizerT<std::string, std::string::const_iterator>
				253	StringTokenizer;
				254	typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
				255	WStringTokenizer;
				256	typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
initial.commit	3f4a732	2008-07-27 06:49:38 +0900	[diff] [blame]	257
brettw@chromium.org	2cbc287	2013-02-02 09:21:39 +0900	[diff] [blame]	258	} // namespace base
				259
				260	#endif // BASE_STRINGS_STRING_TOKENIZER_H_