blob: f41d767cb4a9b3bf349a3191aadb716c207e503f [file] [log] [blame]
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
initial.commit3f4a7322008-07-27 06:49:38 +09004
5#ifndef BASE_WORD_ITERATOR_H__
6#define BASE_WORD_ITERATOR_H__
7
maruel@google.com26e717a2008-08-08 05:48:51 +09008#include <string>
avi@google.com3948d4d2008-08-12 06:26:52 +09009#include <vector>
10
11#include "unicode/uchar.h"
maruel@google.com26e717a2008-08-08 05:48:51 +090012
initial.commit3f4a7322008-07-27 06:49:38 +090013#include "base/basictypes.h"
14
// The WordIterator class iterates through the words and word breaks
// in a string.  (In the string " foo bar! ", the word breaks are at the
// periods in ". .foo. .bar.!. .".)
//
// To extract the words from a string, move a WordIterator through the
// string and test whether IsWord() is true.  E.g.,
//   WordIterator iter(str, WordIterator::BREAK_WORD);
//   if (!iter.Init()) return false;
//   while (iter.Advance()) {
//     if (iter.IsWord()) {
//       // region [iter.prev(),iter.pos()) contains a word.
//       LOG(INFO) << "word: " << iter.GetWord();
//     }
//   }
29
30
31class WordIterator {
32 public:
33 enum BreakType {
34 BREAK_WORD,
35 BREAK_LINE
36 };
37
38 // Requires |str| to live as long as the WordIterator does.
39 WordIterator(const std::wstring& str, BreakType break_type);
40 ~WordIterator();
41
42 // Init() must be called before any of the iterators are valid.
43 // Returns false if ICU failed to initialize.
44 bool Init();
45
46 // Return the current break position within the string,
47 // or WordIterator::npos when done.
48 int pos() const { return pos_; }
49 // Return the value of pos() returned before Advance() was last called.
50 int prev() const { return prev_; }
51
52 // A special position value indicating "end of string".
53 static const int npos;
54
55 // Advance to the next break. Returns false if we've run past the end of
56 // the string. (Note that the very last "word break" is after the final
57 // character in the string, and when we advance to that position it's the
58 // last time Advance() returns true.)
59 bool Advance();
60
61 // Returns true if the break we just hit is the end of a word.
62 // (Otherwise, the break iterator just skipped over e.g. whitespace
63 // or punctuation.)
64 bool IsWord() const;
65
66 // Return the word between prev() and pos().
67 // Advance() must have been called successfully at least once
68 // for pos() to have advanced to somewhere useful.
maruel@google.com26e717a2008-08-08 05:48:51 +090069 std::wstring GetWord() const;
initial.commit3f4a7322008-07-27 06:49:38 +090070
71 private:
72 // ICU iterator.
73 void* iter_;
avi@google.com3948d4d2008-08-12 06:26:52 +090074#if !defined(WCHAR_T_IS_UTF16)
avi@google.com4fc32c02008-08-05 22:32:54 +090075 std::vector<UChar> chars_;
76#endif
initial.commit3f4a7322008-07-27 06:49:38 +090077
78 // The string we're iterating over.
79 const std::wstring& string_;
80
81 // The breaking style (word/line).
82 BreakType break_type_;
83
84 // Previous and current iterator positions.
85 int prev_, pos_;
86
87 DISALLOW_EVIL_CONSTRUCTORS(WordIterator);
88};
89
deanm@google.com97137862008-08-14 20:44:17 +090090#endif // BASE_WORD_ITERATOR_H__
license.botf003cfe2008-08-24 09:55:55 +090091