license.bot | f003cfe | 2008-08-24 09:55:55 +0900 | [diff] [blame^] | 1 | // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 4 | |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 5 | #include "base/word_iterator.h" |
maruel@google.com | 26e717a | 2008-08-08 05:48:51 +0900 | [diff] [blame] | 6 | |
| 7 | #include "base/logging.h" |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 8 | #include "unicode/ubrk.h" |
avi@google.com | 3948d4d | 2008-08-12 06:26:52 +0900 | [diff] [blame] | 9 | #include "unicode/ustring.h" |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 10 | |
| 11 | const int WordIterator::npos = -1; |
| 12 | |
| 13 | WordIterator::WordIterator(const std::wstring& str, BreakType break_type) |
| 14 | : iter_(NULL), |
| 15 | string_(str), |
| 16 | break_type_(break_type), |
| 17 | prev_(npos), |
| 18 | pos_(0) { |
| 19 | } |
| 20 | |
| 21 | WordIterator::~WordIterator() { |
| 22 | if (iter_) |
| 23 | ubrk_close(iter_); |
| 24 | } |
| 25 | |
| 26 | bool WordIterator::Init() { |
| 27 | UErrorCode status = U_ZERO_ERROR; |
| 28 | UBreakIteratorType break_type; |
| 29 | switch (break_type_) { |
| 30 | case BREAK_WORD: |
| 31 | break_type = UBRK_WORD; |
| 32 | break; |
| 33 | case BREAK_LINE: |
| 34 | break_type = UBRK_LINE; |
| 35 | break; |
| 36 | default: |
| 37 | NOTREACHED(); |
| 38 | break_type = UBRK_LINE; |
| 39 | } |
avi@google.com | 3948d4d | 2008-08-12 06:26:52 +0900 | [diff] [blame] | 40 | #if defined(WCHAR_T_IS_UTF16) |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 41 | iter_ = ubrk_open(break_type, NULL, |
| 42 | string_.data(), static_cast<int32_t>(string_.size()), |
| 43 | &status); |
avi@google.com | 3948d4d | 2008-08-12 06:26:52 +0900 | [diff] [blame] | 44 | #else // WCHAR_T_IS_UTF16 |
avi@google.com | 4fc32c0 | 2008-08-05 22:32:54 +0900 | [diff] [blame] | 45 | // When wchar_t is wider than UChar (16 bits), transform |string_| into a |
| 46 | // UChar* string. Size the UChar* buffer to be large enough to hold twice |
| 47 | // as many UTF-16 code points as there are UCS-4 characters, in case each |
| 48 | // character translates to a UTF-16 surrogate pair, and leave room for a NUL |
| 49 | // terminator. |
| 50 | // TODO(avi): avoid this alloc |
avi@google.com | 3948d4d | 2008-08-12 06:26:52 +0900 | [diff] [blame] | 51 | chars_.resize(string_.length() * sizeof(UChar) + 1); |
avi@google.com | 4fc32c0 | 2008-08-05 22:32:54 +0900 | [diff] [blame] | 52 | |
| 53 | UErrorCode error = U_ZERO_ERROR; |
| 54 | int32_t destLength; |
| 55 | u_strFromWCS(&chars_[0], chars_.size(), &destLength, string_.data(), |
| 56 | string_.length(), &error); |
| 57 | |
avi@google.com | 3948d4d | 2008-08-12 06:26:52 +0900 | [diff] [blame] | 58 | iter_ = ubrk_open(break_type, NULL, &chars_[0], destLength, &status); |
avi@google.com | 4fc32c0 | 2008-08-05 22:32:54 +0900 | [diff] [blame] | 59 | #endif |
initial.commit | 3f4a732 | 2008-07-27 06:49:38 +0900 | [diff] [blame] | 60 | if (U_FAILURE(status)) { |
| 61 | NOTREACHED() << "ubrk_open failed"; |
| 62 | return false; |
| 63 | } |
| 64 | ubrk_first(iter_); // Move the iterator to the beginning of the string. |
| 65 | return true; |
| 66 | } |
| 67 | |
| 68 | bool WordIterator::Advance() { |
| 69 | prev_ = pos_; |
| 70 | const int32_t pos = ubrk_next(iter_); |
| 71 | if (pos == UBRK_DONE) { |
| 72 | pos_ = npos; |
| 73 | return false; |
| 74 | } else { |
| 75 | pos_ = static_cast<int>(pos); |
| 76 | return true; |
| 77 | } |
| 78 | } |
| 79 | |
| 80 | bool WordIterator::IsWord() const { |
| 81 | return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); |
maruel@google.com | 26e717a | 2008-08-08 05:48:51 +0900 | [diff] [blame] | 82 | } |
| 83 | |
| 84 | std::wstring WordIterator::GetWord() const { |
| 85 | DCHECK(prev_ >= 0 && pos_ >= 0); |
| 86 | return string_.substr(prev_, pos_ - prev_); |
| 87 | } |
license.bot | f003cfe | 2008-08-24 09:55:55 +0900 | [diff] [blame^] | 88 | |