whesse@chromium.org | e90029b | 2010-08-02 11:52:17 +0000 | [diff] [blame] | 1 | // Copyright 2007-2010 the V8 project authors. All rights reserved. |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 2 | // Redistribution and use in source and binary forms, with or without |
| 3 | // modification, are permitted provided that the following conditions are |
| 4 | // met: |
| 5 | // |
| 6 | // * Redistributions of source code must retain the above copyright |
| 7 | // notice, this list of conditions and the following disclaimer. |
| 8 | // * Redistributions in binary form must reproduce the above |
| 9 | // copyright notice, this list of conditions and the following |
| 10 | // disclaimer in the documentation and/or other materials provided |
| 11 | // with the distribution. |
| 12 | // * Neither the name of Google Inc. nor the names of its |
| 13 | // contributors may be used to endorse or promote products derived |
| 14 | // from this software without specific prior written permission. |
| 15 | // |
| 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | |
ager@chromium.org | 5ec4892 | 2009-05-05 07:25:34 +0000 | [diff] [blame] | 28 | #ifndef V8_UNICODE_INL_H_ |
| 29 | #define V8_UNICODE_INL_H_ |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 30 | |
| 31 | #include "unicode.h" |
yangguo@chromium.org | 355cfd1 | 2012-08-29 15:32:24 +0000 | [diff] [blame] | 32 | #include "checks.h" |
mstarzinger@chromium.org | e27d617 | 2013-04-17 11:51:44 +0000 | [diff] [blame] | 33 | #include "platform.h" |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 34 | |
| 35 | namespace unibrow { |
| 36 | |
| 37 | template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { |
| 38 | CacheEntry entry = entries_[code_point & kMask]; |
| 39 | if (entry.code_point_ == code_point) return entry.value_; |
| 40 | return CalculateValue(code_point); |
| 41 | } |
| 42 | |
| 43 | template <class T, int s> bool Predicate<T, s>::CalculateValue( |
| 44 | uchar code_point) { |
| 45 | bool result = T::Is(code_point); |
| 46 | entries_[code_point & kMask] = CacheEntry(code_point, result); |
| 47 | return result; |
| 48 | } |
| 49 | |
| 50 | template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, |
| 51 | uchar* result) { |
| 52 | CacheEntry entry = entries_[c & kMask]; |
| 53 | if (entry.code_point_ == c) { |
| 54 | if (entry.offset_ == 0) { |
| 55 | return 0; |
| 56 | } else { |
| 57 | result[0] = c + entry.offset_; |
| 58 | return 1; |
| 59 | } |
| 60 | } else { |
| 61 | return CalculateValue(c, n, result); |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, |
| 66 | uchar* result) { |
| 67 | bool allow_caching = true; |
| 68 | int length = T::Convert(c, n, result, &allow_caching); |
| 69 | if (allow_caching) { |
| 70 | if (length == 1) { |
| 71 | entries_[c & kMask] = CacheEntry(c, result[0] - c); |
| 72 | return 1; |
| 73 | } else { |
| 74 | entries_[c & kMask] = CacheEntry(c, 0); |
| 75 | return 0; |
| 76 | } |
| 77 | } else { |
| 78 | return length; |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | |
mvstanton@chromium.org | 6bec009 | 2013-01-23 13:46:53 +0000 | [diff] [blame] | 83 | uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { |
yangguo@chromium.org | 46a2a51 | 2013-01-18 16:29:40 +0000 | [diff] [blame] | 84 | ASSERT(c > Latin1::kMaxChar); |
| 85 | switch (c) { |
mvstanton@chromium.org | 6bec009 | 2013-01-23 13:46:53 +0000 | [diff] [blame] | 86 | // This are equivalent characters in unicode. |
| 87 | case 0x39c: |
| 88 | case 0x3bc: |
| 89 | return 0xb5; |
| 90 | // This is an uppercase of a Latin-1 character |
| 91 | // outside of Latin-1. |
yangguo@chromium.org | 46a2a51 | 2013-01-18 16:29:40 +0000 | [diff] [blame] | 92 | case 0x178: |
mvstanton@chromium.org | 6bec009 | 2013-01-23 13:46:53 +0000 | [diff] [blame] | 93 | return 0xff; |
yangguo@chromium.org | 46a2a51 | 2013-01-18 16:29:40 +0000 | [diff] [blame] | 94 | } |
mvstanton@chromium.org | 6bec009 | 2013-01-23 13:46:53 +0000 | [diff] [blame] | 95 | return 0; |
yangguo@chromium.org | 46a2a51 | 2013-01-18 16:29:40 +0000 | [diff] [blame] | 96 | } |
| 97 | |
| 98 | |
svenpanne@chromium.org | 2bda543 | 2013-03-15 12:39:50 +0000 | [diff] [blame] | 99 | unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
| 100 | static const int kMask = ~(1 << 6); |
| 101 | if (c <= kMaxOneByteChar) { |
| 102 | str[0] = c; |
| 103 | return 1; |
| 104 | } |
| 105 | str[0] = 0xC0 | (c >> 6); |
| 106 | str[1] = 0x80 | (c & kMask); |
| 107 | return 2; |
| 108 | } |
| 109 | |
| 110 | |
yangguo@chromium.org | 154ff99 | 2012-03-13 08:09:54 +0000 | [diff] [blame] | 111 | unsigned Utf8::Encode(char* str, uchar c, int previous) { |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 112 | static const int kMask = ~(1 << 6); |
| 113 | if (c <= kMaxOneByteChar) { |
| 114 | str[0] = c; |
| 115 | return 1; |
| 116 | } else if (c <= kMaxTwoByteChar) { |
| 117 | str[0] = 0xC0 | (c >> 6); |
| 118 | str[1] = 0x80 | (c & kMask); |
| 119 | return 2; |
| 120 | } else if (c <= kMaxThreeByteChar) { |
yangguo@chromium.org | 154ff99 | 2012-03-13 08:09:54 +0000 | [diff] [blame] | 121 | if (Utf16::IsTrailSurrogate(c) && |
| 122 | Utf16::IsLeadSurrogate(previous)) { |
| 123 | const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
| 124 | return Encode(str - kUnmatchedSize, |
| 125 | Utf16::CombineSurrogatePair(previous, c), |
| 126 | Utf16::kNoPreviousCharacter) - kUnmatchedSize; |
| 127 | } |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 128 | str[0] = 0xE0 | (c >> 12); |
| 129 | str[1] = 0x80 | ((c >> 6) & kMask); |
| 130 | str[2] = 0x80 | (c & kMask); |
| 131 | return 3; |
| 132 | } else { |
| 133 | str[0] = 0xF0 | (c >> 18); |
| 134 | str[1] = 0x80 | ((c >> 12) & kMask); |
| 135 | str[2] = 0x80 | ((c >> 6) & kMask); |
| 136 | str[3] = 0x80 | (c & kMask); |
| 137 | return 4; |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | |
| 142 | uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { |
| 143 | if (length <= 0) return kBadChar; |
| 144 | byte first = bytes[0]; |
| 145 | // Characters between 0000 and 0007F are encoded as a single character |
| 146 | if (first <= kMaxOneByteChar) { |
| 147 | *cursor += 1; |
| 148 | return first; |
| 149 | } |
| 150 | return CalculateValue(bytes, length, cursor); |
| 151 | } |
| 152 | |
yangguo@chromium.org | 154ff99 | 2012-03-13 08:09:54 +0000 | [diff] [blame] | 153 | unsigned Utf8::Length(uchar c, int previous) { |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 154 | if (c <= kMaxOneByteChar) { |
| 155 | return 1; |
| 156 | } else if (c <= kMaxTwoByteChar) { |
| 157 | return 2; |
| 158 | } else if (c <= kMaxThreeByteChar) { |
yangguo@chromium.org | 154ff99 | 2012-03-13 08:09:54 +0000 | [diff] [blame] | 159 | if (Utf16::IsTrailSurrogate(c) && |
| 160 | Utf16::IsLeadSurrogate(previous)) { |
| 161 | return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
| 162 | } |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 163 | return 3; |
| 164 | } else { |
| 165 | return 4; |
| 166 | } |
| 167 | } |
| 168 | |
yangguo@chromium.org | a6bbcc8 | 2012-12-21 12:35:02 +0000 | [diff] [blame] | 169 | Utf8DecoderBase::Utf8DecoderBase() |
| 170 | : unbuffered_start_(NULL), |
| 171 | utf16_length_(0), |
| 172 | last_byte_of_buffer_unused_(false) {} |
| 173 | |
| 174 | Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, |
| 175 | unsigned buffer_length, |
| 176 | const uint8_t* stream, |
| 177 | unsigned stream_length) { |
| 178 | Reset(buffer, buffer_length, stream, stream_length); |
| 179 | } |
| 180 | |
| 181 | template<unsigned kBufferSize> |
| 182 | Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) |
| 183 | : Utf8DecoderBase(buffer_, |
| 184 | kBufferSize, |
| 185 | reinterpret_cast<const uint8_t*>(stream), |
| 186 | length) { |
| 187 | } |
| 188 | |
| 189 | template<unsigned kBufferSize> |
| 190 | void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { |
| 191 | Utf8DecoderBase::Reset(buffer_, |
| 192 | kBufferSize, |
| 193 | reinterpret_cast<const uint8_t*>(stream), |
| 194 | length); |
| 195 | } |
| 196 | |
| 197 | template <unsigned kBufferSize> |
| 198 | unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, |
| 199 | unsigned length) const { |
| 200 | ASSERT(length > 0); |
| 201 | if (length > utf16_length_) length = utf16_length_; |
| 202 | // memcpy everything in buffer. |
| 203 | unsigned buffer_length = |
| 204 | last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; |
| 205 | unsigned memcpy_length = length <= buffer_length ? length : buffer_length; |
mstarzinger@chromium.org | e27d617 | 2013-04-17 11:51:44 +0000 | [diff] [blame] | 206 | v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t)); |
yangguo@chromium.org | a6bbcc8 | 2012-12-21 12:35:02 +0000 | [diff] [blame] | 207 | if (length <= buffer_length) return length; |
| 208 | ASSERT(unbuffered_start_ != NULL); |
| 209 | // Copy the rest the slow way. |
| 210 | WriteUtf16Slow(unbuffered_start_, |
| 211 | data + buffer_length, |
| 212 | length - buffer_length); |
| 213 | return length; |
christian.plesner.hansen | 43d26ec | 2008-07-03 15:10:15 +0000 | [diff] [blame] | 214 | } |
| 215 | |
| 216 | } // namespace unibrow |
| 217 | |
ager@chromium.org | 5ec4892 | 2009-05-05 07:25:34 +0000 | [diff] [blame] | 218 | #endif // V8_UNICODE_INL_H_ |