Ben Murdoch | bb769b2 | 2010-08-11 14:56:33 +0100 | [diff] [blame] | 1 | // Copyright 2007-2010 the V8 project authors. All rights reserved. |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 4 | |
| 5 | #ifndef V8_UNICODE_INL_H_ |
| 6 | #define V8_UNICODE_INL_H_ |
| 7 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 8 | #include "src/unicode.h" |
| 9 | #include "src/base/logging.h" |
| 10 | #include "src/utils.h" |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 11 | |
| 12 | namespace unibrow { |
| 13 | |
| 14 | template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { |
| 15 | CacheEntry entry = entries_[code_point & kMask]; |
| 16 | if (entry.code_point_ == code_point) return entry.value_; |
| 17 | return CalculateValue(code_point); |
| 18 | } |
| 19 | |
| 20 | template <class T, int s> bool Predicate<T, s>::CalculateValue( |
| 21 | uchar code_point) { |
| 22 | bool result = T::Is(code_point); |
| 23 | entries_[code_point & kMask] = CacheEntry(code_point, result); |
| 24 | return result; |
| 25 | } |
| 26 | |
| 27 | template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, |
| 28 | uchar* result) { |
| 29 | CacheEntry entry = entries_[c & kMask]; |
| 30 | if (entry.code_point_ == c) { |
| 31 | if (entry.offset_ == 0) { |
| 32 | return 0; |
| 33 | } else { |
| 34 | result[0] = c + entry.offset_; |
| 35 | return 1; |
| 36 | } |
| 37 | } else { |
| 38 | return CalculateValue(c, n, result); |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, |
| 43 | uchar* result) { |
| 44 | bool allow_caching = true; |
| 45 | int length = T::Convert(c, n, result, &allow_caching); |
| 46 | if (allow_caching) { |
| 47 | if (length == 1) { |
| 48 | entries_[c & kMask] = CacheEntry(c, result[0] - c); |
| 49 | return 1; |
| 50 | } else { |
| 51 | entries_[c & kMask] = CacheEntry(c, 0); |
| 52 | return 0; |
| 53 | } |
| 54 | } else { |
| 55 | return length; |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 60 | uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { |
| 61 | DCHECK(c > Latin1::kMaxChar); |
| 62 | switch (c) { |
| 63 | // This are equivalent characters in unicode. |
| 64 | case 0x39c: |
| 65 | case 0x3bc: |
| 66 | return 0xb5; |
| 67 | // This is an uppercase of a Latin-1 character |
| 68 | // outside of Latin-1. |
| 69 | case 0x178: |
| 70 | return 0xff; |
| 71 | } |
| 72 | return 0; |
| 73 | } |
| 74 | |
| 75 | |
| 76 | unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
| 77 | static const int kMask = ~(1 << 6); |
| 78 | if (c <= kMaxOneByteChar) { |
| 79 | str[0] = c; |
| 80 | return 1; |
| 81 | } |
| 82 | str[0] = 0xC0 | (c >> 6); |
| 83 | str[1] = 0x80 | (c & kMask); |
| 84 | return 2; |
| 85 | } |
| 86 | |
| 87 | // Encode encodes the UTF-16 code units c and previous into the given str |
| 88 | // buffer, and combines surrogate code units into single code points. If |
| 89 | // replace_invalid is set to true, orphan surrogate code units will be replaced |
| 90 | // with kBadChar. |
| 91 | unsigned Utf8::Encode(char* str, |
| 92 | uchar c, |
| 93 | int previous, |
| 94 | bool replace_invalid) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 95 | static const int kMask = ~(1 << 6); |
| 96 | if (c <= kMaxOneByteChar) { |
| 97 | str[0] = c; |
| 98 | return 1; |
| 99 | } else if (c <= kMaxTwoByteChar) { |
| 100 | str[0] = 0xC0 | (c >> 6); |
| 101 | str[1] = 0x80 | (c & kMask); |
| 102 | return 2; |
| 103 | } else if (c <= kMaxThreeByteChar) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 104 | if (Utf16::IsSurrogatePair(previous, c)) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 105 | const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
| 106 | return Encode(str - kUnmatchedSize, |
| 107 | Utf16::CombineSurrogatePair(previous, c), |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 108 | Utf16::kNoPreviousCharacter, |
| 109 | replace_invalid) - kUnmatchedSize; |
| 110 | } else if (replace_invalid && |
| 111 | (Utf16::IsLeadSurrogate(c) || |
| 112 | Utf16::IsTrailSurrogate(c))) { |
| 113 | c = kBadChar; |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 114 | } |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 115 | str[0] = 0xE0 | (c >> 12); |
| 116 | str[1] = 0x80 | ((c >> 6) & kMask); |
| 117 | str[2] = 0x80 | (c & kMask); |
| 118 | return 3; |
| 119 | } else { |
| 120 | str[0] = 0xF0 | (c >> 18); |
| 121 | str[1] = 0x80 | ((c >> 12) & kMask); |
| 122 | str[2] = 0x80 | ((c >> 6) & kMask); |
| 123 | str[3] = 0x80 | (c & kMask); |
| 124 | return 4; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | |
| 129 | uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { |
| 130 | if (length <= 0) return kBadChar; |
| 131 | byte first = bytes[0]; |
| 132 | // Characters between 0000 and 0007F are encoded as a single character |
| 133 | if (first <= kMaxOneByteChar) { |
| 134 | *cursor += 1; |
| 135 | return first; |
| 136 | } |
| 137 | return CalculateValue(bytes, length, cursor); |
| 138 | } |
| 139 | |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 140 | unsigned Utf8::Length(uchar c, int previous) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 141 | if (c <= kMaxOneByteChar) { |
| 142 | return 1; |
| 143 | } else if (c <= kMaxTwoByteChar) { |
| 144 | return 2; |
| 145 | } else if (c <= kMaxThreeByteChar) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 146 | if (Utf16::IsTrailSurrogate(c) && |
| 147 | Utf16::IsLeadSurrogate(previous)) { |
| 148 | return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
| 149 | } |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 150 | return 3; |
| 151 | } else { |
| 152 | return 4; |
| 153 | } |
| 154 | } |
| 155 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 156 | Utf8DecoderBase::Utf8DecoderBase() |
| 157 | : unbuffered_start_(NULL), |
| 158 | utf16_length_(0), |
| 159 | last_byte_of_buffer_unused_(false) {} |
| 160 | |
| 161 | Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, |
| 162 | unsigned buffer_length, |
| 163 | const uint8_t* stream, |
| 164 | unsigned stream_length) { |
| 165 | Reset(buffer, buffer_length, stream, stream_length); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 166 | } |
| 167 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 168 | template<unsigned kBufferSize> |
| 169 | Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) |
| 170 | : Utf8DecoderBase(buffer_, |
| 171 | kBufferSize, |
| 172 | reinterpret_cast<const uint8_t*>(stream), |
| 173 | length) { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 174 | } |
| 175 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 176 | template<unsigned kBufferSize> |
| 177 | void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { |
| 178 | Utf8DecoderBase::Reset(buffer_, |
| 179 | kBufferSize, |
| 180 | reinterpret_cast<const uint8_t*>(stream), |
| 181 | length); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 182 | } |
| 183 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame^] | 184 | template <unsigned kBufferSize> |
| 185 | unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, |
| 186 | unsigned length) const { |
| 187 | DCHECK(length > 0); |
| 188 | if (length > utf16_length_) length = utf16_length_; |
| 189 | // memcpy everything in buffer. |
| 190 | unsigned buffer_length = |
| 191 | last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; |
| 192 | unsigned memcpy_length = length <= buffer_length ? length : buffer_length; |
| 193 | v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); |
| 194 | if (length <= buffer_length) return length; |
| 195 | DCHECK(unbuffered_start_ != NULL); |
| 196 | // Copy the rest the slow way. |
| 197 | WriteUtf16Slow(unbuffered_start_, |
| 198 | data + buffer_length, |
| 199 | length - buffer_length); |
| 200 | return length; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 201 | } |
| 202 | |
| 203 | } // namespace unibrow |
| 204 | |
| 205 | #endif // V8_UNICODE_INL_H_ |