blob: f861f9f2d47449945d62a6fbc8044abbcd0b2a2b [file] [log] [blame]
// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
ager@chromium.org5ec48922009-05-05 07:25:34 +000028#ifndef V8_UNICODE_INL_H_
29#define V8_UNICODE_INL_H_
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000030
31#include "unicode.h"
yangguo@chromium.org355cfd12012-08-29 15:32:24 +000032#include "checks.h"
mstarzinger@chromium.orge27d6172013-04-17 11:51:44 +000033#include "platform.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034
35namespace unibrow {
36
37template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
38 CacheEntry entry = entries_[code_point & kMask];
39 if (entry.code_point_ == code_point) return entry.value_;
40 return CalculateValue(code_point);
41}
42
43template <class T, int s> bool Predicate<T, s>::CalculateValue(
44 uchar code_point) {
45 bool result = T::Is(code_point);
46 entries_[code_point & kMask] = CacheEntry(code_point, result);
47 return result;
48}
49
50template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
51 uchar* result) {
52 CacheEntry entry = entries_[c & kMask];
53 if (entry.code_point_ == c) {
54 if (entry.offset_ == 0) {
55 return 0;
56 } else {
57 result[0] = c + entry.offset_;
58 return 1;
59 }
60 } else {
61 return CalculateValue(c, n, result);
62 }
63}
64
65template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
66 uchar* result) {
67 bool allow_caching = true;
68 int length = T::Convert(c, n, result, &allow_caching);
69 if (allow_caching) {
70 if (length == 1) {
71 entries_[c & kMask] = CacheEntry(c, result[0] - c);
72 return 1;
73 } else {
74 entries_[c & kMask] = CacheEntry(c, 0);
75 return 0;
76 }
77 } else {
78 return length;
79 }
80}
81
82
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +000083uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
yangguo@chromium.org46a2a512013-01-18 16:29:40 +000084 ASSERT(c > Latin1::kMaxChar);
85 switch (c) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +000086 // This are equivalent characters in unicode.
87 case 0x39c:
88 case 0x3bc:
89 return 0xb5;
90 // This is an uppercase of a Latin-1 character
91 // outside of Latin-1.
yangguo@chromium.org46a2a512013-01-18 16:29:40 +000092 case 0x178:
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +000093 return 0xff;
yangguo@chromium.org46a2a512013-01-18 16:29:40 +000094 }
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +000095 return 0;
yangguo@chromium.org46a2a512013-01-18 16:29:40 +000096}
97
98
svenpanne@chromium.org2bda5432013-03-15 12:39:50 +000099unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
100 static const int kMask = ~(1 << 6);
101 if (c <= kMaxOneByteChar) {
102 str[0] = c;
103 return 1;
104 }
105 str[0] = 0xC0 | (c >> 6);
106 str[1] = 0x80 | (c & kMask);
107 return 2;
108}
109
110
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000111unsigned Utf8::Encode(char* str, uchar c, int previous) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000112 static const int kMask = ~(1 << 6);
113 if (c <= kMaxOneByteChar) {
114 str[0] = c;
115 return 1;
116 } else if (c <= kMaxTwoByteChar) {
117 str[0] = 0xC0 | (c >> 6);
118 str[1] = 0x80 | (c & kMask);
119 return 2;
120 } else if (c <= kMaxThreeByteChar) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000121 if (Utf16::IsTrailSurrogate(c) &&
122 Utf16::IsLeadSurrogate(previous)) {
123 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
124 return Encode(str - kUnmatchedSize,
125 Utf16::CombineSurrogatePair(previous, c),
126 Utf16::kNoPreviousCharacter) - kUnmatchedSize;
127 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000128 str[0] = 0xE0 | (c >> 12);
129 str[1] = 0x80 | ((c >> 6) & kMask);
130 str[2] = 0x80 | (c & kMask);
131 return 3;
132 } else {
133 str[0] = 0xF0 | (c >> 18);
134 str[1] = 0x80 | ((c >> 12) & kMask);
135 str[2] = 0x80 | ((c >> 6) & kMask);
136 str[3] = 0x80 | (c & kMask);
137 return 4;
138 }
139}
140
141
142uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
143 if (length <= 0) return kBadChar;
144 byte first = bytes[0];
145 // Characters between 0000 and 0007F are encoded as a single character
146 if (first <= kMaxOneByteChar) {
147 *cursor += 1;
148 return first;
149 }
150 return CalculateValue(bytes, length, cursor);
151}
152
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000153unsigned Utf8::Length(uchar c, int previous) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000154 if (c <= kMaxOneByteChar) {
155 return 1;
156 } else if (c <= kMaxTwoByteChar) {
157 return 2;
158 } else if (c <= kMaxThreeByteChar) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000159 if (Utf16::IsTrailSurrogate(c) &&
160 Utf16::IsLeadSurrogate(previous)) {
161 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
162 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000163 return 3;
164 } else {
165 return 4;
166 }
167}
168
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000169Utf8DecoderBase::Utf8DecoderBase()
170 : unbuffered_start_(NULL),
171 utf16_length_(0),
172 last_byte_of_buffer_unused_(false) {}
173
174Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
175 unsigned buffer_length,
176 const uint8_t* stream,
177 unsigned stream_length) {
178 Reset(buffer, buffer_length, stream, stream_length);
179}
180
181template<unsigned kBufferSize>
182Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
183 : Utf8DecoderBase(buffer_,
184 kBufferSize,
185 reinterpret_cast<const uint8_t*>(stream),
186 length) {
187}
188
189template<unsigned kBufferSize>
190void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
191 Utf8DecoderBase::Reset(buffer_,
192 kBufferSize,
193 reinterpret_cast<const uint8_t*>(stream),
194 length);
195}
196
197template <unsigned kBufferSize>
198unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
199 unsigned length) const {
200 ASSERT(length > 0);
201 if (length > utf16_length_) length = utf16_length_;
202 // memcpy everything in buffer.
203 unsigned buffer_length =
204 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
205 unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
mstarzinger@chromium.orge27d6172013-04-17 11:51:44 +0000206 v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t));
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000207 if (length <= buffer_length) return length;
208 ASSERT(unbuffered_start_ != NULL);
209 // Copy the rest the slow way.
210 WriteUtf16Slow(unbuffered_start_,
211 data + buffer_length,
212 length - buffer_length);
213 return length;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000214}
215
216} // namespace unibrow
217
ager@chromium.org5ec48922009-05-05 07:25:34 +0000218#endif // V8_UNICODE_INL_H_