blob: f8a1f60cdd04ac9578aa374eb2dd7599bf3ee9b6 [file] [log] [blame]
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
ager@chromium.org5ec48922009-05-05 07:25:34 +000028#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000030
31#include <sys/types.h>
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +000032#include <globals.h>
/**
 * \file
 * Definitions and convenience functions for working with unicode.
 */
37
38namespace unibrow {
39
// A single Unicode character value (code point or UTF-16 code unit).
typedef unsigned int uchar;
// A single raw octet, e.g. one byte of UTF-8 encoded data.
typedef unsigned char byte;
42
/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000048
49template <class T, int size = 256>
50class Predicate {
51 public:
52 inline Predicate() { }
53 inline bool get(uchar c);
54 private:
55 friend class Test;
56 bool CalculateValue(uchar c);
57 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
61 value_(value) { }
62 uchar code_point_ : 21;
63 bool value_ : 1;
64 };
65 static const int kSize = size;
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion. It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context. Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77 inline Mapping() { }
78 inline int get(uchar c, uchar n, uchar* result);
79 private:
80 friend class Test;
81 int CalculateValue(uchar c, uchar n, uchar* result);
82 struct CacheEntry {
ager@chromium.orga74f0da2008-12-03 16:05:52 +000083 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000084 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
86 offset_(offset) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +000087 uchar code_point_;
88 signed offset_;
89 static const int kNoChar = (1 << 21) - 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000090 };
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98 friend class Test;
99 static int GetByteCount();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000100 static const uchar kMaxCodePoint;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000101};
102
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000103class Utf16 {
104 public:
105 static inline bool IsLeadSurrogate(int code) {
106 if (code == kNoPreviousCharacter) return false;
107 return (code & 0xfc00) == 0xd800;
108 }
109 static inline bool IsTrailSurrogate(int code) {
110 if (code == kNoPreviousCharacter) return false;
111 return (code & 0xfc00) == 0xdc00;
112 }
113
114 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
115 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
116 }
117 static const int kNoPreviousCharacter = -1;
118 static const uchar kMaxNonSurrogateCharCode = 0xffff;
119 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
120 // of UTF-8 data. The special case where the unit is a surrogate
121 // trail produces 1 byte net, because the encoding of the pair is
122 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
123 // can be reclaimed.
124 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
125 // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
126 // The illegality stems from the surrogate not being part of a pair.
127 static const int kUtf8BytesToCodeASurrogate = 3;
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000128 static inline uint16_t LeadSurrogate(uint32_t char_code) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000129 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
130 }
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000131 static inline uint16_t TrailSurrogate(uint32_t char_code) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000132 return 0xdc00 + (char_code & 0x3ff);
133 }
134};
135
// Constants and helpers for the one-byte (Latin-1) character range.
class Latin1 {
 public:
  // Largest character code treated as Latin-1: 0x7f unless ENABLE_LATIN_1
  // is defined, in which case it is 0xff.
#ifndef ENABLE_LATIN_1
  static const unsigned kMaxChar = 0x7f;
#else
  static const unsigned kMaxChar = 0xff;
#endif
  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if the character does not convert back to Latin-1 via
  // the inverse operation (upper to lower, etc.).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000148
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000149class Utf8 {
150 public:
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000151 static inline uchar Length(uchar chr, int previous);
152 static inline unsigned Encode(
153 char* out, uchar c, int previous);
erik.corry@gmail.comd88afa22010-09-15 12:33:05 +0000154 static uchar CalculateValue(const byte* str,
155 unsigned length,
156 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000157 static const uchar kBadChar = 0xFFFD;
158 static const unsigned kMaxEncodedSize = 4;
159 static const unsigned kMaxOneByteChar = 0x7f;
160 static const unsigned kMaxTwoByteChar = 0x7ff;
161 static const unsigned kMaxThreeByteChar = 0xffff;
162 static const unsigned kMaxFourByteChar = 0x1fffff;
163
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000164 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
165 // that match are coded as a 4 byte UTF-8 sequence.
166 static const unsigned kBytesSavedByCombiningSurrogates = 2;
167 static const unsigned kSizeOfUnmatchedSurrogate = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000168 static inline uchar ValueOf(const byte* str,
169 unsigned length,
170 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000171};
172
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000173
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000174class Utf8DecoderBase {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000175 public:
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000176 // Initialization done in subclass.
177 inline Utf8DecoderBase();
178 inline Utf8DecoderBase(uint16_t* buffer,
179 unsigned buffer_length,
180 const uint8_t* stream,
181 unsigned stream_length);
182 inline unsigned Utf16Length() const { return utf16_length_; }
183 protected:
184 // This reads all characters and sets the utf16_length_.
185 // The first buffer_length utf16 chars are cached in the buffer.
186 void Reset(uint16_t* buffer,
187 unsigned buffer_length,
188 const uint8_t* stream,
189 unsigned stream_length);
190 static void WriteUtf16Slow(const uint8_t* stream,
191 uint16_t* data,
192 unsigned length);
193 const uint8_t* unbuffered_start_;
194 unsigned utf16_length_;
195 bool last_byte_of_buffer_unused_;
196 private:
197 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
198};
199
200template <unsigned kBufferSize>
201class Utf8Decoder : public Utf8DecoderBase {
202 public:
203 inline Utf8Decoder() {}
204 inline Utf8Decoder(const char* stream, unsigned length);
205 inline void Reset(const char* stream, unsigned length);
206 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
207 private:
208 uint16_t buffer_[kBufferSize];
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000209};
210
whesse@chromium.orge90029b2010-08-02 11:52:17 +0000211
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000212struct Uppercase {
213 static bool Is(uchar c);
214};
215struct Lowercase {
216 static bool Is(uchar c);
217};
218struct Letter {
219 static bool Is(uchar c);
220};
221struct Space {
222 static bool Is(uchar c);
223};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000224struct Number {
225 static bool Is(uchar c);
226};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000227struct WhiteSpace {
228 static bool Is(uchar c);
229};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000230struct LineTerminator {
231 static bool Is(uchar c);
232};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000233struct CombiningMark {
234 static bool Is(uchar c);
235};
236struct ConnectorPunctuation {
237 static bool Is(uchar c);
238};
239struct ToLowercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000240 static const int kMaxWidth = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000241 static int Convert(uchar c,
242 uchar n,
243 uchar* result,
244 bool* allow_caching_ptr);
245};
246struct ToUppercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000247 static const int kMaxWidth = 3;
248 static int Convert(uchar c,
249 uchar n,
250 uchar* result,
251 bool* allow_caching_ptr);
252};
253struct Ecma262Canonicalize {
254 static const int kMaxWidth = 1;
255 static int Convert(uchar c,
256 uchar n,
257 uchar* result,
258 bool* allow_caching_ptr);
259};
260struct Ecma262UnCanonicalize {
261 static const int kMaxWidth = 4;
262 static int Convert(uchar c,
263 uchar n,
264 uchar* result,
265 bool* allow_caching_ptr);
266};
267struct CanonicalizationRange {
268 static const int kMaxWidth = 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000269 static int Convert(uchar c,
270 uchar n,
271 uchar* result,
272 bool* allow_caching_ptr);
273};
274
275} // namespace unibrow
276
ager@chromium.org5ec48922009-05-05 07:25:34 +0000277#endif // V8_UNICODE_H_