blob: 42a81824bac3a42941c41ad2550088b2532d424f [file] [log] [blame]
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
ager@chromium.org5ec48922009-05-05 07:25:34 +000028#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000030
31#include <sys/types.h>
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +000032#include <globals.h>
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// A single character value as used throughout this header (code points and
// UTF-16 code units are both passed around as uchar).
typedef unsigned int uchar;
// One raw byte of encoded input (e.g. a UTF-8 code unit).
typedef unsigned char byte;
42
/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000048
49template <class T, int size = 256>
50class Predicate {
51 public:
52 inline Predicate() { }
53 inline bool get(uchar c);
54 private:
55 friend class Test;
56 bool CalculateValue(uchar c);
57 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
61 value_(value) { }
62 uchar code_point_ : 21;
63 bool value_ : 1;
64 };
65 static const int kSize = size;
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion. It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context. Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77 inline Mapping() { }
78 inline int get(uchar c, uchar n, uchar* result);
79 private:
80 friend class Test;
81 int CalculateValue(uchar c, uchar n, uchar* result);
82 struct CacheEntry {
ager@chromium.orga74f0da2008-12-03 16:05:52 +000083 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000084 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
86 offset_(offset) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +000087 uchar code_point_;
88 signed offset_;
89 static const int kNoChar = (1 << 21) - 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000090 };
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98 friend class Test;
99 static int GetByteCount();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000100 static const uchar kMaxCodePoint;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000101};
102
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000103class Utf16 {
104 public:
105 static inline bool IsLeadSurrogate(int code) {
106 if (code == kNoPreviousCharacter) return false;
107 return (code & 0xfc00) == 0xd800;
108 }
109 static inline bool IsTrailSurrogate(int code) {
110 if (code == kNoPreviousCharacter) return false;
111 return (code & 0xfc00) == 0xdc00;
112 }
113
114 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
115 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
116 }
117 static const int kNoPreviousCharacter = -1;
118 static const uchar kMaxNonSurrogateCharCode = 0xffff;
119 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
120 // of UTF-8 data. The special case where the unit is a surrogate
121 // trail produces 1 byte net, because the encoding of the pair is
122 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
123 // can be reclaimed.
124 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
125 // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
126 // The illegality stems from the surrogate not being part of a pair.
127 static const int kUtf8BytesToCodeASurrogate = 3;
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000128 static inline uint16_t LeadSurrogate(uint32_t char_code) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000129 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
130 }
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000131 static inline uint16_t TrailSurrogate(uint32_t char_code) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000132 return 0xdc00 + (char_code & 0x3ff);
133 }
134};
135
// Constants and helpers for the Latin-1 range.
class Latin1 {
 public:
  // Largest Latin-1 character code.
  static const unsigned kMaxChar = 0xff;
  // Returns 0 if character does not convert to single latin-1 character
  // or if the character does not convert back to latin-1 via inverse
  // operation (upper to lower, etc). Defined outside this header.
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000144
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000145class Utf8 {
146 public:
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000147 static inline uchar Length(uchar chr, int previous);
svenpanne@chromium.org2bda5432013-03-15 12:39:50 +0000148 static inline unsigned EncodeOneByte(char* out, uint8_t c);
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000149 static inline unsigned Encode(
150 char* out, uchar c, int previous);
erik.corry@gmail.comd88afa22010-09-15 12:33:05 +0000151 static uchar CalculateValue(const byte* str,
152 unsigned length,
153 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000154 static const uchar kBadChar = 0xFFFD;
155 static const unsigned kMaxEncodedSize = 4;
156 static const unsigned kMaxOneByteChar = 0x7f;
157 static const unsigned kMaxTwoByteChar = 0x7ff;
158 static const unsigned kMaxThreeByteChar = 0xffff;
159 static const unsigned kMaxFourByteChar = 0x1fffff;
160
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
162 // that match are coded as a 4 byte UTF-8 sequence.
163 static const unsigned kBytesSavedByCombiningSurrogates = 2;
164 static const unsigned kSizeOfUnmatchedSurrogate = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000165 static inline uchar ValueOf(const byte* str,
166 unsigned length,
167 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000168};
169
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000170
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000171class Utf8DecoderBase {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000172 public:
yangguo@chromium.orga6bbcc82012-12-21 12:35:02 +0000173 // Initialization done in subclass.
174 inline Utf8DecoderBase();
175 inline Utf8DecoderBase(uint16_t* buffer,
176 unsigned buffer_length,
177 const uint8_t* stream,
178 unsigned stream_length);
179 inline unsigned Utf16Length() const { return utf16_length_; }
180 protected:
181 // This reads all characters and sets the utf16_length_.
182 // The first buffer_length utf16 chars are cached in the buffer.
183 void Reset(uint16_t* buffer,
184 unsigned buffer_length,
185 const uint8_t* stream,
186 unsigned stream_length);
187 static void WriteUtf16Slow(const uint8_t* stream,
188 uint16_t* data,
189 unsigned length);
190 const uint8_t* unbuffered_start_;
191 unsigned utf16_length_;
192 bool last_byte_of_buffer_unused_;
193 private:
194 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
195};
196
197template <unsigned kBufferSize>
198class Utf8Decoder : public Utf8DecoderBase {
199 public:
200 inline Utf8Decoder() {}
201 inline Utf8Decoder(const char* stream, unsigned length);
202 inline void Reset(const char* stream, unsigned length);
203 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
204 private:
205 uint16_t buffer_[kBufferSize];
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000206};
207
whesse@chromium.orge90029b2010-08-02 11:52:17 +0000208
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000209struct Uppercase {
210 static bool Is(uchar c);
211};
212struct Lowercase {
213 static bool Is(uchar c);
214};
215struct Letter {
216 static bool Is(uchar c);
217};
218struct Space {
219 static bool Is(uchar c);
220};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000221struct Number {
222 static bool Is(uchar c);
223};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000224struct WhiteSpace {
225 static bool Is(uchar c);
226};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000227struct LineTerminator {
228 static bool Is(uchar c);
229};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000230struct CombiningMark {
231 static bool Is(uchar c);
232};
233struct ConnectorPunctuation {
234 static bool Is(uchar c);
235};
236struct ToLowercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000237 static const int kMaxWidth = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000238 static int Convert(uchar c,
239 uchar n,
240 uchar* result,
241 bool* allow_caching_ptr);
242};
243struct ToUppercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000244 static const int kMaxWidth = 3;
245 static int Convert(uchar c,
246 uchar n,
247 uchar* result,
248 bool* allow_caching_ptr);
249};
250struct Ecma262Canonicalize {
251 static const int kMaxWidth = 1;
252 static int Convert(uchar c,
253 uchar n,
254 uchar* result,
255 bool* allow_caching_ptr);
256};
257struct Ecma262UnCanonicalize {
258 static const int kMaxWidth = 4;
259 static int Convert(uchar c,
260 uchar n,
261 uchar* result,
262 bool* allow_caching_ptr);
263};
264struct CanonicalizationRange {
265 static const int kMaxWidth = 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000266 static int Convert(uchar c,
267 uchar n,
268 uchar* result,
269 bool* allow_caching_ptr);
270};
271
272} // namespace unibrow
273
ager@chromium.org5ec48922009-05-05 07:25:34 +0000274#endif // V8_UNICODE_H_