blob: 94ab1b4c1e7793cc1b0879e6906bb0b44851143c [file] [log] [blame]
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
ager@chromium.org5ec48922009-05-05 07:25:34 +000028#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000030
31#include <sys/types.h>
32
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// A single character value as used throughout this header.
typedef unsigned int uchar;
// A single raw byte of encoded data.
typedef unsigned char byte;

/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000048
49template <class T, int size = 256>
50class Predicate {
51 public:
52 inline Predicate() { }
53 inline bool get(uchar c);
54 private:
55 friend class Test;
56 bool CalculateValue(uchar c);
57 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
61 value_(value) { }
62 uchar code_point_ : 21;
63 bool value_ : 1;
64 };
65 static const int kSize = size;
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion. It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context. Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77 inline Mapping() { }
78 inline int get(uchar c, uchar n, uchar* result);
79 private:
80 friend class Test;
81 int CalculateValue(uchar c, uchar n, uchar* result);
82 struct CacheEntry {
ager@chromium.orga74f0da2008-12-03 16:05:52 +000083 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000084 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
86 offset_(offset) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +000087 uchar code_point_;
88 signed offset_;
89 static const int kNoChar = (1 << 21) - 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000090 };
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98 friend class Test;
99 static int GetByteCount();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000100 static const uchar kMaxCodePoint;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000101};
102
// --- U t f 8  a n d  1 6 ---
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000104
/**
 * A plain (data, length) pair.
 *
 * Data is the handle type for the underlying storage (for example
 * const char*). The buffer stores the handle by value and does nothing
 * else with it: no copy of the pointed-to data, no release on destruction.
 */
template <typename Data>
class Buffer {
 public:
  // Wraps the given data handle together with its length.
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // Creates an empty buffer: data is initialised from 0 and length is 0.
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so that a const Buffer can be inspected.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
116
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000117
118class Utf16 {
119 public:
120 static inline bool IsLeadSurrogate(int code) {
121 if (code == kNoPreviousCharacter) return false;
122 return (code & 0xfc00) == 0xd800;
123 }
124 static inline bool IsTrailSurrogate(int code) {
125 if (code == kNoPreviousCharacter) return false;
126 return (code & 0xfc00) == 0xdc00;
127 }
128
129 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
130 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
131 }
132 static const int kNoPreviousCharacter = -1;
133 static const uchar kMaxNonSurrogateCharCode = 0xffff;
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135 // of UTF-8 data. The special case where the unit is a surrogate
136 // trail produces 1 byte net, because the encoding of the pair is
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138 // can be reclaimed.
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140 // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
141 // The illegality stems from the surrogate not being part of a pair.
142 static const int kUtf8BytesToCodeASurrogate = 3;
143 static inline uchar LeadSurrogate(int char_code) {
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145 }
146 static inline uchar TrailSurrogate(int char_code) {
147 return 0xdc00 + (char_code & 0x3ff);
148 }
149};
150
151
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000152class Utf8 {
153 public:
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000154 static inline uchar Length(uchar chr, int previous);
155 static inline unsigned Encode(
156 char* out, uchar c, int previous);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158 unsigned capacity, unsigned* chars_read, unsigned* offset);
erik.corry@gmail.comd88afa22010-09-15 12:33:05 +0000159 static uchar CalculateValue(const byte* str,
160 unsigned length,
161 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000162 static const uchar kBadChar = 0xFFFD;
163 static const unsigned kMaxEncodedSize = 4;
164 static const unsigned kMaxOneByteChar = 0x7f;
165 static const unsigned kMaxTwoByteChar = 0x7ff;
166 static const unsigned kMaxThreeByteChar = 0xffff;
167 static const unsigned kMaxFourByteChar = 0x1fffff;
168
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170 // that match are coded as a 4 byte UTF-8 sequence.
171 static const unsigned kBytesSavedByCombiningSurrogates = 2;
172 static const unsigned kSizeOfUnmatchedSurrogate = 3;
173
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000174 private:
175 template <unsigned s> friend class Utf8InputBuffer;
176 friend class Test;
177 static inline uchar ValueOf(const byte* str,
178 unsigned length,
179 unsigned* cursor);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000180};
181
182// --- C h a r a c t e r S t r e a m ---
183
184class CharacterStream {
185 public:
186 inline uchar GetNext();
187 inline bool has_more() { return remaining_ != 0; }
188 // Note that default implementation is not efficient.
189 virtual void Seek(unsigned);
190 unsigned Length();
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000191 unsigned Utf16Length();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000192 virtual ~CharacterStream() { }
193 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
194 unsigned& offset);
195 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196 unsigned capacity, unsigned& offset);
197 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198 unsigned capacity, unsigned& offset);
199 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
200 virtual void Rewind() = 0;
yangguo@chromium.org154ff992012-03-13 08:09:54 +0000201
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000202 protected:
203 virtual void FillBuffer() = 0;
204 // The number of characters left in the current buffer
205 unsigned remaining_;
206 // The current offset within the buffer
207 unsigned cursor_;
208 // The buffer containing the decoded characters.
209 const byte* buffer_;
210};
211
212// --- I n p u t B u f f e r ---
213
214/**
215 * Provides efficient access to encoded characters in strings. It
216 * does so by reading characters one block at a time, rather than one
217 * character at a time, which gives string implementations an
218 * opportunity to optimize the decoding.
219 */
220template <class Reader, class Input = Reader*, unsigned kSize = 256>
221class InputBuffer : public CharacterStream {
222 public:
223 virtual void Rewind();
224 inline void Reset(Input input);
225 void Seek(unsigned position);
226 inline void Reset(unsigned position, Input input);
227 protected:
228 InputBuffer() { }
229 explicit InputBuffer(Input input) { Reset(input); }
230 virtual void FillBuffer();
231
232 // A custom offset that can be used by the string implementation to
233 // mark progress within the encoded string.
234 unsigned offset_;
235 // The input string
236 Input input_;
237 // To avoid heap allocation, we keep an internal buffer to which
238 // the encoded string can write its characters. The string
239 // implementation is free to decide whether it wants to use this
240 // buffer or not.
241 byte util_buffer_[kSize];
242};
243
244// --- U t f 8 I n p u t B u f f e r ---
245
246template <unsigned s = 256>
247class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
248 public:
249 inline Utf8InputBuffer() { }
250 inline Utf8InputBuffer(const char* data, unsigned length);
251 inline void Reset(const char* data, unsigned length) {
252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
253 Buffer<const char*>(data, length));
254 }
255};
256
whesse@chromium.orge90029b2010-08-02 11:52:17 +0000257
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000258struct Uppercase {
259 static bool Is(uchar c);
260};
261struct Lowercase {
262 static bool Is(uchar c);
263};
264struct Letter {
265 static bool Is(uchar c);
266};
267struct Space {
268 static bool Is(uchar c);
269};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000270struct Number {
271 static bool Is(uchar c);
272};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000273struct WhiteSpace {
274 static bool Is(uchar c);
275};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000276struct LineTerminator {
277 static bool Is(uchar c);
278};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000279struct CombiningMark {
280 static bool Is(uchar c);
281};
282struct ConnectorPunctuation {
283 static bool Is(uchar c);
284};
285struct ToLowercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000286 static const int kMaxWidth = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000287 static int Convert(uchar c,
288 uchar n,
289 uchar* result,
290 bool* allow_caching_ptr);
291};
292struct ToUppercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000293 static const int kMaxWidth = 3;
294 static int Convert(uchar c,
295 uchar n,
296 uchar* result,
297 bool* allow_caching_ptr);
298};
299struct Ecma262Canonicalize {
300 static const int kMaxWidth = 1;
301 static int Convert(uchar c,
302 uchar n,
303 uchar* result,
304 bool* allow_caching_ptr);
305};
306struct Ecma262UnCanonicalize {
307 static const int kMaxWidth = 4;
308 static int Convert(uchar c,
309 uchar n,
310 uchar* result,
311 bool* allow_caching_ptr);
312};
313struct CanonicalizationRange {
314 static const int kMaxWidth = 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000315 static int Convert(uchar c,
316 uchar n,
317 uchar* result,
318 bool* allow_caching_ptr);
319};
320
321} // namespace unibrow
322
ager@chromium.org5ec48922009-05-05 07:25:34 +0000323#endif // V8_UNICODE_H_