// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
30
31#include <sys/types.h>
32
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// Unsigned integer type used throughout this header to carry a single
// character value (see the code_point parameters and fields below).
typedef unsigned int uchar;
// One raw octet, as used for the encoded byte buffers below.
typedef unsigned char byte;
42
/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;
Steve Blocka7e24c12009-10-30 11:49:00 +000048
49template <class T, int size = 256>
50class Predicate {
51 public:
52 inline Predicate() { }
53 inline bool get(uchar c);
54 private:
55 friend class Test;
56 bool CalculateValue(uchar c);
57 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
61 value_(value) { }
62 uchar code_point_ : 21;
63 bool value_ : 1;
64 };
65 static const int kSize = size;
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion. It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context. Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77 inline Mapping() { }
78 inline int get(uchar c, uchar n, uchar* result);
79 private:
80 friend class Test;
81 int CalculateValue(uchar c, uchar n, uchar* result);
82 struct CacheEntry {
83 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
86 offset_(offset) { }
87 uchar code_point_;
88 signed offset_;
89 static const int kNoChar = (1 << 21) - 1;
90 };
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98 friend class Test;
99 static int GetByteCount();
Steve Block44f0eee2011-05-26 01:26:41 +0100100 static const uchar kMaxCodePoint;
Steve Blocka7e24c12009-10-30 11:49:00 +0000101};
102
// --- U t f 8 ---

// A small value type pairing a data handle with a length. The class only
// stores and returns the two values; it never dereferences or releases
// |data|.
template <typename Data>
class Buffer {
 public:
  // An empty buffer: zero/null data handle and zero length.
  inline Buffer() : data_(0), length_(0) { }
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }

  // Accessors are const so that a const Buffer (or a const reference to
  // one) can be inspected.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
116
117class Utf8 {
118 public:
119 static inline uchar Length(uchar chr);
120 static inline unsigned Encode(char* out, uchar c);
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122 unsigned capacity, unsigned* chars_read, unsigned* offset);
Kristian Monsen0d5e1162010-09-30 15:31:59 +0100123 static uchar CalculateValue(const byte* str,
124 unsigned length,
125 unsigned* cursor);
Steve Blocka7e24c12009-10-30 11:49:00 +0000126 static const uchar kBadChar = 0xFFFD;
127 static const unsigned kMaxEncodedSize = 4;
128 static const unsigned kMaxOneByteChar = 0x7f;
129 static const unsigned kMaxTwoByteChar = 0x7ff;
130 static const unsigned kMaxThreeByteChar = 0xffff;
131 static const unsigned kMaxFourByteChar = 0x1fffff;
132
133 private:
134 template <unsigned s> friend class Utf8InputBuffer;
135 friend class Test;
136 static inline uchar ValueOf(const byte* str,
137 unsigned length,
138 unsigned* cursor);
Steve Blocka7e24c12009-10-30 11:49:00 +0000139};
140
141// --- C h a r a c t e r S t r e a m ---
142
143class CharacterStream {
144 public:
145 inline uchar GetNext();
146 inline bool has_more() { return remaining_ != 0; }
147 // Note that default implementation is not efficient.
148 virtual void Seek(unsigned);
149 unsigned Length();
150 virtual ~CharacterStream() { }
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152 unsigned& offset);
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154 unsigned capacity, unsigned& offset);
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156 unsigned capacity, unsigned& offset);
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158 virtual void Rewind() = 0;
159 protected:
160 virtual void FillBuffer() = 0;
161 // The number of characters left in the current buffer
162 unsigned remaining_;
163 // The current offset within the buffer
164 unsigned cursor_;
165 // The buffer containing the decoded characters.
166 const byte* buffer_;
167};
168
169// --- I n p u t B u f f e r ---
170
171/**
172 * Provides efficient access to encoded characters in strings. It
173 * does so by reading characters one block at a time, rather than one
174 * character at a time, which gives string implementations an
175 * opportunity to optimize the decoding.
176 */
177template <class Reader, class Input = Reader*, unsigned kSize = 256>
178class InputBuffer : public CharacterStream {
179 public:
180 virtual void Rewind();
181 inline void Reset(Input input);
182 void Seek(unsigned position);
183 inline void Reset(unsigned position, Input input);
184 protected:
185 InputBuffer() { }
186 explicit InputBuffer(Input input) { Reset(input); }
187 virtual void FillBuffer();
188
189 // A custom offset that can be used by the string implementation to
190 // mark progress within the encoded string.
191 unsigned offset_;
192 // The input string
193 Input input_;
194 // To avoid heap allocation, we keep an internal buffer to which
195 // the encoded string can write its characters. The string
196 // implementation is free to decide whether it wants to use this
197 // buffer or not.
198 byte util_buffer_[kSize];
199};
200
201// --- U t f 8 I n p u t B u f f e r ---
202
203template <unsigned s = 256>
204class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
205 public:
206 inline Utf8InputBuffer() { }
207 inline Utf8InputBuffer(const char* data, unsigned length);
208 inline void Reset(const char* data, unsigned length) {
209 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
210 Buffer<const char*>(data, length));
211 }
212};
213
Ben Murdochbb769b22010-08-11 14:56:33 +0100214
Steve Blocka7e24c12009-10-30 11:49:00 +0000215struct Uppercase {
216 static bool Is(uchar c);
217};
218struct Lowercase {
219 static bool Is(uchar c);
220};
221struct Letter {
222 static bool Is(uchar c);
223};
224struct Space {
225 static bool Is(uchar c);
226};
227struct Number {
228 static bool Is(uchar c);
229};
230struct WhiteSpace {
231 static bool Is(uchar c);
232};
233struct LineTerminator {
234 static bool Is(uchar c);
235};
236struct CombiningMark {
237 static bool Is(uchar c);
238};
239struct ConnectorPunctuation {
240 static bool Is(uchar c);
241};
242struct ToLowercase {
243 static const int kMaxWidth = 3;
244 static int Convert(uchar c,
245 uchar n,
246 uchar* result,
247 bool* allow_caching_ptr);
248};
249struct ToUppercase {
250 static const int kMaxWidth = 3;
251 static int Convert(uchar c,
252 uchar n,
253 uchar* result,
254 bool* allow_caching_ptr);
255};
256struct Ecma262Canonicalize {
257 static const int kMaxWidth = 1;
258 static int Convert(uchar c,
259 uchar n,
260 uchar* result,
261 bool* allow_caching_ptr);
262};
263struct Ecma262UnCanonicalize {
264 static const int kMaxWidth = 4;
265 static int Convert(uchar c,
266 uchar n,
267 uchar* result,
268 bool* allow_caching_ptr);
269};
270struct CanonicalizationRange {
271 static const int kMaxWidth = 1;
272 static int Convert(uchar c,
273 uchar n,
274 uchar* result,
275 bool* allow_caching_ptr);
276};
277
278} // namespace unibrow
279
280#endif // V8_UNICODE_H_