// Copyright 2007-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef __UNIBROW_H__
29#define __UNIBROW_H__
30
31#include <sys/types.h>
32
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// Integer type used throughout this header for character values
// (function parameters, cached code points, conversion results).
typedef unsigned int uchar;
// Raw storage unit used for the byte buffers declared in this header.
typedef unsigned char byte;
42
/**
 * The max length of the result of converting the case of a single
 * character.
 */
static const int kMaxMappingSize = 4;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000048
49template <class T, int size = 256>
50class Predicate {
51 public:
52 inline Predicate() { }
53 inline bool get(uchar c);
54 private:
55 friend class Test;
56 bool CalculateValue(uchar c);
57 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
61 value_(value) { }
62 uchar code_point_ : 21;
63 bool value_ : 1;
64 };
65 static const int kSize = size;
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion. It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context. Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77 inline Mapping() { }
78 inline int get(uchar c, uchar n, uchar* result);
79 private:
80 friend class Test;
81 int CalculateValue(uchar c, uchar n, uchar* result);
82 struct CacheEntry {
ager@chromium.orga74f0da2008-12-03 16:05:52 +000083 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000084 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
86 offset_(offset) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +000087 uchar code_point_;
88 signed offset_;
89 static const int kNoChar = (1 << 21) - 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000090 };
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98 friend class Test;
99 static int GetByteCount();
100 static uchar kMaxCodePoint;
101};
102
// --- U t f 8 ---

// A (data, length) pair. Data is typically a pointer type such as
// const char*; the default constructor initializes it from 0.
// The buffer only stores the values it is given; no ownership of whatever
// data refers to is expressed here.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so they can be used on const buffers as well.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};
116
117class Utf8 {
118 public:
119 static inline uchar Length(uchar chr);
120 static inline unsigned Encode(char* out, uchar c);
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122 unsigned capacity, unsigned* chars_read, unsigned* offset);
123 static const uchar kBadChar = 0xFFFD;
124 static const unsigned kMaxEncodedSize = 4;
125 static const unsigned kMaxOneByteChar = 0x7f;
126 static const unsigned kMaxTwoByteChar = 0x7ff;
127 static const unsigned kMaxThreeByteChar = 0xffff;
128 static const unsigned kMaxFourByteChar = 0x1fffff;
129
130 private:
131 template <unsigned s> friend class Utf8InputBuffer;
132 friend class Test;
133 static inline uchar ValueOf(const byte* str,
134 unsigned length,
135 unsigned* cursor);
136 static uchar CalculateValue(const byte* str,
137 unsigned length,
138 unsigned* cursor);
139};
140
141// --- C h a r a c t e r S t r e a m ---
142
143class CharacterStream {
144 public:
145 inline uchar GetNext();
146 inline bool has_more() { return remaining_ != 0; }
147 // Note that default implementation is not efficient.
148 virtual void Seek(unsigned);
149 unsigned Length();
150 virtual ~CharacterStream() { }
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152 unsigned& offset);
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154 unsigned capacity, unsigned& offset);
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156 unsigned capacity, unsigned& offset);
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158 virtual void Rewind() = 0;
159 protected:
160 virtual void FillBuffer() = 0;
161 // The number of characters left in the current buffer
162 unsigned remaining_;
163 // The current offset within the buffer
164 unsigned cursor_;
165 // The buffer containing the decoded characters.
166 const byte* buffer_;
167};
168
169// --- I n p u t B u f f e r ---
170
171/**
172 * Provides efficient access to encoded characters in strings. It
173 * does so by reading characters one block at a time, rather than one
174 * character at a time, which gives string implementations an
175 * opportunity to optimize the decoding.
176 */
177template <class Reader, class Input = Reader*, unsigned kSize = 256>
178class InputBuffer : public CharacterStream {
179 public:
180 virtual void Rewind();
181 inline void Reset(Input input);
182 void Seek(unsigned position);
183 inline void Reset(unsigned position, Input input);
184 protected:
185 InputBuffer() { }
186 explicit InputBuffer(Input input) { Reset(input); }
187 virtual void FillBuffer();
188
189 // A custom offset that can be used by the string implementation to
190 // mark progress within the encoded string.
191 unsigned offset_;
192 // The input string
193 Input input_;
194 // To avoid heap allocation, we keep an internal buffer to which
195 // the encoded string can write its characters. The string
196 // implementation is free to decide whether it wants to use this
197 // buffer or not.
198 byte util_buffer_[kSize];
199};
200
201// --- U t f 8 I n p u t B u f f e r ---
202
203template <unsigned s = 256>
204class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
205 public:
206 inline Utf8InputBuffer() { }
207 inline Utf8InputBuffer(const char* data, unsigned length);
208 inline void Reset(const char* data, unsigned length) {
209 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
210 Buffer<const char*>(data, length));
211 }
212};
213
214struct Uppercase {
215 static bool Is(uchar c);
216};
217struct Lowercase {
218 static bool Is(uchar c);
219};
220struct Letter {
221 static bool Is(uchar c);
222};
223struct Space {
224 static bool Is(uchar c);
225};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000226struct Number {
227 static bool Is(uchar c);
228};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000229struct WhiteSpace {
230 static bool Is(uchar c);
231};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000232struct LineTerminator {
233 static bool Is(uchar c);
234};
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000235struct CombiningMark {
236 static bool Is(uchar c);
237};
238struct ConnectorPunctuation {
239 static bool Is(uchar c);
240};
241struct ToLowercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000242 static const int kMaxWidth = 3;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000243 static int Convert(uchar c,
244 uchar n,
245 uchar* result,
246 bool* allow_caching_ptr);
247};
248struct ToUppercase {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000249 static const int kMaxWidth = 3;
250 static int Convert(uchar c,
251 uchar n,
252 uchar* result,
253 bool* allow_caching_ptr);
254};
255struct Ecma262Canonicalize {
256 static const int kMaxWidth = 1;
257 static int Convert(uchar c,
258 uchar n,
259 uchar* result,
260 bool* allow_caching_ptr);
261};
262struct Ecma262UnCanonicalize {
263 static const int kMaxWidth = 4;
264 static int Convert(uchar c,
265 uchar n,
266 uchar* result,
267 bool* allow_caching_ptr);
268};
269struct CanonicalizationRange {
270 static const int kMaxWidth = 1;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000271 static int Convert(uchar c,
272 uchar n,
273 uchar* result,
274 bool* allow_caching_ptr);
275};
276
277} // namespace unibrow
278
279#endif // __UNIBROW_H__