Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 1 | // Copyright 2011 the V8 project authors. All rights reserved. |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 4 | |
| 5 | #ifndef V8_UNICODE_H_ |
| 6 | #define V8_UNICODE_H_ |
| 7 | |
| 8 | #include <sys/types.h> |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 9 | #include "src/globals.h" |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 10 | #include "src/utils.h" |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 11 | /** |
| 12 | * \file |
| 13 | * Definitions and convenience functions for working with unicode. |
| 14 | */ |
| 15 | |
| 16 | namespace unibrow { |
| 17 | |
// Integer type used in this header for code points and code units.
typedef unsigned int uchar;
// Raw 8-bit unit; the UTF-8 decoding entry points take `const byte*` input.
typedef unsigned char byte;

/**
 * Upper bound on the number of characters that converting the case of a
 * single character can produce.
 */
const int kMaxMappingSize = 4;
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 26 | |
| 27 | template <class T, int size = 256> |
| 28 | class Predicate { |
| 29 | public: |
| 30 | inline Predicate() { } |
| 31 | inline bool get(uchar c); |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 32 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 33 | private: |
| 34 | friend class Test; |
| 35 | bool CalculateValue(uchar c); |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 36 | class CacheEntry { |
| 37 | public: |
| 38 | inline CacheEntry() |
| 39 | : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {} |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 40 | inline CacheEntry(uchar code_point, bool value) |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 41 | : bit_field_(CodePointField::encode(code_point) | |
| 42 | ValueField::encode(value)) {} |
| 43 | |
| 44 | uchar code_point() const { return CodePointField::decode(bit_field_); } |
| 45 | bool value() const { return ValueField::decode(bit_field_); } |
| 46 | |
| 47 | private: |
| 48 | class CodePointField : public v8::internal::BitField<uchar, 0, 21> {}; |
| 49 | class ValueField : public v8::internal::BitField<bool, 21, 1> {}; |
| 50 | |
| 51 | uint32_t bit_field_; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 52 | }; |
| 53 | static const int kSize = size; |
| 54 | static const int kMask = kSize - 1; |
| 55 | CacheEntry entries_[kSize]; |
| 56 | }; |
| 57 | |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 58 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 59 | // A cache used in case conversion. It caches the value for characters |
| 60 | // that either have no mapping or map to a single character independent |
| 61 | // of context. Characters that map to more than one character or that |
| 62 | // map differently depending on context are always looked up. |
| 63 | template <class T, int size = 256> |
| 64 | class Mapping { |
| 65 | public: |
| 66 | inline Mapping() { } |
| 67 | inline int get(uchar c, uchar n, uchar* result); |
| 68 | private: |
| 69 | friend class Test; |
| 70 | int CalculateValue(uchar c, uchar n, uchar* result); |
| 71 | struct CacheEntry { |
| 72 | inline CacheEntry() : code_point_(kNoChar), offset_(0) { } |
| 73 | inline CacheEntry(uchar code_point, signed offset) |
| 74 | : code_point_(code_point), |
| 75 | offset_(offset) { } |
| 76 | uchar code_point_; |
| 77 | signed offset_; |
| 78 | static const int kNoChar = (1 << 21) - 1; |
| 79 | }; |
| 80 | static const int kSize = size; |
| 81 | static const int kMask = kSize - 1; |
| 82 | CacheEntry entries_[kSize]; |
| 83 | }; |
| 84 | |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 85 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 86 | class UnicodeData { |
| 87 | private: |
| 88 | friend class Test; |
| 89 | static int GetByteCount(); |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 90 | static const uchar kMaxCodePoint; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 91 | }; |
| 92 | |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 93 | |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 94 | class Utf16 { |
| 95 | public: |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 96 | static inline bool IsSurrogatePair(int lead, int trail) { |
| 97 | return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); |
| 98 | } |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 99 | static inline bool IsLeadSurrogate(int code) { |
| 100 | if (code == kNoPreviousCharacter) return false; |
| 101 | return (code & 0xfc00) == 0xd800; |
| 102 | } |
| 103 | static inline bool IsTrailSurrogate(int code) { |
| 104 | if (code == kNoPreviousCharacter) return false; |
| 105 | return (code & 0xfc00) == 0xdc00; |
| 106 | } |
| 107 | |
| 108 | static inline int CombineSurrogatePair(uchar lead, uchar trail) { |
| 109 | return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); |
| 110 | } |
| 111 | static const int kNoPreviousCharacter = -1; |
| 112 | static const uchar kMaxNonSurrogateCharCode = 0xffff; |
| 113 | // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes |
| 114 | // of UTF-8 data. The special case where the unit is a surrogate |
| 115 | // trail produces 1 byte net, because the encoding of the pair is |
| 116 | // 4 bytes and the 3 bytes that were used to encode the lead surrogate |
| 117 | // can be reclaimed. |
| 118 | static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; |
| 119 | // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes. |
| 120 | // The illegality stems from the surrogate not being part of a pair. |
| 121 | static const int kUtf8BytesToCodeASurrogate = 3; |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 122 | static inline uint16_t LeadSurrogate(uint32_t char_code) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 123 | return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); |
| 124 | } |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 125 | static inline uint16_t TrailSurrogate(uint32_t char_code) { |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 126 | return 0xdc00 + (char_code & 0x3ff); |
| 127 | } |
| 128 | }; |
| 129 | |
| 130 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 131 | class Utf8 { |
| 132 | public: |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 133 | static inline uchar Length(uchar chr, int previous); |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 134 | static inline unsigned EncodeOneByte(char* out, uint8_t c); |
| 135 | static inline unsigned Encode(char* out, |
| 136 | uchar c, |
| 137 | int previous, |
| 138 | bool replace_invalid = false); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 139 | static uchar CalculateValue(const byte* str, size_t length, size_t* cursor); |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 140 | |
| 141 | // The unicode replacement character, used to signal invalid unicode |
| 142 | // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 143 | static const uchar kBadChar = 0xFFFD; |
| 144 | static const unsigned kMaxEncodedSize = 4; |
| 145 | static const unsigned kMaxOneByteChar = 0x7f; |
| 146 | static const unsigned kMaxTwoByteChar = 0x7ff; |
| 147 | static const unsigned kMaxThreeByteChar = 0xffff; |
| 148 | static const unsigned kMaxFourByteChar = 0x1fffff; |
| 149 | |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 150 | // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together |
| 151 | // that match are coded as a 4 byte UTF-8 sequence. |
| 152 | static const unsigned kBytesSavedByCombiningSurrogates = 2; |
| 153 | static const unsigned kSizeOfUnmatchedSurrogate = 3; |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 154 | // The maximum size a single UTF-16 code unit may take up when encoded as |
| 155 | // UTF-8. |
| 156 | static const unsigned kMax16BitCodeUnitSize = 3; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 157 | static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor); |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 158 | |
| 159 | // Excludes non-characters from the set of valid code points. |
| 160 | static inline bool IsValidCharacter(uchar c); |
| 161 | |
| 162 | static bool Validate(const byte* str, size_t length); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 163 | }; |
| 164 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 165 | struct Uppercase { |
| 166 | static bool Is(uchar c); |
| 167 | }; |
| 168 | struct Lowercase { |
| 169 | static bool Is(uchar c); |
| 170 | }; |
| 171 | struct Letter { |
| 172 | static bool Is(uchar c); |
| 173 | }; |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 174 | struct ID_Start { |
| 175 | static bool Is(uchar c); |
| 176 | }; |
| 177 | struct ID_Continue { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 178 | static bool Is(uchar c); |
| 179 | }; |
| 180 | struct WhiteSpace { |
| 181 | static bool Is(uchar c); |
| 182 | }; |
| 183 | struct LineTerminator { |
| 184 | static bool Is(uchar c); |
| 185 | }; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 186 | struct ToLowercase { |
| 187 | static const int kMaxWidth = 3; |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 188 | static const bool kIsToLower = true; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 189 | static int Convert(uchar c, |
| 190 | uchar n, |
| 191 | uchar* result, |
| 192 | bool* allow_caching_ptr); |
| 193 | }; |
| 194 | struct ToUppercase { |
| 195 | static const int kMaxWidth = 3; |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 196 | static const bool kIsToLower = false; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 197 | static int Convert(uchar c, |
| 198 | uchar n, |
| 199 | uchar* result, |
| 200 | bool* allow_caching_ptr); |
| 201 | }; |
| 202 | struct Ecma262Canonicalize { |
| 203 | static const int kMaxWidth = 1; |
| 204 | static int Convert(uchar c, |
| 205 | uchar n, |
| 206 | uchar* result, |
| 207 | bool* allow_caching_ptr); |
| 208 | }; |
| 209 | struct Ecma262UnCanonicalize { |
| 210 | static const int kMaxWidth = 4; |
| 211 | static int Convert(uchar c, |
| 212 | uchar n, |
| 213 | uchar* result, |
| 214 | bool* allow_caching_ptr); |
| 215 | }; |
| 216 | struct CanonicalizationRange { |
| 217 | static const int kMaxWidth = 1; |
| 218 | static int Convert(uchar c, |
| 219 | uchar n, |
| 220 | uchar* result, |
| 221 | bool* allow_caching_ptr); |
| 222 | }; |
| 223 | |
| 224 | } // namespace unibrow |
| 225 | |
| 226 | #endif // V8_UNICODE_H_ |