// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #ifndef V8_UNICODE_DECODER_H_ |
| 6 | #define V8_UNICODE_DECODER_H_ |
| 7 | |
| 8 | #include <sys/types.h> |
| 9 | #include "src/globals.h" |
| 10 | |
| 11 | namespace unibrow { |
| 12 | |
| 13 | class Utf8DecoderBase { |
| 14 | public: |
| 15 | // Initialization done in subclass. |
| 16 | inline Utf8DecoderBase(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 17 | inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, |
| 18 | const uint8_t* stream, size_t stream_length); |
| 19 | inline size_t Utf16Length() const { return utf16_length_; } |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 20 | |
| 21 | protected: |
| 22 | // This reads all characters and sets the utf16_length_. |
| 23 | // The first buffer_length utf16 chars are cached in the buffer. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 24 | void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream, |
| 25 | size_t stream_length); |
| 26 | static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length, |
| 27 | uint16_t* data, size_t length); |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 28 | const uint8_t* unbuffered_start_; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 29 | size_t unbuffered_length_; |
| 30 | size_t utf16_length_; |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 31 | bool last_byte_of_buffer_unused_; |
| 32 | |
| 33 | private: |
| 34 | DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
| 35 | }; |
| 36 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 37 | template <size_t kBufferSize> |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 38 | class Utf8Decoder : public Utf8DecoderBase { |
| 39 | public: |
| 40 | inline Utf8Decoder() {} |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 41 | inline Utf8Decoder(const char* stream, size_t length); |
| 42 | inline void Reset(const char* stream, size_t length); |
| 43 | inline size_t WriteUtf16(uint16_t* data, size_t length) const; |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 44 | |
| 45 | private: |
| 46 | uint16_t buffer_[kBufferSize]; |
| 47 | }; |
| 48 | |
| 49 | |
| 50 | Utf8DecoderBase::Utf8DecoderBase() |
| 51 | : unbuffered_start_(NULL), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 52 | unbuffered_length_(0), |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 53 | utf16_length_(0), |
| 54 | last_byte_of_buffer_unused_(false) {} |
| 55 | |
| 56 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 57 | Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, |
| 58 | const uint8_t* stream, size_t stream_length) { |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 59 | Reset(buffer, buffer_length, stream, stream_length); |
| 60 | } |
| 61 | |
| 62 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 63 | template <size_t kBufferSize> |
| 64 | Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length) |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 65 | : Utf8DecoderBase(buffer_, kBufferSize, |
| 66 | reinterpret_cast<const uint8_t*>(stream), length) {} |
| 67 | |
| 68 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 69 | template <size_t kBufferSize> |
| 70 | void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) { |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 71 | Utf8DecoderBase::Reset(buffer_, kBufferSize, |
| 72 | reinterpret_cast<const uint8_t*>(stream), length); |
| 73 | } |
| 74 | |
| 75 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 76 | template <size_t kBufferSize> |
| 77 | size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, |
| 78 | size_t length) const { |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 79 | DCHECK(length > 0); |
| 80 | if (length > utf16_length_) length = utf16_length_; |
| 81 | // memcpy everything in buffer. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 82 | size_t buffer_length = |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 83 | last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 84 | size_t memcpy_length = length <= buffer_length ? length : buffer_length; |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 85 | v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); |
| 86 | if (length <= buffer_length) return length; |
| 87 | DCHECK(unbuffered_start_ != NULL); |
| 88 | // Copy the rest the slow way. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 89 | WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length, |
Emily Bernier | d0a1eb7 | 2015-03-24 16:35:39 -0400 | [diff] [blame] | 90 | length - buffer_length); |
| 91 | return length; |
| 92 | } |
| 93 | |
// Constants and helpers for the Latin-1 range.
class Latin1 {
 public:
  // Largest character value that is still Latin-1.
  static const unsigned kMaxChar = 0xff;
  // Maps a character above kMaxChar to a single Latin-1 character. Returns 0
  // if the character does not convert to a single Latin-1 character, or if it
  // does not convert back to Latin-1 via the inverse operation (upper to
  // lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
};
| 102 | |
| 103 | |
| 104 | uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { |
| 105 | DCHECK(c > Latin1::kMaxChar); |
| 106 | switch (c) { |
| 107 | // This are equivalent characters in unicode. |
| 108 | case 0x39c: |
| 109 | case 0x3bc: |
| 110 | return 0xb5; |
| 111 | // This is an uppercase of a Latin-1 character |
| 112 | // outside of Latin-1. |
| 113 | case 0x178: |
| 114 | return 0xff; |
| 115 | } |
| 116 | return 0; |
| 117 | } |
| 118 | |
| 119 | |
| 120 | } // namespace unibrow |
| 121 | |
| 122 | #endif // V8_UNICODE_DECODER_H_ |