// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #ifndef V8_UNICODE_DECODER_H_ |
| 6 | #define V8_UNICODE_DECODER_H_ |
| 7 | |
| 8 | #include <sys/types.h> |
| 9 | #include "src/globals.h" |
| 10 | |
| 11 | namespace unibrow { |
| 12 | |
| 13 | class Utf8DecoderBase { |
| 14 | public: |
| 15 | // Initialization done in subclass. |
| 16 | inline Utf8DecoderBase(); |
| 17 | inline Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length, |
| 18 | const uint8_t* stream, unsigned stream_length); |
| 19 | inline unsigned Utf16Length() const { return utf16_length_; } |
| 20 | |
| 21 | protected: |
| 22 | // This reads all characters and sets the utf16_length_. |
| 23 | // The first buffer_length utf16 chars are cached in the buffer. |
| 24 | void Reset(uint16_t* buffer, unsigned buffer_length, const uint8_t* stream, |
| 25 | unsigned stream_length); |
| 26 | static void WriteUtf16Slow(const uint8_t* stream, uint16_t* data, |
| 27 | unsigned length); |
| 28 | const uint8_t* unbuffered_start_; |
| 29 | unsigned utf16_length_; |
| 30 | bool last_byte_of_buffer_unused_; |
| 31 | |
| 32 | private: |
| 33 | DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
| 34 | }; |
| 35 | |
| 36 | template <unsigned kBufferSize> |
| 37 | class Utf8Decoder : public Utf8DecoderBase { |
| 38 | public: |
| 39 | inline Utf8Decoder() {} |
| 40 | inline Utf8Decoder(const char* stream, unsigned length); |
| 41 | inline void Reset(const char* stream, unsigned length); |
| 42 | inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; |
| 43 | |
| 44 | private: |
| 45 | uint16_t buffer_[kBufferSize]; |
| 46 | }; |
| 47 | |
| 48 | |
| 49 | Utf8DecoderBase::Utf8DecoderBase() |
| 50 | : unbuffered_start_(NULL), |
| 51 | utf16_length_(0), |
| 52 | last_byte_of_buffer_unused_(false) {} |
| 53 | |
| 54 | |
| 55 | Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length, |
| 56 | const uint8_t* stream, |
| 57 | unsigned stream_length) { |
| 58 | Reset(buffer, buffer_length, stream, stream_length); |
| 59 | } |
| 60 | |
| 61 | |
| 62 | template <unsigned kBufferSize> |
| 63 | Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) |
| 64 | : Utf8DecoderBase(buffer_, kBufferSize, |
| 65 | reinterpret_cast<const uint8_t*>(stream), length) {} |
| 66 | |
| 67 | |
| 68 | template <unsigned kBufferSize> |
| 69 | void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { |
| 70 | Utf8DecoderBase::Reset(buffer_, kBufferSize, |
| 71 | reinterpret_cast<const uint8_t*>(stream), length); |
| 72 | } |
| 73 | |
| 74 | |
| 75 | template <unsigned kBufferSize> |
| 76 | unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, |
| 77 | unsigned length) const { |
| 78 | DCHECK(length > 0); |
| 79 | if (length > utf16_length_) length = utf16_length_; |
| 80 | // memcpy everything in buffer. |
| 81 | unsigned buffer_length = |
| 82 | last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; |
| 83 | unsigned memcpy_length = length <= buffer_length ? length : buffer_length; |
| 84 | v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); |
| 85 | if (length <= buffer_length) return length; |
| 86 | DCHECK(unbuffered_start_ != NULL); |
| 87 | // Copy the rest the slow way. |
| 88 | WriteUtf16Slow(unbuffered_start_, data + buffer_length, |
| 89 | length - buffer_length); |
| 90 | return length; |
| 91 | } |
| 92 | |
// Constants and helpers for the Latin-1 character range.
class Latin1 {
 public:
  // Highest character value that is still Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Maps a character above kMaxChar to a single Latin-1 character.
  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if it does not convert back to Latin-1 via the inverse
  // operation (upper to lower, etc.).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
| 101 | |
| 102 | |
| 103 | uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { |
| 104 | DCHECK(c > Latin1::kMaxChar); |
| 105 | switch (c) { |
| 106 | // This are equivalent characters in unicode. |
| 107 | case 0x39c: |
| 108 | case 0x3bc: |
| 109 | return 0xb5; |
| 110 | // This is an uppercase of a Latin-1 character |
| 111 | // outside of Latin-1. |
| 112 | case 0x178: |
| 113 | return 0xff; |
| 114 | } |
| 115 | return 0; |
| 116 | } |
| 117 | |
| 118 | |
| 119 | } // namespace unibrow |
| 120 | |
| 121 | #endif // V8_UNICODE_DECODER_H_ |