blob: c03084116603c5db734f1d471e25fa92e1106623 [file] [log] [blame]
Emily Bernierd0a1eb72015-03-24 16:35:39 -04001// Copyright 2014 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_UNICODE_DECODER_H_
6#define V8_UNICODE_DECODER_H_
7
8#include <sys/types.h>
9#include "src/globals.h"
10
11namespace unibrow {
12
13class Utf8DecoderBase {
14 public:
15 // Initialization done in subclass.
16 inline Utf8DecoderBase();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000017 inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
18 const uint8_t* stream, size_t stream_length);
19 inline size_t Utf16Length() const { return utf16_length_; }
Emily Bernierd0a1eb72015-03-24 16:35:39 -040020
21 protected:
22 // This reads all characters and sets the utf16_length_.
23 // The first buffer_length utf16 chars are cached in the buffer.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000024 void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
25 size_t stream_length);
26 static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
27 uint16_t* data, size_t length);
Emily Bernierd0a1eb72015-03-24 16:35:39 -040028 const uint8_t* unbuffered_start_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000029 size_t unbuffered_length_;
30 size_t utf16_length_;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040031 bool last_byte_of_buffer_unused_;
32
33 private:
34 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
35};
36
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000037template <size_t kBufferSize>
Emily Bernierd0a1eb72015-03-24 16:35:39 -040038class Utf8Decoder : public Utf8DecoderBase {
39 public:
40 inline Utf8Decoder() {}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000041 inline Utf8Decoder(const char* stream, size_t length);
42 inline void Reset(const char* stream, size_t length);
43 inline size_t WriteUtf16(uint16_t* data, size_t length) const;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040044
45 private:
46 uint16_t buffer_[kBufferSize];
47};
48
49
50Utf8DecoderBase::Utf8DecoderBase()
51 : unbuffered_start_(NULL),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000052 unbuffered_length_(0),
Emily Bernierd0a1eb72015-03-24 16:35:39 -040053 utf16_length_(0),
54 last_byte_of_buffer_unused_(false) {}
55
56
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000057Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
58 const uint8_t* stream, size_t stream_length) {
Emily Bernierd0a1eb72015-03-24 16:35:39 -040059 Reset(buffer, buffer_length, stream, stream_length);
60}
61
62
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000063template <size_t kBufferSize>
64Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
Emily Bernierd0a1eb72015-03-24 16:35:39 -040065 : Utf8DecoderBase(buffer_, kBufferSize,
66 reinterpret_cast<const uint8_t*>(stream), length) {}
67
68
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000069template <size_t kBufferSize>
70void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
Emily Bernierd0a1eb72015-03-24 16:35:39 -040071 Utf8DecoderBase::Reset(buffer_, kBufferSize,
72 reinterpret_cast<const uint8_t*>(stream), length);
73}
74
75
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000076template <size_t kBufferSize>
77size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
78 size_t length) const {
Emily Bernierd0a1eb72015-03-24 16:35:39 -040079 DCHECK(length > 0);
80 if (length > utf16_length_) length = utf16_length_;
81 // memcpy everything in buffer.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000082 size_t buffer_length =
Emily Bernierd0a1eb72015-03-24 16:35:39 -040083 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000084 size_t memcpy_length = length <= buffer_length ? length : buffer_length;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040085 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
86 if (length <= buffer_length) return length;
87 DCHECK(unbuffered_start_ != NULL);
88 // Copy the rest the slow way.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000089 WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
Emily Bernierd0a1eb72015-03-24 16:35:39 -040090 length - buffer_length);
91 return length;
92}
93
// Helpers for the Latin-1 (one byte per character) range.
class Latin1 {
 public:
  // Highest character code that fits in Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Maps a character above kMaxChar to a Latin-1 character. Returns 0 if the
  // character does not convert to a single Latin-1 character, or if it does
  // not convert back to Latin-1 via the inverse operation (upper to lower,
  // etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
};
102
103
104uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
105 DCHECK(c > Latin1::kMaxChar);
106 switch (c) {
107 // This are equivalent characters in unicode.
108 case 0x39c:
109 case 0x3bc:
110 return 0xb5;
111 // This is an uppercase of a Latin-1 character
112 // outside of Latin-1.
113 case 0x178:
114 return 0xff;
115 }
116 return 0;
117}
118
119
120} // namespace unibrow
121
122#endif // V8_UNICODE_DECODER_H_