// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#ifndef V8_UNICODE_DECODER_H_
6#define V8_UNICODE_DECODER_H_
7
8#include <sys/types.h>
9#include "src/globals.h"
10
11namespace unibrow {
12
13class Utf8DecoderBase {
14 public:
15 // Initialization done in subclass.
16 inline Utf8DecoderBase();
17 inline Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length,
18 const uint8_t* stream, unsigned stream_length);
19 inline unsigned Utf16Length() const { return utf16_length_; }
20
21 protected:
22 // This reads all characters and sets the utf16_length_.
23 // The first buffer_length utf16 chars are cached in the buffer.
24 void Reset(uint16_t* buffer, unsigned buffer_length, const uint8_t* stream,
25 unsigned stream_length);
26 static void WriteUtf16Slow(const uint8_t* stream, uint16_t* data,
27 unsigned length);
28 const uint8_t* unbuffered_start_;
29 unsigned utf16_length_;
30 bool last_byte_of_buffer_unused_;
31
32 private:
33 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
34};
35
36template <unsigned kBufferSize>
37class Utf8Decoder : public Utf8DecoderBase {
38 public:
39 inline Utf8Decoder() {}
40 inline Utf8Decoder(const char* stream, unsigned length);
41 inline void Reset(const char* stream, unsigned length);
42 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
43
44 private:
45 uint16_t buffer_[kBufferSize];
46};
47
48
49Utf8DecoderBase::Utf8DecoderBase()
50 : unbuffered_start_(NULL),
51 utf16_length_(0),
52 last_byte_of_buffer_unused_(false) {}
53
54
55Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length,
56 const uint8_t* stream,
57 unsigned stream_length) {
58 Reset(buffer, buffer_length, stream, stream_length);
59}
60
61
62template <unsigned kBufferSize>
63Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
64 : Utf8DecoderBase(buffer_, kBufferSize,
65 reinterpret_cast<const uint8_t*>(stream), length) {}
66
67
68template <unsigned kBufferSize>
69void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
70 Utf8DecoderBase::Reset(buffer_, kBufferSize,
71 reinterpret_cast<const uint8_t*>(stream), length);
72}
73
74
75template <unsigned kBufferSize>
76unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
77 unsigned length) const {
78 DCHECK(length > 0);
79 if (length > utf16_length_) length = utf16_length_;
80 // memcpy everything in buffer.
81 unsigned buffer_length =
82 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
83 unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
84 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
85 if (length <= buffer_length) return length;
86 DCHECK(unbuffered_start_ != NULL);
87 // Copy the rest the slow way.
88 WriteUtf16Slow(unbuffered_start_, data + buffer_length,
89 length - buffer_length);
90 return length;
91}
92
// Helpers for the Latin-1 range (code points 0x00..0xff).
class Latin1 {
 public:
  // Largest code point that is representable in Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Maps a character above kMaxChar to a Latin-1 character.
  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if it does not convert back to Latin-1 via the inverse
  // operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
101
102
103uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
104 DCHECK(c > Latin1::kMaxChar);
105 switch (c) {
106 // This are equivalent characters in unicode.
107 case 0x39c:
108 case 0x3bc:
109 return 0xb5;
110 // This is an uppercase of a Latin-1 character
111 // outside of Latin-1.
112 case 0x178:
113 return 0xff;
114 }
115 return 0;
116}
117
118
119} // namespace unibrow
120
121#endif // V8_UNICODE_DECODER_H_