blob: 2289e083425aeb1ec61fff7ca67328afa2fde1ca [file] [log] [blame]
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5
6#include "src/unicode-inl.h"
7#include "src/unicode-decoder.h"
8#include <stdio.h>
9#include <stdlib.h>
10
11namespace unibrow {
12
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000013void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
14 const uint8_t* stream, size_t stream_length) {
Emily Bernierd0a1eb72015-03-24 16:35:39 -040015 // Assume everything will fit in the buffer and stream won't be needed.
16 last_byte_of_buffer_unused_ = false;
17 unbuffered_start_ = NULL;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000018 unbuffered_length_ = 0;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040019 bool writing_to_buffer = true;
20 // Loop until stream is read, writing to buffer as long as buffer has space.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000021 size_t utf16_length = 0;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040022 while (stream_length != 0) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000023 size_t cursor = 0;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040024 uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
25 DCHECK(cursor > 0 && cursor <= stream_length);
26 stream += cursor;
27 stream_length -= cursor;
28 bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
29 utf16_length += is_two_characters ? 2 : 1;
30 // Don't need to write to the buffer, but still need utf16_length.
31 if (!writing_to_buffer) continue;
32 // Write out the characters to the buffer.
33 // Must check for equality with buffer_length as we've already updated it.
34 if (utf16_length <= buffer_length) {
35 if (is_two_characters) {
36 *buffer++ = Utf16::LeadSurrogate(character);
37 *buffer++ = Utf16::TrailSurrogate(character);
38 } else {
39 *buffer++ = character;
40 }
41 if (utf16_length == buffer_length) {
42 // Just wrote last character of buffer
43 writing_to_buffer = false;
44 unbuffered_start_ = stream;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000045 unbuffered_length_ = stream_length;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040046 }
47 continue;
48 }
49 // Have gone over buffer.
50 // Last char of buffer is unused, set cursor back.
51 DCHECK(is_two_characters);
52 writing_to_buffer = false;
53 last_byte_of_buffer_unused_ = true;
54 unbuffered_start_ = stream - cursor;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000055 unbuffered_length_ = stream_length + cursor;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040056 }
57 utf16_length_ = utf16_length;
58}
59
60
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000061void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
62 size_t stream_length, uint16_t* data,
63 size_t data_length) {
Emily Bernierd0a1eb72015-03-24 16:35:39 -040064 while (data_length != 0) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000065 size_t cursor = 0;
66 uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
Emily Bernierd0a1eb72015-03-24 16:35:39 -040067 // There's a total lack of bounds checking for stream
68 // as it was already done in Reset.
69 stream += cursor;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000070 DCHECK(stream_length >= cursor);
71 stream_length -= cursor;
Emily Bernierd0a1eb72015-03-24 16:35:39 -040072 if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
73 *data++ = Utf16::LeadSurrogate(character);
74 *data++ = Utf16::TrailSurrogate(character);
75 DCHECK(data_length > 1);
76 data_length -= 2;
77 } else {
78 *data++ = character;
79 data_length -= 1;
80 }
81 }
82}
83
84} // namespace unibrow