// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #include "src/parsing/scanner-character-streams.h" |
| 6 | |
| 7 | #include "include/v8.h" |
| 8 | #include "src/globals.h" |
| 9 | #include "src/handles.h" |
| 10 | #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! |
| 11 | #include "src/objects.h" |
| 12 | #include "src/unicode-inl.h" |
| 13 | |
| 14 | namespace v8 { |
| 15 | namespace internal { |
| 16 | |
| 17 | namespace { |
| 18 | |
| 19 | size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
| 20 | size_t* src_pos, size_t src_length, |
| 21 | ScriptCompiler::StreamedSource::Encoding encoding) { |
| 22 | // It's possible that this will be called with length 0, but don't assume that |
| 23 | // the functions this calls handle it gracefully. |
| 24 | if (length == 0) return 0; |
| 25 | |
| 26 | if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
| 27 | return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
| 28 | dest, length, src, src_pos, src_length); |
| 29 | } |
| 30 | |
| 31 | size_t to_fill = length; |
| 32 | if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
| 33 | |
| 34 | if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { |
| 35 | v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
| 36 | } else { |
| 37 | DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); |
| 38 | v8::internal::CopyChars<uint16_t, uint16_t>( |
| 39 | dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); |
| 40 | } |
| 41 | *src_pos += to_fill; |
| 42 | return to_fill; |
| 43 | } |
| 44 | |
| 45 | } // namespace |
| 46 | |
| 47 | |
| 48 | // ---------------------------------------------------------------------------- |
| 49 | // BufferedUtf16CharacterStreams |
| 50 | |
| 51 | BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() |
| 52 | : Utf16CharacterStream(), |
| 53 | pushback_limit_(NULL) { |
| 54 | // Initialize buffer as being empty. First read will fill the buffer. |
| 55 | buffer_cursor_ = buffer_; |
| 56 | buffer_end_ = buffer_; |
| 57 | } |
| 58 | |
| 59 | |
| 60 | BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } |
| 61 | |
| 62 | void BufferedUtf16CharacterStream::PushBack(uc32 character) { |
| 63 | if (character == kEndOfInput) { |
| 64 | pos_--; |
| 65 | return; |
| 66 | } |
| 67 | if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { |
| 68 | // buffer_ is writable, buffer_cursor_ is const pointer. |
| 69 | buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); |
| 70 | pos_--; |
| 71 | return; |
| 72 | } |
| 73 | SlowPushBack(static_cast<uc16>(character)); |
| 74 | } |
| 75 | |
| 76 | |
| 77 | void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { |
| 78 | // In pushback mode, the end of the buffer contains pushback, |
| 79 | // and the start of the buffer (from buffer start to pushback_limit_) |
| 80 | // contains valid data that comes just after the pushback. |
| 81 | // We NULL the pushback_limit_ if pushing all the way back to the |
| 82 | // start of the buffer. |
| 83 | |
| 84 | if (pushback_limit_ == NULL) { |
| 85 | // Enter pushback mode. |
| 86 | pushback_limit_ = buffer_end_; |
| 87 | buffer_end_ = buffer_ + kBufferSize; |
| 88 | buffer_cursor_ = buffer_end_; |
| 89 | } |
| 90 | // Ensure that there is room for at least one pushback. |
| 91 | DCHECK(buffer_cursor_ > buffer_); |
| 92 | DCHECK(pos_ > 0); |
| 93 | buffer_[--buffer_cursor_ - buffer_] = character; |
| 94 | if (buffer_cursor_ == buffer_) { |
| 95 | pushback_limit_ = NULL; |
| 96 | } else if (buffer_cursor_ < pushback_limit_) { |
| 97 | pushback_limit_ = buffer_cursor_; |
| 98 | } |
| 99 | pos_--; |
| 100 | } |
| 101 | |
| 102 | |
| 103 | bool BufferedUtf16CharacterStream::ReadBlock() { |
| 104 | buffer_cursor_ = buffer_; |
| 105 | if (pushback_limit_ != NULL) { |
| 106 | // Leave pushback mode. |
| 107 | buffer_end_ = pushback_limit_; |
| 108 | pushback_limit_ = NULL; |
| 109 | // If there were any valid characters left at the |
| 110 | // start of the buffer, use those. |
| 111 | if (buffer_cursor_ < buffer_end_) return true; |
| 112 | // Otherwise read a new block. |
| 113 | } |
| 114 | size_t length = FillBuffer(pos_); |
| 115 | buffer_end_ = buffer_ + length; |
| 116 | return length > 0; |
| 117 | } |
| 118 | |
| 119 | |
| 120 | size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) { |
| 121 | // Leave pushback mode (i.e., ignore that there might be valid data |
| 122 | // in the buffer before the pushback_limit_ point). |
| 123 | pushback_limit_ = NULL; |
| 124 | return BufferSeekForward(delta); |
| 125 | } |
| 126 | |
| 127 | |
| 128 | // ---------------------------------------------------------------------------- |
| 129 | // GenericStringUtf16CharacterStream |
| 130 | |
| 131 | |
| 132 | GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( |
| 133 | Handle<String> data, size_t start_position, size_t end_position) |
| 134 | : string_(data), length_(end_position), bookmark_(kNoBookmark) { |
| 135 | DCHECK(end_position >= start_position); |
| 136 | pos_ = start_position; |
| 137 | } |
| 138 | |
| 139 | |
| 140 | GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } |
| 141 | |
| 142 | |
| 143 | bool GenericStringUtf16CharacterStream::SetBookmark() { |
| 144 | bookmark_ = pos_; |
| 145 | return true; |
| 146 | } |
| 147 | |
| 148 | |
| 149 | void GenericStringUtf16CharacterStream::ResetToBookmark() { |
| 150 | DCHECK(bookmark_ != kNoBookmark); |
| 151 | pos_ = bookmark_; |
| 152 | buffer_cursor_ = buffer_; |
| 153 | buffer_end_ = buffer_ + FillBuffer(pos_); |
| 154 | } |
| 155 | |
| 156 | |
| 157 | size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) { |
| 158 | size_t old_pos = pos_; |
| 159 | pos_ = Min(pos_ + delta, length_); |
| 160 | ReadBlock(); |
| 161 | return pos_ - old_pos; |
| 162 | } |
| 163 | |
| 164 | |
| 165 | size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { |
| 166 | if (from_pos >= length_) return 0; |
| 167 | size_t length = kBufferSize; |
| 168 | if (from_pos + length > length_) { |
| 169 | length = length_ - from_pos; |
| 170 | } |
| 171 | String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), |
| 172 | static_cast<int>(from_pos + length)); |
| 173 | return length; |
| 174 | } |
| 175 | |
| 176 | |
| 177 | // ---------------------------------------------------------------------------- |
| 178 | // Utf8ToUtf16CharacterStream |
| 179 | Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, |
| 180 | size_t length) |
| 181 | : BufferedUtf16CharacterStream(), |
| 182 | raw_data_(data), |
| 183 | raw_data_length_(length), |
| 184 | raw_data_pos_(0), |
| 185 | raw_character_position_(0) { |
| 186 | ReadBlock(); |
| 187 | } |
| 188 | |
| 189 | |
| 190 | Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
| 191 | |
| 192 | |
| 193 | size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, |
| 194 | const byte* src, size_t* src_pos, |
| 195 | size_t src_length) { |
| 196 | static const unibrow::uchar kMaxUtf16Character = |
| 197 | unibrow::Utf16::kMaxNonSurrogateCharCode; |
| 198 | size_t i = 0; |
| 199 | // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
| 200 | // one character early (in the normal case), because we need to have at least |
| 201 | // two free spaces in the buffer to be sure that the next character will fit. |
| 202 | while (i < length - 1) { |
| 203 | if (*src_pos == src_length) break; |
| 204 | unibrow::uchar c = src[*src_pos]; |
| 205 | if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 206 | *src_pos = *src_pos + 1; |
| 207 | } else { |
| 208 | c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
| 209 | src_pos); |
| 210 | } |
| 211 | if (c > kMaxUtf16Character) { |
| 212 | dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 213 | dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 214 | } else { |
| 215 | dest[i++] = static_cast<uc16>(c); |
| 216 | } |
| 217 | } |
| 218 | return i; |
| 219 | } |
| 220 | |
| 221 | |
| 222 | size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { |
| 223 | size_t old_pos = pos_; |
| 224 | size_t target_pos = pos_ + delta; |
| 225 | SetRawPosition(target_pos); |
| 226 | pos_ = raw_character_position_; |
| 227 | ReadBlock(); |
| 228 | return pos_ - old_pos; |
| 229 | } |
| 230 | |
| 231 | |
| 232 | size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { |
| 233 | SetRawPosition(char_position); |
| 234 | if (raw_character_position_ != char_position) { |
| 235 | // char_position was not a valid position in the stream (hit the end |
| 236 | // while spooling to it). |
| 237 | return 0u; |
| 238 | } |
| 239 | size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
| 240 | raw_data_length_); |
| 241 | raw_character_position_ = char_position + i; |
| 242 | return i; |
| 243 | } |
| 244 | |
| 245 | |
| 246 | static const byte kUtf8MultiByteMask = 0xC0; |
| 247 | static const byte kUtf8MultiByteCharFollower = 0x80; |
| 248 | |
| 249 | |
| 250 | #ifdef DEBUG |
| 251 | static const byte kUtf8MultiByteCharStart = 0xC0; |
| 252 | static bool IsUtf8MultiCharacterStart(byte first_byte) { |
| 253 | return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; |
| 254 | } |
| 255 | #endif |
| 256 | |
| 257 | |
| 258 | static bool IsUtf8MultiCharacterFollower(byte later_byte) { |
| 259 | return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; |
| 260 | } |
| 261 | |
| 262 | |
| 263 | // Move the cursor back to point at the preceding UTF-8 character start |
| 264 | // in the buffer. |
| 265 | static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { |
| 266 | byte character = buffer[--*cursor]; |
| 267 | if (character > unibrow::Utf8::kMaxOneByteChar) { |
| 268 | DCHECK(IsUtf8MultiCharacterFollower(character)); |
| 269 | // Last byte of a multi-byte character encoding. Step backwards until |
| 270 | // pointing to the first byte of the encoding, recognized by having the |
| 271 | // top two bits set. |
| 272 | while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } |
| 273 | DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); |
| 274 | } |
| 275 | } |
| 276 | |
| 277 | |
| 278 | // Move the cursor forward to point at the next following UTF-8 character start |
| 279 | // in the buffer. |
| 280 | static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { |
| 281 | byte character = buffer[(*cursor)++]; |
| 282 | if (character > unibrow::Utf8::kMaxOneByteChar) { |
| 283 | // First character of a multi-byte character encoding. |
| 284 | // The number of most-significant one-bits determines the length of the |
| 285 | // encoding: |
| 286 | // 110..... - (0xCx, 0xDx) one additional byte (minimum). |
| 287 | // 1110.... - (0xEx) two additional bytes. |
| 288 | // 11110... - (0xFx) three additional bytes (maximum). |
| 289 | DCHECK(IsUtf8MultiCharacterStart(character)); |
| 290 | // Additional bytes is: |
| 291 | // 1 if value in range 0xC0 .. 0xDF. |
| 292 | // 2 if value in range 0xE0 .. 0xEF. |
| 293 | // 3 if value in range 0xF0 .. 0xF7. |
| 294 | // Encode that in a single value. |
| 295 | size_t additional_bytes = |
| 296 | ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; |
| 297 | *cursor += additional_bytes; |
| 298 | DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); |
| 299 | } |
| 300 | } |
| 301 | |
| 302 | |
| 303 | // This can't set a raw position between two surrogate pairs, since there |
| 304 | // is no position in the UTF8 stream that corresponds to that. This assumes |
| 305 | // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If |
| 306 | // it is illegally coded as two 3 byte sequences then there is no problem here. |
| 307 | void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { |
| 308 | if (raw_character_position_ > target_position) { |
| 309 | // Spool backwards in utf8 buffer. |
| 310 | do { |
| 311 | size_t old_pos = raw_data_pos_; |
| 312 | Utf8CharacterBack(raw_data_, &raw_data_pos_); |
| 313 | raw_character_position_--; |
| 314 | DCHECK(old_pos - raw_data_pos_ <= 4); |
| 315 | // Step back over both code units for surrogate pairs. |
| 316 | if (old_pos - raw_data_pos_ == 4) raw_character_position_--; |
| 317 | } while (raw_character_position_ > target_position); |
| 318 | // No surrogate pair splitting. |
| 319 | DCHECK(raw_character_position_ == target_position); |
| 320 | return; |
| 321 | } |
| 322 | // Spool forwards in the utf8 buffer. |
| 323 | while (raw_character_position_ < target_position) { |
| 324 | if (raw_data_pos_ == raw_data_length_) return; |
| 325 | size_t old_pos = raw_data_pos_; |
| 326 | Utf8CharacterForward(raw_data_, &raw_data_pos_); |
| 327 | raw_character_position_++; |
| 328 | DCHECK(raw_data_pos_ - old_pos <= 4); |
| 329 | if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
| 330 | } |
| 331 | // No surrogate pair splitting. |
| 332 | DCHECK(raw_character_position_ == target_position); |
| 333 | } |
| 334 | |
| 335 | |
| 336 | size_t ExternalStreamingStream::FillBuffer(size_t position) { |
| 337 | // Ignore "position" which is the position in the decoded data. Instead, |
| 338 | // ExternalStreamingStream keeps track of the position in the raw data. |
| 339 | size_t data_in_buffer = 0; |
| 340 | // Note that the UTF-8 decoder might not be able to fill the buffer |
| 341 | // completely; it will typically leave the last character empty (see |
| 342 | // Utf8ToUtf16CharacterStream::CopyChars). |
| 343 | while (data_in_buffer < kBufferSize - 1) { |
| 344 | if (current_data_ == NULL) { |
| 345 | // GetSomeData will wait until the embedder has enough data. Here's an |
| 346 | // interface between the API which uses size_t (which is the correct type |
| 347 | // here) and the internal parts which use size_t. |
| 348 | current_data_length_ = source_stream_->GetMoreData(¤t_data_); |
| 349 | current_data_offset_ = 0; |
| 350 | bool data_ends = current_data_length_ == 0; |
| 351 | bookmark_data_is_from_current_data_ = false; |
| 352 | |
| 353 | // A caveat: a data chunk might end with bytes from an incomplete UTF-8 |
| 354 | // character (the rest of the bytes will be in the next chunk). |
| 355 | if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { |
| 356 | HandleUtf8SplitCharacters(&data_in_buffer); |
| 357 | if (!data_ends && current_data_offset_ == current_data_length_) { |
| 358 | // The data stream didn't end, but we used all the data in the |
| 359 | // chunk. This will only happen when the chunk was really small. We |
| 360 | // don't handle the case where a UTF-8 character is split over several |
| 361 | // chunks; in that case V8 won't crash, but it will be a parse error. |
| 362 | FlushCurrent(); |
| 363 | continue; // Request a new chunk. |
| 364 | } |
| 365 | } |
| 366 | |
| 367 | // Did the data stream end? |
| 368 | if (data_ends) { |
| 369 | DCHECK(utf8_split_char_buffer_length_ == 0); |
| 370 | return data_in_buffer; |
| 371 | } |
| 372 | } |
| 373 | |
| 374 | // Fill the buffer from current_data_. |
| 375 | size_t new_offset = 0; |
| 376 | size_t new_chars_in_buffer = |
| 377 | CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, |
| 378 | current_data_ + current_data_offset_, &new_offset, |
| 379 | current_data_length_ - current_data_offset_, encoding_); |
| 380 | data_in_buffer += new_chars_in_buffer; |
| 381 | current_data_offset_ += new_offset; |
| 382 | DCHECK(data_in_buffer <= kBufferSize); |
| 383 | |
| 384 | // Did we use all the data in the data chunk? |
| 385 | if (current_data_offset_ == current_data_length_) { |
| 386 | FlushCurrent(); |
| 387 | } |
| 388 | } |
| 389 | return data_in_buffer; |
| 390 | } |
| 391 | |
| 392 | |
| 393 | bool ExternalStreamingStream::SetBookmark() { |
| 394 | // Bookmarking for this stream is a bit more complex than expected, since |
| 395 | // the stream state is distributed over several places: |
| 396 | // - pos_ (inherited from Utf16CharacterStream) |
| 397 | // - buffer_cursor_ and buffer_end_ (also from Utf16CharacterStream) |
| 398 | // - buffer_ (from BufferedUtf16CharacterStream) |
| 399 | // - current_data_ (+ .._offset_ and .._length) (this class) |
| 400 | // - utf8_split_char_buffer_* (a partial utf8 symbol at the block boundary) |
| 401 | // |
| 402 | // The underlying source_stream_ instance likely could re-construct this |
| 403 | // local data for us, but with the given interfaces we have no way of |
| 404 | // accomplishing this. Thus, we'll have to save all data locally. |
| 405 | // |
| 406 | // What gets saved where: |
| 407 | // - pos_ => bookmark_ |
| 408 | // - buffer_[buffer_cursor_ .. buffer_end_] => bookmark_buffer_ |
| 409 | // - current_data_[.._offset_ .. .._length_] => bookmark_data_ |
| 410 | // - utf8_split_char_buffer_* => bookmark_utf8_split... |
| 411 | // |
| 412 | // To make sure we don't unnecessarily copy data, we also maintain |
| 413 | // whether bookmark_data_ contains a copy of the current current_data_ |
| 414 | // block. This is done with: |
| 415 | // - bookmark_data_is_from_current_data_ |
| 416 | // - bookmark_data_offset_: offset into bookmark_data_ |
| 417 | // |
| 418 | // Note that bookmark_data_is_from_current_data_ must be maintained |
| 419 | // whenever current_data_ is updated. |
| 420 | |
| 421 | bookmark_ = pos_; |
| 422 | |
| 423 | size_t buffer_length = buffer_end_ - buffer_cursor_; |
| 424 | bookmark_buffer_.Dispose(); |
| 425 | bookmark_buffer_ = Vector<uint16_t>::New(static_cast<int>(buffer_length)); |
| 426 | CopyCharsUnsigned(bookmark_buffer_.start(), buffer_cursor_, buffer_length); |
| 427 | |
| 428 | size_t data_length = current_data_length_ - current_data_offset_; |
| 429 | size_t bookmark_data_length = static_cast<size_t>(bookmark_data_.length()); |
| 430 | if (bookmark_data_is_from_current_data_ && |
| 431 | data_length < bookmark_data_length) { |
| 432 | // Fast case: bookmark_data_ was previously copied from the current |
| 433 | // data block, and we have enough data for this bookmark. |
| 434 | bookmark_data_offset_ = bookmark_data_length - data_length; |
| 435 | } else { |
| 436 | // Slow case: We need to copy current_data_. |
| 437 | bookmark_data_.Dispose(); |
| 438 | bookmark_data_ = Vector<uint8_t>::New(static_cast<int>(data_length)); |
| 439 | CopyBytes(bookmark_data_.start(), current_data_ + current_data_offset_, |
| 440 | data_length); |
| 441 | bookmark_data_is_from_current_data_ = true; |
| 442 | bookmark_data_offset_ = 0; |
| 443 | } |
| 444 | |
| 445 | bookmark_utf8_split_char_buffer_length_ = utf8_split_char_buffer_length_; |
| 446 | for (size_t i = 0; i < utf8_split_char_buffer_length_; i++) { |
| 447 | bookmark_utf8_split_char_buffer_[i] = utf8_split_char_buffer_[i]; |
| 448 | } |
| 449 | |
| 450 | return source_stream_->SetBookmark(); |
| 451 | } |
| 452 | |
| 453 | |
| 454 | void ExternalStreamingStream::ResetToBookmark() { |
| 455 | source_stream_->ResetToBookmark(); |
| 456 | FlushCurrent(); |
| 457 | |
| 458 | pos_ = bookmark_; |
| 459 | |
| 460 | // bookmark_data_* => current_data_* |
| 461 | // (current_data_ assumes ownership of its memory.) |
| 462 | current_data_offset_ = 0; |
| 463 | current_data_length_ = bookmark_data_.length() - bookmark_data_offset_; |
| 464 | uint8_t* data = new uint8_t[current_data_length_]; |
| 465 | CopyCharsUnsigned(data, bookmark_data_.begin() + bookmark_data_offset_, |
| 466 | current_data_length_); |
| 467 | delete[] current_data_; |
| 468 | current_data_ = data; |
| 469 | bookmark_data_is_from_current_data_ = true; |
| 470 | |
| 471 | // bookmark_buffer_ needs to be copied to buffer_. |
| 472 | CopyCharsUnsigned(buffer_, bookmark_buffer_.begin(), |
| 473 | bookmark_buffer_.length()); |
| 474 | buffer_cursor_ = buffer_; |
| 475 | buffer_end_ = buffer_ + bookmark_buffer_.length(); |
| 476 | |
| 477 | // utf8 split char buffer |
| 478 | utf8_split_char_buffer_length_ = bookmark_utf8_split_char_buffer_length_; |
| 479 | for (size_t i = 0; i < bookmark_utf8_split_char_buffer_length_; i++) { |
| 480 | utf8_split_char_buffer_[i] = bookmark_utf8_split_char_buffer_[i]; |
| 481 | } |
| 482 | } |
| 483 | |
| 484 | |
| 485 | void ExternalStreamingStream::FlushCurrent() { |
| 486 | delete[] current_data_; |
| 487 | current_data_ = NULL; |
| 488 | current_data_length_ = 0; |
| 489 | current_data_offset_ = 0; |
| 490 | bookmark_data_is_from_current_data_ = false; |
| 491 | } |
| 492 | |
| 493 | |
| 494 | void ExternalStreamingStream::HandleUtf8SplitCharacters( |
| 495 | size_t* data_in_buffer) { |
| 496 | // Note the following property of UTF-8 which makes this function possible: |
| 497 | // Given any byte, we can always read its local environment (in both |
| 498 | // directions) to find out the (possibly multi-byte) character it belongs |
| 499 | // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a |
| 500 | // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or |
| 501 | // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. |
| 502 | |
| 503 | // First check if we have leftover data from the last chunk. |
| 504 | unibrow::uchar c; |
| 505 | if (utf8_split_char_buffer_length_ > 0) { |
| 506 | // Move the bytes which are part of the split character (which started in |
| 507 | // the previous chunk) into utf8_split_char_buffer_. Note that the |
| 508 | // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. |
| 509 | while (current_data_offset_ < current_data_length_ && |
| 510 | utf8_split_char_buffer_length_ < 4 && |
| 511 | (c = current_data_[current_data_offset_]) >> 6 == 2) { |
| 512 | utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
| 513 | ++utf8_split_char_buffer_length_; |
| 514 | ++current_data_offset_; |
| 515 | } |
| 516 | |
| 517 | // Convert the data in utf8_split_char_buffer_. |
| 518 | size_t new_offset = 0; |
| 519 | size_t new_chars_in_buffer = |
| 520 | CopyCharsHelper(buffer_ + *data_in_buffer, |
| 521 | kBufferSize - *data_in_buffer, utf8_split_char_buffer_, |
| 522 | &new_offset, utf8_split_char_buffer_length_, encoding_); |
| 523 | *data_in_buffer += new_chars_in_buffer; |
| 524 | // Make sure we used all the data. |
| 525 | DCHECK(new_offset == utf8_split_char_buffer_length_); |
| 526 | DCHECK(*data_in_buffer <= kBufferSize); |
| 527 | |
| 528 | utf8_split_char_buffer_length_ = 0; |
| 529 | } |
| 530 | |
| 531 | // Move bytes which are part of an incomplete character from the end of the |
| 532 | // current chunk to utf8_split_char_buffer_. They will be converted when the |
| 533 | // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 |
| 534 | // bytes long, but if the data is invalid, we can have character values bigger |
| 535 | // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. |
| 536 | while (current_data_length_ > current_data_offset_ && |
| 537 | (c = current_data_[current_data_length_ - 1]) > |
| 538 | unibrow::Utf8::kMaxOneByteChar && |
| 539 | utf8_split_char_buffer_length_ < 4) { |
| 540 | --current_data_length_; |
| 541 | ++utf8_split_char_buffer_length_; |
| 542 | if (c >= (3 << 6)) { |
| 543 | // 3 << 6 = 0b11000000; this is the first byte of the multi-byte |
| 544 | // character. No need to copy the previous characters into the conversion |
| 545 | // buffer (even if they're multi-byte). |
| 546 | break; |
| 547 | } |
| 548 | } |
| 549 | CHECK(utf8_split_char_buffer_length_ <= 4); |
| 550 | for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) { |
| 551 | utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
| 552 | } |
| 553 | } |
| 554 | |
| 555 | |
| 556 | // ---------------------------------------------------------------------------- |
| 557 | // ExternalTwoByteStringUtf16CharacterStream |
| 558 | |
| 559 | ExternalTwoByteStringUtf16CharacterStream:: |
| 560 | ~ExternalTwoByteStringUtf16CharacterStream() { } |
| 561 | |
| 562 | |
| 563 | ExternalTwoByteStringUtf16CharacterStream:: |
| 564 | ExternalTwoByteStringUtf16CharacterStream( |
| 565 | Handle<ExternalTwoByteString> data, int start_position, |
| 566 | int end_position) |
| 567 | : Utf16CharacterStream(), |
| 568 | source_(data), |
| 569 | raw_data_(data->GetTwoByteData(start_position)), |
| 570 | bookmark_(kNoBookmark) { |
| 571 | buffer_cursor_ = raw_data_, |
| 572 | buffer_end_ = raw_data_ + (end_position - start_position); |
| 573 | pos_ = start_position; |
| 574 | } |
| 575 | |
| 576 | |
| 577 | bool ExternalTwoByteStringUtf16CharacterStream::SetBookmark() { |
| 578 | bookmark_ = pos_; |
| 579 | return true; |
| 580 | } |
| 581 | |
| 582 | |
| 583 | void ExternalTwoByteStringUtf16CharacterStream::ResetToBookmark() { |
| 584 | DCHECK(bookmark_ != kNoBookmark); |
| 585 | pos_ = bookmark_; |
| 586 | buffer_cursor_ = raw_data_ + bookmark_; |
| 587 | } |
| 588 | } // namespace internal |
| 589 | } // namespace v8 |