blob: 96a778cfb7ce2fd1f0f6158fd06aac7fa7f5fb39 [file] [log] [blame]
Ben Murdoch014dc512016-03-22 12:00:34 +00001// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/jsregexp.h"
6
Ben Murdochf91f0612016-11-29 16:50:11 +00007#include <memory>
8
Ben Murdoch014dc512016-03-22 12:00:34 +00009#include "src/base/platform/platform.h"
10#include "src/compilation-cache.h"
Ben Murdochf91f0612016-11-29 16:50:11 +000011#include "src/elements.h"
Ben Murdoch014dc512016-03-22 12:00:34 +000012#include "src/execution.h"
13#include "src/factory.h"
14#include "src/isolate-inl.h"
15#include "src/messages.h"
16#include "src/ostreams.h"
17#include "src/regexp/interpreter-irregexp.h"
18#include "src/regexp/jsregexp-inl.h"
Ben Murdoch014dc512016-03-22 12:00:34 +000019#include "src/regexp/regexp-macro-assembler-irregexp.h"
20#include "src/regexp/regexp-macro-assembler-tracer.h"
Ben Murdochf91f0612016-11-29 16:50:11 +000021#include "src/regexp/regexp-macro-assembler.h"
Ben Murdoch014dc512016-03-22 12:00:34 +000022#include "src/regexp/regexp-parser.h"
23#include "src/regexp/regexp-stack.h"
24#include "src/runtime/runtime.h"
25#include "src/splay-tree-inl.h"
26#include "src/string-search.h"
27#include "src/unicode-decoder.h"
28
Ben Murdoch109988c2016-05-18 11:27:45 +010029#ifdef V8_I18N_SUPPORT
30#include "unicode/uset.h"
31#include "unicode/utypes.h"
32#endif // V8_I18N_SUPPORT
33
Ben Murdoch014dc512016-03-22 12:00:34 +000034#ifndef V8_INTERPRETED_REGEXP
35#if V8_TARGET_ARCH_IA32
36#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
37#elif V8_TARGET_ARCH_X64
38#include "src/regexp/x64/regexp-macro-assembler-x64.h"
39#elif V8_TARGET_ARCH_ARM64
40#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
41#elif V8_TARGET_ARCH_ARM
42#include "src/regexp/arm/regexp-macro-assembler-arm.h"
43#elif V8_TARGET_ARCH_PPC
44#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdoch3b9bc312016-06-02 14:46:10 +010045#elif V8_TARGET_ARCH_S390
46#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch014dc512016-03-22 12:00:34 +000047#elif V8_TARGET_ARCH_MIPS
48#include "src/regexp/mips/regexp-macro-assembler-mips.h"
49#elif V8_TARGET_ARCH_MIPS64
50#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
51#elif V8_TARGET_ARCH_X87
52#include "src/regexp/x87/regexp-macro-assembler-x87.h"
53#else
54#error Unsupported target architecture.
55#endif
56#endif
57
58
59namespace v8 {
60namespace internal {
61
62MUST_USE_RESULT
63static inline MaybeHandle<Object> ThrowRegExpException(
64 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65 Isolate* isolate = re->GetIsolate();
66 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 pattern, error_text),
68 Object);
69}
70
71
72inline void ThrowRegExpException(Handle<JSRegExp> re,
73 Handle<String> error_text) {
74 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75}
76
77
78ContainedInLattice AddRange(ContainedInLattice containment,
79 const int* ranges,
80 int ranges_length,
81 Interval new_range) {
82 DCHECK((ranges_length & 1) == 1);
Ben Murdoch109988c2016-05-18 11:27:45 +010083 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch014dc512016-03-22 12:00:34 +000084 if (containment == kLatticeUnknown) return containment;
85 bool inside = false;
86 int last = 0;
87 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 // Consider the range from last to ranges[i].
89 // We haven't got to the new range yet.
90 if (ranges[i] <= new_range.from()) continue;
91 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 // inclusive, but the values in ranges are not.
93 if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 }
96 return kLatticeUnknown;
97 }
98 return containment;
99}
100
101
102// More makes code generation slower, less makes V8 benchmark score lower.
103const int kMaxLookaheadForBoyerMoore = 8;
104// In a 3-character pattern you can maximally step forwards 3 characters
105// at a time, which is not always enough to pay for the extra logic.
106const int kPatternTooShortForBoyerMoore = 2;
107
108
109// Identifies the sort of regexps where the regexp engine is faster
110// than the code used for atom matches.
111static bool HasFewDifferentCharacters(Handle<String> pattern) {
112 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
113 if (length <= kPatternTooShortForBoyerMoore) return false;
114 const int kMod = 128;
115 bool character_found[kMod];
116 int different = 0;
117 memset(&character_found[0], 0, sizeof(character_found));
118 for (int i = 0; i < length; i++) {
119 int ch = (pattern->Get(i) & (kMod - 1));
120 if (!character_found[ch]) {
121 character_found[ch] = true;
122 different++;
123 // We declare a regexp low-alphabet if it has at least 3 times as many
124 // characters as it has different characters.
125 if (different * 3 > length) return false;
126 }
127 }
128 return true;
129}
130
131
132// Generic RegExp methods. Dispatches to implementation specific methods.
133
134
135MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
136 Handle<String> pattern,
137 JSRegExp::Flags flags) {
138 Isolate* isolate = re->GetIsolate();
Ben Murdoch3b9bc312016-06-02 14:46:10 +0100139 Zone zone(isolate->allocator());
Ben Murdoch014dc512016-03-22 12:00:34 +0000140 CompilationCache* compilation_cache = isolate->compilation_cache();
141 MaybeHandle<FixedArray> maybe_cached =
142 compilation_cache->LookupRegExp(pattern, flags);
143 Handle<FixedArray> cached;
144 bool in_cache = maybe_cached.ToHandle(&cached);
145 LOG(isolate, RegExpCompileEvent(re, in_cache));
146
147 Handle<Object> result;
148 if (in_cache) {
149 re->set_data(*cached);
150 return re;
151 }
152 pattern = String::Flatten(pattern);
153 PostponeInterruptsScope postpone(isolate);
154 RegExpCompileData parse_result;
155 FlatStringReader reader(isolate, pattern);
Ben Murdoch109988c2016-05-18 11:27:45 +0100156 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
157 &parse_result)) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000158 // Throw an exception if we fail to parse the pattern.
159 return ThrowRegExpException(re, pattern, parse_result.error);
160 }
161
162 bool has_been_compiled = false;
163
164 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
165 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
166 // Parse-tree is a single atom that is equal to the pattern.
167 AtomCompile(re, pattern, flags, pattern);
168 has_been_compiled = true;
169 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
170 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
171 RegExpAtom* atom = parse_result.tree->AsAtom();
172 Vector<const uc16> atom_pattern = atom->data();
173 Handle<String> atom_string;
174 ASSIGN_RETURN_ON_EXCEPTION(
175 isolate, atom_string,
176 isolate->factory()->NewStringFromTwoByte(atom_pattern),
177 Object);
178 if (!HasFewDifferentCharacters(atom_string)) {
179 AtomCompile(re, pattern, flags, atom_string);
180 has_been_compiled = true;
181 }
182 }
183 if (!has_been_compiled) {
184 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
185 }
186 DCHECK(re->data()->IsFixedArray());
187 // Compilation succeeded so the data is set on the regexp
188 // and we can store it in the cache.
189 Handle<FixedArray> data(FixedArray::cast(re->data()));
190 compilation_cache->PutRegExp(pattern, flags, data);
191
192 return re;
193}
194
Ben Murdoch014dc512016-03-22 12:00:34 +0000195MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
Ben Murdochf91f0612016-11-29 16:50:11 +0000196 Handle<String> subject, int index,
197 Handle<JSObject> last_match_info) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000198 switch (regexp->TypeTag()) {
199 case JSRegExp::ATOM:
200 return AtomExec(regexp, subject, index, last_match_info);
201 case JSRegExp::IRREGEXP: {
202 return IrregexpExec(regexp, subject, index, last_match_info);
203 }
204 default:
205 UNREACHABLE();
206 return MaybeHandle<Object>();
207 }
208}
209
210
211// RegExp Atom implementation: Simple string search using indexOf.
212
213
214void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
215 Handle<String> pattern,
216 JSRegExp::Flags flags,
217 Handle<String> match_pattern) {
218 re->GetIsolate()->factory()->SetRegExpAtomData(re,
219 JSRegExp::ATOM,
220 pattern,
221 flags,
222 match_pattern);
223}
224
225
226static void SetAtomLastCapture(FixedArray* array,
227 String* subject,
228 int from,
229 int to) {
230 SealHandleScope shs(array->GetIsolate());
231 RegExpImpl::SetLastCaptureCount(array, 2);
232 RegExpImpl::SetLastSubject(array, subject);
233 RegExpImpl::SetLastInput(array, subject);
234 RegExpImpl::SetCapture(array, 0, from);
235 RegExpImpl::SetCapture(array, 1, to);
236}
237
238
239int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
240 Handle<String> subject,
241 int index,
242 int32_t* output,
243 int output_size) {
244 Isolate* isolate = regexp->GetIsolate();
245
246 DCHECK(0 <= index);
247 DCHECK(index <= subject->length());
248
249 subject = String::Flatten(subject);
250 DisallowHeapAllocation no_gc; // ensure vectors stay valid
251
252 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
253 int needle_len = needle->length();
254 DCHECK(needle->IsFlat());
255 DCHECK_LT(0, needle_len);
256
257 if (index + needle_len > subject->length()) {
258 return RegExpImpl::RE_FAILURE;
259 }
260
261 for (int i = 0; i < output_size; i += 2) {
262 String::FlatContent needle_content = needle->GetFlatContent();
263 String::FlatContent subject_content = subject->GetFlatContent();
264 DCHECK(needle_content.IsFlat());
265 DCHECK(subject_content.IsFlat());
266 // dispatch on type of strings
267 index =
268 (needle_content.IsOneByte()
269 ? (subject_content.IsOneByte()
270 ? SearchString(isolate, subject_content.ToOneByteVector(),
271 needle_content.ToOneByteVector(), index)
272 : SearchString(isolate, subject_content.ToUC16Vector(),
273 needle_content.ToOneByteVector(), index))
274 : (subject_content.IsOneByte()
275 ? SearchString(isolate, subject_content.ToOneByteVector(),
276 needle_content.ToUC16Vector(), index)
277 : SearchString(isolate, subject_content.ToUC16Vector(),
278 needle_content.ToUC16Vector(), index)));
279 if (index == -1) {
280 return i / 2; // Return number of matches.
281 } else {
282 output[i] = index;
283 output[i+1] = index + needle_len;
284 index += needle_len;
285 }
286 }
287 return output_size / 2;
288}
289
Ben Murdochf91f0612016-11-29 16:50:11 +0000290Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
Ben Murdoch014dc512016-03-22 12:00:34 +0000291 int index,
Ben Murdochf91f0612016-11-29 16:50:11 +0000292 Handle<JSObject> last_match_info) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000293 Isolate* isolate = re->GetIsolate();
294
295 static const int kNumRegisters = 2;
296 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
297 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
298
299 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
300
301 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
302
303 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
304 SealHandleScope shs(isolate);
305 FixedArray* array = FixedArray::cast(last_match_info->elements());
306 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
307 return last_match_info;
308}
309
310
311// Irregexp implementation.
312
313// Ensures that the regexp object contains a compiled version of the
314// source for either one-byte or two-byte subject strings.
315// If the compiled version doesn't already exist, it is compiled
316// from the source pattern.
317// If compilation fails, an exception is thrown and this function
318// returns false.
319bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
320 Handle<String> sample_subject,
321 bool is_one_byte) {
322 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
323#ifdef V8_INTERPRETED_REGEXP
324 if (compiled_code->IsByteArray()) return true;
325#else // V8_INTERPRETED_REGEXP (RegExp native code)
326 if (compiled_code->IsCode()) return true;
327#endif
328 // We could potentially have marked this as flushable, but have kept
329 // a saved version if we did not flush it yet.
330 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
331 if (saved_code->IsCode()) {
332 // Reinstate the code in the original place.
333 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
334 DCHECK(compiled_code->IsSmi());
335 return true;
336 }
337 return CompileIrregexp(re, sample_subject, is_one_byte);
338}
339
340
341bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
342 Handle<String> sample_subject,
343 bool is_one_byte) {
344 // Compile the RegExp.
345 Isolate* isolate = re->GetIsolate();
Ben Murdoch3b9bc312016-06-02 14:46:10 +0100346 Zone zone(isolate->allocator());
Ben Murdoch014dc512016-03-22 12:00:34 +0000347 PostponeInterruptsScope postpone(isolate);
348 // If we had a compilation error the last time this is saved at the
349 // saved code index.
350 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
351 // When arriving here entry can only be a smi, either representing an
352 // uncompiled regexp, a previous compilation error, or code that has
353 // been flushed.
354 DCHECK(entry->IsSmi());
355 int entry_value = Smi::cast(entry)->value();
356 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
357 entry_value == JSRegExp::kCompilationErrorValue ||
358 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
359
360 if (entry_value == JSRegExp::kCompilationErrorValue) {
361 // A previous compilation failed and threw an error which we store in
362 // the saved code index (we store the error message, not the actual
363 // error). Recreate the error object and throw it.
364 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
365 DCHECK(error_string->IsString());
366 Handle<String> error_message(String::cast(error_string));
367 ThrowRegExpException(re, error_message);
368 return false;
369 }
370
371 JSRegExp::Flags flags = re->GetFlags();
372
373 Handle<String> pattern(re->Pattern());
374 pattern = String::Flatten(pattern);
375 RegExpCompileData compile_data;
376 FlatStringReader reader(isolate, pattern);
Ben Murdoch109988c2016-05-18 11:27:45 +0100377 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
378 &compile_data)) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000379 // Throw an exception if we fail to parse the pattern.
380 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
381 USE(ThrowRegExpException(re, pattern, compile_data.error));
382 return false;
383 }
Ben Murdoch109988c2016-05-18 11:27:45 +0100384 RegExpEngine::CompilationResult result =
385 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
386 sample_subject, is_one_byte);
Ben Murdoch014dc512016-03-22 12:00:34 +0000387 if (result.error_message != NULL) {
388 // Unable to compile regexp.
389 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
390 CStrVector(result.error_message)).ToHandleChecked();
391 ThrowRegExpException(re, error_message);
392 return false;
393 }
394
395 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
396 data->set(JSRegExp::code_index(is_one_byte), result.code);
Ben Murdoch13e2dad2016-09-16 13:49:30 +0100397 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
Ben Murdoch014dc512016-03-22 12:00:34 +0000398 int register_max = IrregexpMaxRegisterCount(*data);
399 if (result.num_registers > register_max) {
400 SetIrregexpMaxRegisterCount(*data, result.num_registers);
401 }
402
403 return true;
404}
405
406
407int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
408 return Smi::cast(
409 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
410}
411
412
413void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
414 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
415}
416
Ben Murdoch13e2dad2016-09-16 13:49:30 +0100417void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
418 Handle<FixedArray> value) {
419 if (value.is_null()) {
420 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
421 } else {
422 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
423 }
424}
Ben Murdoch014dc512016-03-22 12:00:34 +0000425
426int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
427 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
428}
429
430
431int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
432 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
433}
434
435
436ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
437 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
438}
439
440
441Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
442 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
443}
444
445
446void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
447 Handle<String> pattern,
448 JSRegExp::Flags flags,
449 int capture_count) {
450 // Initialize compiled code entries to null.
451 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
452 JSRegExp::IRREGEXP,
453 pattern,
454 flags,
455 capture_count);
456}
457
458
459int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
460 Handle<String> subject) {
461 subject = String::Flatten(subject);
462
463 // Check representation of the underlying storage.
464 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
465 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
466
467#ifdef V8_INTERPRETED_REGEXP
468 // Byte-code regexp needs space allocated for all its registers.
469 // The result captures are copied to the start of the registers array
470 // if the match succeeds. This way those registers are not clobbered
471 // when we set the last match info from last successful match.
472 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
473 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
474#else // V8_INTERPRETED_REGEXP
475 // Native regexp only needs room to output captures. Registers are handled
476 // internally.
477 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
478#endif // V8_INTERPRETED_REGEXP
479}
480
481
482int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
483 Handle<String> subject,
484 int index,
485 int32_t* output,
486 int output_size) {
487 Isolate* isolate = regexp->GetIsolate();
488
489 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
490
491 DCHECK(index >= 0);
492 DCHECK(index <= subject->length());
493 DCHECK(subject->IsFlat());
494
495 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
496
497#ifndef V8_INTERPRETED_REGEXP
498 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
499 do {
500 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
501 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
502 // The stack is used to allocate registers for the compiled regexp code.
503 // This means that in case of failure, the output registers array is left
504 // untouched and contains the capture results from the previous successful
505 // match. We can use that to set the last match info lazily.
506 NativeRegExpMacroAssembler::Result res =
507 NativeRegExpMacroAssembler::Match(code,
508 subject,
509 output,
510 output_size,
511 index,
512 isolate);
513 if (res != NativeRegExpMacroAssembler::RETRY) {
514 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
515 isolate->has_pending_exception());
516 STATIC_ASSERT(
517 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
518 STATIC_ASSERT(
519 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
520 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
521 == RE_EXCEPTION);
522 return static_cast<IrregexpResult>(res);
523 }
524 // If result is RETRY, the string has changed representation, and we
525 // must restart from scratch.
526 // In this case, it means we must make sure we are prepared to handle
527 // the, potentially, different subject (the string can switch between
528 // being internal and external, and even between being Latin1 and UC16,
529 // but the characters are always the same).
530 IrregexpPrepare(regexp, subject);
531 is_one_byte = subject->IsOneByteRepresentationUnderneath();
532 } while (true);
533 UNREACHABLE();
534 return RE_EXCEPTION;
535#else // V8_INTERPRETED_REGEXP
536
537 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
538 // We must have done EnsureCompiledIrregexp, so we can get the number of
539 // registers.
540 int number_of_capture_registers =
541 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
542 int32_t* raw_output = &output[number_of_capture_registers];
543 // We do not touch the actual capture result registers until we know there
544 // has been a match so that we can use those capture results to set the
545 // last match info.
546 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
547 raw_output[i] = -1;
548 }
549 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
550 isolate);
551
552 IrregexpResult result = IrregexpInterpreter::Match(isolate,
553 byte_codes,
554 subject,
555 raw_output,
556 index);
557 if (result == RE_SUCCESS) {
558 // Copy capture results to the start of the registers array.
559 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
560 }
561 if (result == RE_EXCEPTION) {
562 DCHECK(!isolate->has_pending_exception());
563 isolate->StackOverflow();
564 }
565 return result;
566#endif // V8_INTERPRETED_REGEXP
567}
568
Ben Murdoch014dc512016-03-22 12:00:34 +0000569MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
570 Handle<String> subject,
571 int previous_index,
Ben Murdochf91f0612016-11-29 16:50:11 +0000572 Handle<JSObject> last_match_info) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000573 Isolate* isolate = regexp->GetIsolate();
574 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
575
576 // Prepare space for the return values.
577#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
578 if (FLAG_trace_regexp_bytecodes) {
579 String* pattern = regexp->Pattern();
580 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
581 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
582 }
583#endif
584 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
585 if (required_registers < 0) {
586 // Compiling failed with an exception.
587 DCHECK(isolate->has_pending_exception());
588 return MaybeHandle<Object>();
589 }
590
591 int32_t* output_registers = NULL;
592 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
593 output_registers = NewArray<int32_t>(required_registers);
594 }
Ben Murdochf91f0612016-11-29 16:50:11 +0000595 std::unique_ptr<int32_t[]> auto_release(output_registers);
Ben Murdoch014dc512016-03-22 12:00:34 +0000596 if (output_registers == NULL) {
597 output_registers = isolate->jsregexp_static_offsets_vector();
598 }
599
600 int res = RegExpImpl::IrregexpExecRaw(
601 regexp, subject, previous_index, output_registers, required_registers);
602 if (res == RE_SUCCESS) {
603 int capture_count =
604 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
605 return SetLastMatchInfo(
606 last_match_info, subject, capture_count, output_registers);
607 }
608 if (res == RE_EXCEPTION) {
609 DCHECK(isolate->has_pending_exception());
610 return MaybeHandle<Object>();
611 }
612 DCHECK(res == RE_FAILURE);
613 return isolate->factory()->null_value();
614}
615
Ben Murdochf91f0612016-11-29 16:50:11 +0000616static void EnsureSize(Handle<JSObject> array, uint32_t minimum_size) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000617 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
Ben Murdochf91f0612016-11-29 16:50:11 +0000618 array->GetElementsAccessor()->GrowCapacityAndConvert(array, minimum_size);
Ben Murdoch014dc512016-03-22 12:00:34 +0000619 }
620}
621
Ben Murdochf91f0612016-11-29 16:50:11 +0000622Handle<JSObject> RegExpImpl::SetLastMatchInfo(Handle<JSObject> last_match_info,
623 Handle<String> subject,
624 int capture_count,
625 int32_t* match) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000626 DCHECK(last_match_info->HasFastObjectElements());
627 int capture_register_count = (capture_count + 1) * 2;
628 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
629 DisallowHeapAllocation no_allocation;
630 FixedArray* array = FixedArray::cast(last_match_info->elements());
631 if (match != NULL) {
632 for (int i = 0; i < capture_register_count; i += 2) {
633 SetCapture(array, i, match[i]);
634 SetCapture(array, i + 1, match[i + 1]);
635 }
636 }
637 SetLastCaptureCount(array, capture_register_count);
638 SetLastSubject(array, *subject);
639 SetLastInput(array, *subject);
640 return last_match_info;
641}
642
643
644RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
645 Handle<String> subject,
Ben Murdoch014dc512016-03-22 12:00:34 +0000646 Isolate* isolate)
647 : register_array_(NULL),
648 register_array_size_(0),
649 regexp_(regexp),
650 subject_(subject) {
651#ifdef V8_INTERPRETED_REGEXP
652 bool interpreted = true;
653#else
654 bool interpreted = false;
655#endif // V8_INTERPRETED_REGEXP
656
657 if (regexp_->TypeTag() == JSRegExp::ATOM) {
658 static const int kAtomRegistersPerMatch = 2;
659 registers_per_match_ = kAtomRegistersPerMatch;
660 // There is no distinction between interpreted and native for atom regexps.
661 interpreted = false;
662 } else {
663 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
664 if (registers_per_match_ < 0) {
665 num_matches_ = -1; // Signal exception.
666 return;
667 }
668 }
669
Ben Murdoch109988c2016-05-18 11:27:45 +0100670 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
671 if (!interpreted) {
Ben Murdoch014dc512016-03-22 12:00:34 +0000672 register_array_size_ =
673 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
674 max_matches_ = register_array_size_ / registers_per_match_;
675 } else {
676 // Global loop in interpreted regexp is not implemented. We choose
677 // the size of the offsets vector so that it can only store one match.
678 register_array_size_ = registers_per_match_;
679 max_matches_ = 1;
680 }
681
682 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
683 register_array_ = NewArray<int32_t>(register_array_size_);
684 } else {
685 register_array_ = isolate->jsregexp_static_offsets_vector();
686 }
687
688 // Set state so that fetching the results the first time triggers a call
689 // to the compiled regexp.
690 current_match_index_ = max_matches_ - 1;
691 num_matches_ = max_matches_;
692 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
693 DCHECK_GE(register_array_size_, registers_per_match_);
694 int32_t* last_match =
695 &register_array_[current_match_index_ * registers_per_match_];
696 last_match[0] = -1;
697 last_match[1] = 0;
698}
699
Ben Murdoch109988c2016-05-18 11:27:45 +0100700int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
701 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
702 last_index + 1 < subject_->length() &&
703 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
704 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
705 // Advance over the surrogate pair.
706 return last_index + 2;
707 }
708 return last_index + 1;
709}
Ben Murdoch014dc512016-03-22 12:00:34 +0000710
711// -------------------------------------------------------------------
712// Implementation of the Irregexp regular expression engine.
713//
714// The Irregexp regular expression engine is intended to be a complete
715// implementation of ECMAScript regular expressions. It generates either
716// bytecodes or native code.
717
718// The Irregexp regexp engine is structured in three steps.
719// 1) The parser generates an abstract syntax tree. See ast.cc.
720// 2) From the AST a node network is created. The nodes are all
721// subclasses of RegExpNode. The nodes represent states when
722// executing a regular expression. Several optimizations are
723// performed on the node network.
724// 3) From the nodes we generate either byte codes or native code
725// that can actually execute the regular expression (perform
726// the search). The code generation step is described in more
727// detail below.
728
729// Code generation.
730//
731// The nodes are divided into four main categories.
732// * Choice nodes
733// These represent places where the regular expression can
734// match in more than one way. For example on entry to an
735// alternation (foo|bar) or a repetition (*, +, ? or {}).
736// * Action nodes
737// These represent places where some action should be
738// performed. Examples include recording the current position
739// in the input string to a register (in order to implement
740// captures) or other actions on register for example in order
741// to implement the counters needed for {} repetitions.
742// * Matching nodes
743// These attempt to match some element part of the input string.
744// Examples of elements include character classes, plain strings
745// or back references.
746// * End nodes
747// These are used to implement the actions required on finding
748// a successful match or failing to find a match.
749//
750// The code generated (whether as byte codes or native code) maintains
751// some state as it runs. This consists of the following elements:
752//
753// * The capture registers. Used for string captures.
754// * Other registers. Used for counters etc.
755// * The current position.
756// * The stack of backtracking information. Used when a matching node
757// fails to find a match and needs to try an alternative.
758//
759// Conceptual regular expression execution model:
760//
761// There is a simple conceptual model of regular expression execution
762// which will be presented first. The actual code generated is a more
763// efficient simulation of the simple conceptual model:
764//
765// * Choice nodes are implemented as follows:
766// For each choice except the last {
767// push current position
768// push backtrack code location
769// <generate code to test for choice>
770// backtrack code location:
771// pop current position
772// }
773// <generate code to test for last choice>
774//
775// * Actions nodes are generated as follows
776// <push affected registers on backtrack stack>
777// <generate code to perform action>
778// push backtrack code location
779// <generate code to test for following nodes>
780// backtrack code location:
781// <pop affected registers to restore their state>
782// <pop backtrack location from stack and go to it>
783//
784// * Matching nodes are generated as follows:
785// if input string matches at current position
786// update current position
787// <generate code to test for following nodes>
788// else
789// <pop backtrack location from stack and go to it>
790//
791// Thus it can be seen that the current position is saved and restored
792// by the choice nodes, whereas the registers are saved and restored by
793// by the action nodes that manipulate them.
794//
795// The other interesting aspect of this model is that nodes are generated
796// at the point where they are needed by a recursive call to Emit(). If
797// the node has already been code generated then the Emit() call will
798// generate a jump to the previously generated code instead. In order to
799// limit recursion it is possible for the Emit() function to put the node
800// on a work list for later generation and instead generate a jump. The
801// destination of the jump is resolved later when the code is generated.
802//
803// Actual regular expression code generation.
804//
805// Code generation is actually more complicated than the above. In order
806// to improve the efficiency of the generated code some optimizations are
807// performed
808//
809// * Choice nodes have 1-character lookahead.
810// A choice node looks at the following character and eliminates some of
811// the choices immediately based on that character. This is not yet
812// implemented.
813// * Simple greedy loops store reduced backtracking information.
814// A quantifier like /.*foo/m will greedily match the whole input. It will
815// then need to backtrack to a point where it can match "foo". The naive
816// implementation of this would push each character position onto the
817// backtracking stack, then pop them off one by one. This would use space
818// proportional to the length of the input string. However since the "."
819// can only match in one way and always has a constant length (in this case
820// of 1) it suffices to store the current position on the top of the stack
821// once. Matching now becomes merely incrementing the current position and
822// backtracking becomes decrementing the current position and checking the
823// result against the stored current position. This is faster and saves
824// space.
825// * The current state is virtualized.
826// This is used to defer expensive operations until it is clear that they
827// are needed and to generate code for a node more than once, allowing
828// specialized an efficient versions of the code to be created. This is
829// explained in the section below.
830//
831// Execution state virtualization.
832//
833// Instead of emitting code, nodes that manipulate the state can record their
834// manipulation in an object called the Trace. The Trace object can record a
835// current position offset, an optional backtrack code location on the top of
836// the virtualized backtrack stack and some register changes. When a node is
837// to be emitted it can flush the Trace or update it. Flushing the Trace
838// will emit code to bring the actual state into line with the virtual state.
839// Avoiding flushing the state can postpone some work (e.g. updates of capture
840// registers). Postponing work can save time when executing the regular
841// expression since it may be found that the work never has to be done as a
842// failure to match can occur. In addition it is much faster to jump to a
843// known backtrack code location than it is to pop an unknown backtrack
844// location from the stack and jump there.
845//
846// The virtual state found in the Trace affects code generation. For example
847// the virtual state contains the difference between the actual current
848// position and the virtual current position, and matching code needs to use
849// this offset to attempt a match in the correct location of the input
850// string. Therefore code generated for a non-trivial trace is specialized
851// to that trace. The code generator therefore has the ability to generate
852// code for each node several times. In order to limit the size of the
853// generated code there is an arbitrary limit on how many specialized sets of
854// code may be generated for a given node. If the limit is reached, the
855// trace is flushed and a generic version of the code for a node is emitted.
856// This is subsequently used for that node. The code emitted for non-generic
857// trace is not recorded in the node and so it cannot currently be reused in
858// the event that code generation is requested for an identical trace.
859
860
861void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
862 UNREACHABLE();
863}
864
865
866void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
867 text->AddElement(TextElement::Atom(this), zone);
868}
869
870
871void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
872 text->AddElement(TextElement::CharClass(this), zone);
873}
874
875
876void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
877 for (int i = 0; i < elements()->length(); i++)
878 text->AddElement(elements()->at(i), zone);
879}
880
881
882TextElement TextElement::Atom(RegExpAtom* atom) {
883 return TextElement(ATOM, atom);
884}
885
886
887TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
888 return TextElement(CHAR_CLASS, char_class);
889}
890
891
892int TextElement::length() const {
893 switch (text_type()) {
894 case ATOM:
895 return atom()->length();
896
897 case CHAR_CLASS:
898 return 1;
899 }
900 UNREACHABLE();
901 return 0;
902}
903
904
905DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
906 if (table_ == NULL) {
907 table_ = new(zone()) DispatchTable(zone());
908 DispatchTableConstructor cons(table_, ignore_case, zone());
909 cons.BuildTable(this);
910 }
911 return table_;
912}
913
914
915class FrequencyCollator {
916 public:
917 FrequencyCollator() : total_samples_(0) {
918 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
919 frequencies_[i] = CharacterFrequency(i);
920 }
921 }
922
923 void CountCharacter(int character) {
924 int index = (character & RegExpMacroAssembler::kTableMask);
925 frequencies_[index].Increment();
926 total_samples_++;
927 }
928
929 // Does not measure in percent, but rather per-128 (the table size from the
930 // regexp macro assembler).
931 int Frequency(int in_character) {
932 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
933 if (total_samples_ < 1) return 1; // Division by zero.
934 int freq_in_per128 =
935 (frequencies_[in_character].counter() * 128) / total_samples_;
936 return freq_in_per128;
937 }
938
939 private:
940 class CharacterFrequency {
941 public:
942 CharacterFrequency() : counter_(0), character_(-1) { }
943 explicit CharacterFrequency(int character)
944 : counter_(0), character_(character) { }
945
946 void Increment() { counter_++; }
947 int counter() { return counter_; }
948 int character() { return character_; }
949
950 private:
951 int counter_;
952 int character_;
953 };
954
955
956 private:
957 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
958 int total_samples_;
959};
960
961
962class RegExpCompiler {
963 public:
964 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch109988c2016-05-18 11:27:45 +0100965 JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch014dc512016-03-22 12:00:34 +0000966
967 int AllocateRegister() {
968 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
969 reg_exp_too_big_ = true;
970 return next_register_;
971 }
972 return next_register_++;
973 }
974
Ben Murdoch109988c2016-05-18 11:27:45 +0100975 // Lookarounds to match lone surrogates for unicode character class matches
976 // are never nested. We can therefore reuse registers.
977 int UnicodeLookaroundStackRegister() {
978 if (unicode_lookaround_stack_register_ == kNoRegister) {
979 unicode_lookaround_stack_register_ = AllocateRegister();
980 }
981 return unicode_lookaround_stack_register_;
982 }
983
984 int UnicodeLookaroundPositionRegister() {
985 if (unicode_lookaround_position_register_ == kNoRegister) {
986 unicode_lookaround_position_register_ = AllocateRegister();
987 }
988 return unicode_lookaround_position_register_;
989 }
990
Ben Murdoch014dc512016-03-22 12:00:34 +0000991 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
992 RegExpNode* start,
993 int capture_count,
994 Handle<String> pattern);
995
996 inline void AddWork(RegExpNode* node) {
997 if (!node->on_work_list() && !node->label()->is_bound()) {
998 node->set_on_work_list(true);
999 work_list_->Add(node);
1000 }
1001 }
1002
1003 static const int kImplementationOffset = 0;
1004 static const int kNumberOfRegistersOffset = 0;
1005 static const int kCodeOffset = 1;
1006
1007 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1008 EndNode* accept() { return accept_; }
1009
1010 static const int kMaxRecursion = 100;
1011 inline int recursion_depth() { return recursion_depth_; }
1012 inline void IncrementRecursionDepth() { recursion_depth_++; }
1013 inline void DecrementRecursionDepth() { recursion_depth_--; }
1014
1015 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1016
Ben Murdoch109988c2016-05-18 11:27:45 +01001017 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1018 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch014dc512016-03-22 12:00:34 +00001019 inline bool one_byte() { return one_byte_; }
1020 inline bool optimize() { return optimize_; }
1021 inline void set_optimize(bool value) { optimize_ = value; }
1022 inline bool limiting_recursion() { return limiting_recursion_; }
1023 inline void set_limiting_recursion(bool value) {
1024 limiting_recursion_ = value;
1025 }
1026 bool read_backward() { return read_backward_; }
1027 void set_read_backward(bool value) { read_backward_ = value; }
1028 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1029
1030 int current_expansion_factor() { return current_expansion_factor_; }
1031 void set_current_expansion_factor(int value) {
1032 current_expansion_factor_ = value;
1033 }
1034
1035 Isolate* isolate() const { return isolate_; }
1036 Zone* zone() const { return zone_; }
1037
1038 static const int kNoRegister = -1;
1039
1040 private:
1041 EndNode* accept_;
1042 int next_register_;
Ben Murdoch109988c2016-05-18 11:27:45 +01001043 int unicode_lookaround_stack_register_;
1044 int unicode_lookaround_position_register_;
Ben Murdoch014dc512016-03-22 12:00:34 +00001045 List<RegExpNode*>* work_list_;
1046 int recursion_depth_;
1047 RegExpMacroAssembler* macro_assembler_;
Ben Murdoch109988c2016-05-18 11:27:45 +01001048 JSRegExp::Flags flags_;
Ben Murdoch014dc512016-03-22 12:00:34 +00001049 bool one_byte_;
1050 bool reg_exp_too_big_;
1051 bool limiting_recursion_;
1052 bool optimize_;
1053 bool read_backward_;
1054 int current_expansion_factor_;
1055 FrequencyCollator frequency_collator_;
1056 Isolate* isolate_;
1057 Zone* zone_;
1058};
1059
1060
1061class RecursionCheck {
1062 public:
1063 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1064 compiler->IncrementRecursionDepth();
1065 }
1066 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1067 private:
1068 RegExpCompiler* compiler_;
1069};
1070
1071
1072static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1073 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1074}
1075
1076
1077// Attempts to compile the regexp using an Irregexp code generator. Returns
1078// a fixed array or a null handle depending on whether it succeeded.
1079RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch109988c2016-05-18 11:27:45 +01001080 JSRegExp::Flags flags, bool one_byte)
Ben Murdoch014dc512016-03-22 12:00:34 +00001081 : next_register_(2 * (capture_count + 1)),
Ben Murdoch109988c2016-05-18 11:27:45 +01001082 unicode_lookaround_stack_register_(kNoRegister),
1083 unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch014dc512016-03-22 12:00:34 +00001084 work_list_(NULL),
1085 recursion_depth_(0),
Ben Murdoch109988c2016-05-18 11:27:45 +01001086 flags_(flags),
Ben Murdoch014dc512016-03-22 12:00:34 +00001087 one_byte_(one_byte),
1088 reg_exp_too_big_(false),
1089 limiting_recursion_(false),
1090 optimize_(FLAG_regexp_optimization),
1091 read_backward_(false),
1092 current_expansion_factor_(1),
1093 frequency_collator_(),
1094 isolate_(isolate),
1095 zone_(zone) {
1096 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1097 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1098}
1099
1100
1101RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1102 RegExpMacroAssembler* macro_assembler,
1103 RegExpNode* start,
1104 int capture_count,
1105 Handle<String> pattern) {
1106 Heap* heap = pattern->GetHeap();
1107
1108#ifdef DEBUG
1109 if (FLAG_trace_regexp_assembler)
1110 macro_assembler_ =
1111 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1112 else
1113#endif
1114 macro_assembler_ = macro_assembler;
1115
1116 List <RegExpNode*> work_list(0);
1117 work_list_ = &work_list;
1118 Label fail;
1119 macro_assembler_->PushBacktrack(&fail);
1120 Trace new_trace;
1121 start->Emit(this, &new_trace);
1122 macro_assembler_->Bind(&fail);
1123 macro_assembler_->Fail();
1124 while (!work_list.is_empty()) {
1125 RegExpNode* node = work_list.RemoveLast();
1126 node->set_on_work_list(false);
1127 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1128 }
1129 if (reg_exp_too_big_) {
1130 macro_assembler_->AbortedCodeGeneration();
1131 return IrregexpRegExpTooBig(isolate_);
1132 }
1133
1134 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1135 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1136 work_list_ = NULL;
1137#ifdef ENABLE_DISASSEMBLER
1138 if (FLAG_print_code) {
1139 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1140 OFStream os(trace_scope.file());
1141 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1142 }
1143#endif
1144#ifdef DEBUG
1145 if (FLAG_trace_regexp_assembler) {
1146 delete macro_assembler_;
1147 }
1148#endif
1149 return RegExpEngine::CompilationResult(*code, next_register_);
1150}
1151
1152
1153bool Trace::DeferredAction::Mentions(int that) {
1154 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1155 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1156 return range.Contains(that);
1157 } else {
1158 return reg() == that;
1159 }
1160}
1161
1162
1163bool Trace::mentions_reg(int reg) {
1164 for (DeferredAction* action = actions_;
1165 action != NULL;
1166 action = action->next()) {
1167 if (action->Mentions(reg))
1168 return true;
1169 }
1170 return false;
1171}
1172
1173
1174bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1175 DCHECK_EQ(0, *cp_offset);
1176 for (DeferredAction* action = actions_;
1177 action != NULL;
1178 action = action->next()) {
1179 if (action->Mentions(reg)) {
1180 if (action->action_type() == ActionNode::STORE_POSITION) {
1181 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1182 return true;
1183 } else {
1184 return false;
1185 }
1186 }
1187 }
1188 return false;
1189}
1190
1191
1192int Trace::FindAffectedRegisters(OutSet* affected_registers,
1193 Zone* zone) {
1194 int max_register = RegExpCompiler::kNoRegister;
1195 for (DeferredAction* action = actions_;
1196 action != NULL;
1197 action = action->next()) {
1198 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1199 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1200 for (int i = range.from(); i <= range.to(); i++)
1201 affected_registers->Set(i, zone);
1202 if (range.to() > max_register) max_register = range.to();
1203 } else {
1204 affected_registers->Set(action->reg(), zone);
1205 if (action->reg() > max_register) max_register = action->reg();
1206 }
1207 }
1208 return max_register;
1209}
1210
1211
1212void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1213 int max_register,
1214 const OutSet& registers_to_pop,
1215 const OutSet& registers_to_clear) {
1216 for (int reg = max_register; reg >= 0; reg--) {
1217 if (registers_to_pop.Get(reg)) {
1218 assembler->PopRegister(reg);
1219 } else if (registers_to_clear.Get(reg)) {
1220 int clear_to = reg;
1221 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1222 reg--;
1223 }
1224 assembler->ClearRegisters(reg, clear_to);
1225 }
1226 }
1227}
1228
1229
1230void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1231 int max_register,
1232 const OutSet& affected_registers,
1233 OutSet* registers_to_pop,
1234 OutSet* registers_to_clear,
1235 Zone* zone) {
1236 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1237 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1238
1239 // Count pushes performed to force a stack limit check occasionally.
1240 int pushes = 0;
1241
1242 for (int reg = 0; reg <= max_register; reg++) {
1243 if (!affected_registers.Get(reg)) {
1244 continue;
1245 }
1246
1247 // The chronologically first deferred action in the trace
1248 // is used to infer the action needed to restore a register
1249 // to its previous state (or not, if it's safe to ignore it).
1250 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1251 DeferredActionUndoType undo_action = IGNORE;
1252
1253 int value = 0;
1254 bool absolute = false;
1255 bool clear = false;
1256 static const int kNoStore = kMinInt;
1257 int store_position = kNoStore;
1258 // This is a little tricky because we are scanning the actions in reverse
1259 // historical order (newest first).
1260 for (DeferredAction* action = actions_;
1261 action != NULL;
1262 action = action->next()) {
1263 if (action->Mentions(reg)) {
1264 switch (action->action_type()) {
1265 case ActionNode::SET_REGISTER: {
1266 Trace::DeferredSetRegister* psr =
1267 static_cast<Trace::DeferredSetRegister*>(action);
1268 if (!absolute) {
1269 value += psr->value();
1270 absolute = true;
1271 }
1272 // SET_REGISTER is currently only used for newly introduced loop
1273 // counters. They can have a significant previous value if they
1274 // occour in a loop. TODO(lrn): Propagate this information, so
1275 // we can set undo_action to IGNORE if we know there is no value to
1276 // restore.
1277 undo_action = RESTORE;
1278 DCHECK_EQ(store_position, kNoStore);
1279 DCHECK(!clear);
1280 break;
1281 }
1282 case ActionNode::INCREMENT_REGISTER:
1283 if (!absolute) {
1284 value++;
1285 }
1286 DCHECK_EQ(store_position, kNoStore);
1287 DCHECK(!clear);
1288 undo_action = RESTORE;
1289 break;
1290 case ActionNode::STORE_POSITION: {
1291 Trace::DeferredCapture* pc =
1292 static_cast<Trace::DeferredCapture*>(action);
1293 if (!clear && store_position == kNoStore) {
1294 store_position = pc->cp_offset();
1295 }
1296
1297 // For captures we know that stores and clears alternate.
1298 // Other register, are never cleared, and if the occur
1299 // inside a loop, they might be assigned more than once.
1300 if (reg <= 1) {
1301 // Registers zero and one, aka "capture zero", is
1302 // always set correctly if we succeed. There is no
1303 // need to undo a setting on backtrack, because we
1304 // will set it again or fail.
1305 undo_action = IGNORE;
1306 } else {
1307 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1308 }
1309 DCHECK(!absolute);
1310 DCHECK_EQ(value, 0);
1311 break;
1312 }
1313 case ActionNode::CLEAR_CAPTURES: {
1314 // Since we're scanning in reverse order, if we've already
1315 // set the position we have to ignore historically earlier
1316 // clearing operations.
1317 if (store_position == kNoStore) {
1318 clear = true;
1319 }
1320 undo_action = RESTORE;
1321 DCHECK(!absolute);
1322 DCHECK_EQ(value, 0);
1323 break;
1324 }
1325 default:
1326 UNREACHABLE();
1327 break;
1328 }
1329 }
1330 }
1331 // Prepare for the undo-action (e.g., push if it's going to be popped).
1332 if (undo_action == RESTORE) {
1333 pushes++;
1334 RegExpMacroAssembler::StackCheckFlag stack_check =
1335 RegExpMacroAssembler::kNoStackLimitCheck;
1336 if (pushes == push_limit) {
1337 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1338 pushes = 0;
1339 }
1340
1341 assembler->PushRegister(reg, stack_check);
1342 registers_to_pop->Set(reg, zone);
1343 } else if (undo_action == CLEAR) {
1344 registers_to_clear->Set(reg, zone);
1345 }
1346 // Perform the chronologically last action (or accumulated increment)
1347 // for the register.
1348 if (store_position != kNoStore) {
1349 assembler->WriteCurrentPositionToRegister(reg, store_position);
1350 } else if (clear) {
1351 assembler->ClearRegisters(reg, reg);
1352 } else if (absolute) {
1353 assembler->SetRegister(reg, value);
1354 } else if (value != 0) {
1355 assembler->AdvanceRegister(reg, value);
1356 }
1357 }
1358}
1359
1360
1361// This is called as we come into a loop choice node and some other tricky
1362// nodes. It normalizes the state of the code generator to ensure we can
1363// generate generic code.
1364void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1365 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1366
1367 DCHECK(!is_trivial());
1368
1369 if (actions_ == NULL && backtrack() == NULL) {
1370 // Here we just have some deferred cp advances to fix and we are back to
1371 // a normal situation. We may also have to forget some information gained
1372 // through a quick check that was already performed.
1373 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1374 // Create a new trivial state and generate the node with that.
1375 Trace new_state;
1376 successor->Emit(compiler, &new_state);
1377 return;
1378 }
1379
1380 // Generate deferred actions here along with code to undo them again.
1381 OutSet affected_registers;
1382
1383 if (backtrack() != NULL) {
1384 // Here we have a concrete backtrack location. These are set up by choice
1385 // nodes and so they indicate that we have a deferred save of the current
1386 // position which we may need to emit here.
1387 assembler->PushCurrentPosition();
1388 }
1389
1390 int max_register = FindAffectedRegisters(&affected_registers,
1391 compiler->zone());
1392 OutSet registers_to_pop;
1393 OutSet registers_to_clear;
1394 PerformDeferredActions(assembler,
1395 max_register,
1396 affected_registers,
1397 &registers_to_pop,
1398 &registers_to_clear,
1399 compiler->zone());
1400 if (cp_offset_ != 0) {
1401 assembler->AdvanceCurrentPosition(cp_offset_);
1402 }
1403
1404 // Create a new trivial state and generate the node with that.
1405 Label undo;
1406 assembler->PushBacktrack(&undo);
1407 if (successor->KeepRecursing(compiler)) {
1408 Trace new_state;
1409 successor->Emit(compiler, &new_state);
1410 } else {
1411 compiler->AddWork(successor);
1412 assembler->GoTo(successor->label());
1413 }
1414
1415 // On backtrack we need to restore state.
1416 assembler->Bind(&undo);
1417 RestoreAffectedRegisters(assembler,
1418 max_register,
1419 registers_to_pop,
1420 registers_to_clear);
1421 if (backtrack() == NULL) {
1422 assembler->Backtrack();
1423 } else {
1424 assembler->PopCurrentPosition();
1425 assembler->GoTo(backtrack());
1426 }
1427}
1428
1429
1430void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1431 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1432
1433 // Omit flushing the trace. We discard the entire stack frame anyway.
1434
1435 if (!label()->is_bound()) {
1436 // We are completely independent of the trace, since we ignore it,
1437 // so this code can be used as the generic version.
1438 assembler->Bind(label());
1439 }
1440
1441 // Throw away everything on the backtrack stack since the start
1442 // of the negative submatch and restore the character position.
1443 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1444 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1445 if (clear_capture_count_ > 0) {
1446 // Clear any captures that might have been performed during the success
1447 // of the body of the negative look-ahead.
1448 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1449 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1450 }
1451 // Now that we have unwound the stack we find at the top of the stack the
1452 // backtrack that the BeginSubmatch node got.
1453 assembler->Backtrack();
1454}
1455
1456
1457void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1458 if (!trace->is_trivial()) {
1459 trace->Flush(compiler, this);
1460 return;
1461 }
1462 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1463 if (!label()->is_bound()) {
1464 assembler->Bind(label());
1465 }
1466 switch (action_) {
1467 case ACCEPT:
1468 assembler->Succeed();
1469 return;
1470 case BACKTRACK:
1471 assembler->GoTo(trace->backtrack());
1472 return;
1473 case NEGATIVE_SUBMATCH_SUCCESS:
1474 // This case is handled in a different virtual method.
1475 UNREACHABLE();
1476 }
1477 UNIMPLEMENTED();
1478}
1479
1480
1481void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1482 if (guards_ == NULL)
1483 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1484 guards_->Add(guard, zone);
1485}
1486
1487
1488ActionNode* ActionNode::SetRegister(int reg,
1489 int val,
1490 RegExpNode* on_success) {
1491 ActionNode* result =
1492 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1493 result->data_.u_store_register.reg = reg;
1494 result->data_.u_store_register.value = val;
1495 return result;
1496}
1497
1498
1499ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1500 ActionNode* result =
1501 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1502 result->data_.u_increment_register.reg = reg;
1503 return result;
1504}
1505
1506
1507ActionNode* ActionNode::StorePosition(int reg,
1508 bool is_capture,
1509 RegExpNode* on_success) {
1510 ActionNode* result =
1511 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1512 result->data_.u_position_register.reg = reg;
1513 result->data_.u_position_register.is_capture = is_capture;
1514 return result;
1515}
1516
1517
1518ActionNode* ActionNode::ClearCaptures(Interval range,
1519 RegExpNode* on_success) {
1520 ActionNode* result =
1521 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1522 result->data_.u_clear_captures.range_from = range.from();
1523 result->data_.u_clear_captures.range_to = range.to();
1524 return result;
1525}
1526
1527
1528ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1529 int position_reg,
1530 RegExpNode* on_success) {
1531 ActionNode* result =
1532 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1533 result->data_.u_submatch.stack_pointer_register = stack_reg;
1534 result->data_.u_submatch.current_position_register = position_reg;
1535 return result;
1536}
1537
1538
1539ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1540 int position_reg,
1541 int clear_register_count,
1542 int clear_register_from,
1543 RegExpNode* on_success) {
1544 ActionNode* result =
1545 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1546 result->data_.u_submatch.stack_pointer_register = stack_reg;
1547 result->data_.u_submatch.current_position_register = position_reg;
1548 result->data_.u_submatch.clear_register_count = clear_register_count;
1549 result->data_.u_submatch.clear_register_from = clear_register_from;
1550 return result;
1551}
1552
1553
1554ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1555 int repetition_register,
1556 int repetition_limit,
1557 RegExpNode* on_success) {
1558 ActionNode* result =
1559 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1560 result->data_.u_empty_match_check.start_register = start_register;
1561 result->data_.u_empty_match_check.repetition_register = repetition_register;
1562 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1563 return result;
1564}
1565
1566
1567#define DEFINE_ACCEPT(Type) \
1568 void Type##Node::Accept(NodeVisitor* visitor) { \
1569 visitor->Visit##Type(this); \
1570 }
1571FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1572#undef DEFINE_ACCEPT
1573
1574
1575void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1576 visitor->VisitLoopChoice(this);
1577}
1578
1579
1580// -------------------------------------------------------------------
1581// Emit code.
1582
1583
1584void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1585 Guard* guard,
1586 Trace* trace) {
1587 switch (guard->op()) {
1588 case Guard::LT:
1589 DCHECK(!trace->mentions_reg(guard->reg()));
1590 macro_assembler->IfRegisterGE(guard->reg(),
1591 guard->value(),
1592 trace->backtrack());
1593 break;
1594 case Guard::GEQ:
1595 DCHECK(!trace->mentions_reg(guard->reg()));
1596 macro_assembler->IfRegisterLT(guard->reg(),
1597 guard->value(),
1598 trace->backtrack());
1599 break;
1600 }
1601}
1602
1603
1604// Returns the number of characters in the equivalence class, omitting those
1605// that cannot occur in the source string because it is Latin1.
1606static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1607 bool one_byte_subject,
1608 unibrow::uchar* letters) {
1609 int length =
1610 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1611 // Unibrow returns 0 or 1 for characters where case independence is
1612 // trivial.
1613 if (length == 0) {
1614 letters[0] = character;
1615 length = 1;
1616 }
1617
1618 if (one_byte_subject) {
1619 int new_length = 0;
1620 for (int i = 0; i < length; i++) {
1621 if (letters[i] <= String::kMaxOneByteCharCode) {
1622 letters[new_length++] = letters[i];
1623 }
1624 }
1625 length = new_length;
1626 }
1627
1628 return length;
1629}
1630
1631
1632static inline bool EmitSimpleCharacter(Isolate* isolate,
1633 RegExpCompiler* compiler,
1634 uc16 c,
1635 Label* on_failure,
1636 int cp_offset,
1637 bool check,
1638 bool preloaded) {
1639 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1640 bool bound_checked = false;
1641 if (!preloaded) {
1642 assembler->LoadCurrentCharacter(
1643 cp_offset,
1644 on_failure,
1645 check);
1646 bound_checked = true;
1647 }
1648 assembler->CheckNotCharacter(c, on_failure);
1649 return bound_checked;
1650}
1651
1652
1653// Only emits non-letters (things that don't have case). Only used for case
1654// independent matches.
1655static inline bool EmitAtomNonLetter(Isolate* isolate,
1656 RegExpCompiler* compiler,
1657 uc16 c,
1658 Label* on_failure,
1659 int cp_offset,
1660 bool check,
1661 bool preloaded) {
1662 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1663 bool one_byte = compiler->one_byte();
1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1665 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1666 if (length < 1) {
1667 // This can't match. Must be an one-byte subject and a non-one-byte
1668 // character. We do not need to do anything since the one-byte pass
1669 // already handled this.
1670 return false; // Bounds not checked.
1671 }
1672 bool checked = false;
1673 // We handle the length > 1 case in a later pass.
1674 if (length == 1) {
1675 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1676 // Can't match - see above.
1677 return false; // Bounds not checked.
1678 }
1679 if (!preloaded) {
1680 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1681 checked = check;
1682 }
1683 macro_assembler->CheckNotCharacter(c, on_failure);
1684 }
1685 return checked;
1686}
1687
1688
1689static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1690 bool one_byte, uc16 c1, uc16 c2,
1691 Label* on_failure) {
1692 uc16 char_mask;
1693 if (one_byte) {
1694 char_mask = String::kMaxOneByteCharCode;
1695 } else {
1696 char_mask = String::kMaxUtf16CodeUnit;
1697 }
1698 uc16 exor = c1 ^ c2;
1699 // Check whether exor has only one bit set.
1700 if (((exor - 1) & exor) == 0) {
1701 // If c1 and c2 differ only by one bit.
1702 // Ecma262UnCanonicalize always gives the highest number last.
1703 DCHECK(c2 > c1);
1704 uc16 mask = char_mask ^ exor;
1705 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1706 return true;
1707 }
1708 DCHECK(c2 > c1);
1709 uc16 diff = c2 - c1;
1710 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1711 // If the characters differ by 2^n but don't differ by one bit then
1712 // subtract the difference from the found character, then do the or
1713 // trick. We avoid the theoretical case where negative numbers are
1714 // involved in order to simplify code generation.
1715 uc16 mask = char_mask ^ diff;
1716 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1717 diff,
1718 mask,
1719 on_failure);
1720 return true;
1721 }
1722 return false;
1723}
1724
1725
1726typedef bool EmitCharacterFunction(Isolate* isolate,
1727 RegExpCompiler* compiler,
1728 uc16 c,
1729 Label* on_failure,
1730 int cp_offset,
1731 bool check,
1732 bool preloaded);
1733
1734// Only emits letters (things that have case). Only used for case independent
1735// matches.
1736static inline bool EmitAtomLetter(Isolate* isolate,
1737 RegExpCompiler* compiler,
1738 uc16 c,
1739 Label* on_failure,
1740 int cp_offset,
1741 bool check,
1742 bool preloaded) {
1743 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1744 bool one_byte = compiler->one_byte();
1745 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1746 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1747 if (length <= 1) return false;
1748 // We may not need to check against the end of the input string
1749 // if this character lies before a character that matched.
1750 if (!preloaded) {
1751 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1752 }
1753 Label ok;
1754 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1755 switch (length) {
1756 case 2: {
1757 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1758 chars[1], on_failure)) {
1759 } else {
1760 macro_assembler->CheckCharacter(chars[0], &ok);
1761 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1762 macro_assembler->Bind(&ok);
1763 }
1764 break;
1765 }
1766 case 4:
1767 macro_assembler->CheckCharacter(chars[3], &ok);
1768 // Fall through!
1769 case 3:
1770 macro_assembler->CheckCharacter(chars[0], &ok);
1771 macro_assembler->CheckCharacter(chars[1], &ok);
1772 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1773 macro_assembler->Bind(&ok);
1774 break;
1775 default:
1776 UNREACHABLE();
1777 break;
1778 }
1779 return true;
1780}
1781
1782
1783static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1784 int border,
1785 Label* fall_through,
1786 Label* above_or_equal,
1787 Label* below) {
1788 if (below != fall_through) {
1789 masm->CheckCharacterLT(border, below);
1790 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1791 } else {
1792 masm->CheckCharacterGT(border - 1, above_or_equal);
1793 }
1794}
1795
1796
1797static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1798 int first,
1799 int last,
1800 Label* fall_through,
1801 Label* in_range,
1802 Label* out_of_range) {
1803 if (in_range == fall_through) {
1804 if (first == last) {
1805 masm->CheckNotCharacter(first, out_of_range);
1806 } else {
1807 masm->CheckCharacterNotInRange(first, last, out_of_range);
1808 }
1809 } else {
1810 if (first == last) {
1811 masm->CheckCharacter(first, in_range);
1812 } else {
1813 masm->CheckCharacterInRange(first, last, in_range);
1814 }
1815 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1816 }
1817}
1818
1819
1820// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1821// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1822static void EmitUseLookupTable(
1823 RegExpMacroAssembler* masm,
1824 ZoneList<int>* ranges,
1825 int start_index,
1826 int end_index,
1827 int min_char,
1828 Label* fall_through,
1829 Label* even_label,
1830 Label* odd_label) {
1831 static const int kSize = RegExpMacroAssembler::kTableSize;
1832 static const int kMask = RegExpMacroAssembler::kTableMask;
1833
1834 int base = (min_char & ~kMask);
1835 USE(base);
1836
1837 // Assert that everything is on one kTableSize page.
1838 for (int i = start_index; i <= end_index; i++) {
1839 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1840 }
1841 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1842
1843 char templ[kSize];
1844 Label* on_bit_set;
1845 Label* on_bit_clear;
1846 int bit;
1847 if (even_label == fall_through) {
1848 on_bit_set = odd_label;
1849 on_bit_clear = even_label;
1850 bit = 1;
1851 } else {
1852 on_bit_set = even_label;
1853 on_bit_clear = odd_label;
1854 bit = 0;
1855 }
1856 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1857 templ[i] = bit;
1858 }
1859 int j = 0;
1860 bit ^= 1;
1861 for (int i = start_index; i < end_index; i++) {
1862 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1863 templ[j] = bit;
1864 }
1865 bit ^= 1;
1866 }
1867 for (int i = j; i < kSize; i++) {
1868 templ[i] = bit;
1869 }
1870 Factory* factory = masm->isolate()->factory();
1871 // TODO(erikcorry): Cache these.
1872 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1873 for (int i = 0; i < kSize; i++) {
1874 ba->set(i, templ[i]);
1875 }
1876 masm->CheckBitInTable(ba, on_bit_set);
1877 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1878}
1879
1880
1881static void CutOutRange(RegExpMacroAssembler* masm,
1882 ZoneList<int>* ranges,
1883 int start_index,
1884 int end_index,
1885 int cut_index,
1886 Label* even_label,
1887 Label* odd_label) {
1888 bool odd = (((cut_index - start_index) & 1) == 1);
1889 Label* in_range_label = odd ? odd_label : even_label;
1890 Label dummy;
1891 EmitDoubleBoundaryTest(masm,
1892 ranges->at(cut_index),
1893 ranges->at(cut_index + 1) - 1,
1894 &dummy,
1895 in_range_label,
1896 &dummy);
1897 DCHECK(!dummy.is_linked());
1898 // Cut out the single range by rewriting the array. This creates a new
1899 // range that is a merger of the two ranges on either side of the one we
1900 // are cutting out. The oddity of the labels is preserved.
1901 for (int j = cut_index; j > start_index; j--) {
1902 ranges->at(j) = ranges->at(j - 1);
1903 }
1904 for (int j = cut_index + 1; j < end_index; j++) {
1905 ranges->at(j) = ranges->at(j + 1);
1906 }
1907}
1908
1909
1910// Unicode case. Split the search space into kSize spaces that are handled
1911// with recursion.
1912static void SplitSearchSpace(ZoneList<int>* ranges,
1913 int start_index,
1914 int end_index,
1915 int* new_start_index,
1916 int* new_end_index,
1917 int* border) {
1918 static const int kSize = RegExpMacroAssembler::kTableSize;
1919 static const int kMask = RegExpMacroAssembler::kTableMask;
1920
1921 int first = ranges->at(start_index);
1922 int last = ranges->at(end_index) - 1;
1923
1924 *new_start_index = start_index;
1925 *border = (ranges->at(start_index) & ~kMask) + kSize;
1926 while (*new_start_index < end_index) {
1927 if (ranges->at(*new_start_index) > *border) break;
1928 (*new_start_index)++;
1929 }
1930 // new_start_index is the index of the first edge that is beyond the
1931 // current kSize space.
1932
1933 // For very large search spaces we do a binary chop search of the non-Latin1
1934 // space instead of just going to the end of the current kSize space. The
1935 // heuristics are complicated a little by the fact that any 128-character
1936 // encoding space can be quickly tested with a table lookup, so we don't
1937 // wish to do binary chop search at a smaller granularity than that. A
1938 // 128-character space can take up a lot of space in the ranges array if,
1939 // for example, we only want to match every second character (eg. the lower
1940 // case characters on some Unicode pages).
1941 int binary_chop_index = (end_index + start_index) / 2;
1942 // The first test ensures that we get to the code that handles the Latin1
1943 // range with a single not-taken branch, speeding up this important
1944 // character range (even non-Latin1 charset-based text has spaces and
1945 // punctuation).
1946 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1947 end_index - start_index > (*new_start_index - start_index) * 2 &&
1948 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1949 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1950 int scan_forward_for_section_border = binary_chop_index;;
1951 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1952
1953 while (scan_forward_for_section_border < end_index) {
1954 if (ranges->at(scan_forward_for_section_border) > new_border) {
1955 *new_start_index = scan_forward_for_section_border;
1956 *border = new_border;
1957 break;
1958 }
1959 scan_forward_for_section_border++;
1960 }
1961 }
1962
1963 DCHECK(*new_start_index > start_index);
1964 *new_end_index = *new_start_index - 1;
1965 if (ranges->at(*new_end_index) == *border) {
1966 (*new_end_index)--;
1967 }
1968 if (*border >= ranges->at(end_index)) {
1969 *border = ranges->at(end_index);
1970 *new_start_index = end_index; // Won't be used.
1971 *new_end_index = end_index - 1;
1972 }
1973}
1974
1975
1976// Gets a series of segment boundaries representing a character class. If the
1977// character is in the range between an even and an odd boundary (counting from
1978// start_index) then go to even_label, otherwise go to odd_label. We already
1979// know that the character is in the range of min_char to max_char inclusive.
1980// Either label can be NULL indicating backtracking. Either label can also be
1981// equal to the fall_through label.
Ben Murdoch109988c2016-05-18 11:27:45 +01001982static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1983 int start_index, int end_index, uc32 min_char,
1984 uc32 max_char, Label* fall_through,
1985 Label* even_label, Label* odd_label) {
1986 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1987 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1988
Ben Murdoch014dc512016-03-22 12:00:34 +00001989 int first = ranges->at(start_index);
1990 int last = ranges->at(end_index) - 1;
1991
1992 DCHECK_LT(min_char, first);
1993
1994 // Just need to test if the character is before or on-or-after
1995 // a particular character.
1996 if (start_index == end_index) {
1997 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1998 return;
1999 }
2000
2001 // Another almost trivial case: There is one interval in the middle that is
2002 // different from the end intervals.
2003 if (start_index + 1 == end_index) {
2004 EmitDoubleBoundaryTest(
2005 masm, first, last, fall_through, even_label, odd_label);
2006 return;
2007 }
2008
2009 // It's not worth using table lookup if there are very few intervals in the
2010 // character class.
2011 if (end_index - start_index <= 6) {
2012 // It is faster to test for individual characters, so we look for those
2013 // first, then try arbitrary ranges in the second round.
2014 static int kNoCutIndex = -1;
2015 int cut = kNoCutIndex;
2016 for (int i = start_index; i < end_index; i++) {
2017 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2018 cut = i;
2019 break;
2020 }
2021 }
2022 if (cut == kNoCutIndex) cut = start_index;
2023 CutOutRange(
2024 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2025 DCHECK_GE(end_index - start_index, 2);
2026 GenerateBranches(masm,
2027 ranges,
2028 start_index + 1,
2029 end_index - 1,
2030 min_char,
2031 max_char,
2032 fall_through,
2033 even_label,
2034 odd_label);
2035 return;
2036 }
2037
2038 // If there are a lot of intervals in the regexp, then we will use tables to
2039 // determine whether the character is inside or outside the character class.
2040 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2041
2042 if ((max_char >> kBits) == (min_char >> kBits)) {
2043 EmitUseLookupTable(masm,
2044 ranges,
2045 start_index,
2046 end_index,
2047 min_char,
2048 fall_through,
2049 even_label,
2050 odd_label);
2051 return;
2052 }
2053
2054 if ((min_char >> kBits) != (first >> kBits)) {
2055 masm->CheckCharacterLT(first, odd_label);
2056 GenerateBranches(masm,
2057 ranges,
2058 start_index + 1,
2059 end_index,
2060 first,
2061 max_char,
2062 fall_through,
2063 odd_label,
2064 even_label);
2065 return;
2066 }
2067
2068 int new_start_index = 0;
2069 int new_end_index = 0;
2070 int border = 0;
2071
2072 SplitSearchSpace(ranges,
2073 start_index,
2074 end_index,
2075 &new_start_index,
2076 &new_end_index,
2077 &border);
2078
2079 Label handle_rest;
2080 Label* above = &handle_rest;
2081 if (border == last + 1) {
2082 // We didn't find any section that started after the limit, so everything
2083 // above the border is one of the terminal labels.
2084 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2085 DCHECK(new_end_index == end_index - 1);
2086 }
2087
2088 DCHECK_LE(start_index, new_end_index);
2089 DCHECK_LE(new_start_index, end_index);
2090 DCHECK_LT(start_index, new_start_index);
2091 DCHECK_LT(new_end_index, end_index);
2092 DCHECK(new_end_index + 1 == new_start_index ||
2093 (new_end_index + 2 == new_start_index &&
2094 border == ranges->at(new_end_index + 1)));
2095 DCHECK_LT(min_char, border - 1);
2096 DCHECK_LT(border, max_char);
2097 DCHECK_LT(ranges->at(new_end_index), border);
2098 DCHECK(border < ranges->at(new_start_index) ||
2099 (border == ranges->at(new_start_index) &&
2100 new_start_index == end_index &&
2101 new_end_index == end_index - 1 &&
2102 border == last + 1));
2103 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2104
2105 masm->CheckCharacterGT(border - 1, above);
2106 Label dummy;
2107 GenerateBranches(masm,
2108 ranges,
2109 start_index,
2110 new_end_index,
2111 min_char,
2112 border - 1,
2113 &dummy,
2114 even_label,
2115 odd_label);
2116 if (handle_rest.is_linked()) {
2117 masm->Bind(&handle_rest);
2118 bool flip = (new_start_index & 1) != (start_index & 1);
2119 GenerateBranches(masm,
2120 ranges,
2121 new_start_index,
2122 end_index,
2123 border,
2124 max_char,
2125 &dummy,
2126 flip ? odd_label : even_label,
2127 flip ? even_label : odd_label);
2128 }
2129}
2130
2131
2132static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2133 RegExpCharacterClass* cc, bool one_byte,
2134 Label* on_failure, int cp_offset, bool check_offset,
2135 bool preloaded, Zone* zone) {
2136 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch109988c2016-05-18 11:27:45 +01002137 CharacterRange::Canonicalize(ranges);
Ben Murdoch014dc512016-03-22 12:00:34 +00002138
2139 int max_char;
2140 if (one_byte) {
2141 max_char = String::kMaxOneByteCharCode;
2142 } else {
2143 max_char = String::kMaxUtf16CodeUnit;
2144 }
2145
2146 int range_count = ranges->length();
2147
2148 int last_valid_range = range_count - 1;
2149 while (last_valid_range >= 0) {
2150 CharacterRange& range = ranges->at(last_valid_range);
2151 if (range.from() <= max_char) {
2152 break;
2153 }
2154 last_valid_range--;
2155 }
2156
2157 if (last_valid_range < 0) {
2158 if (!cc->is_negated()) {
2159 macro_assembler->GoTo(on_failure);
2160 }
2161 if (check_offset) {
2162 macro_assembler->CheckPosition(cp_offset, on_failure);
2163 }
2164 return;
2165 }
2166
2167 if (last_valid_range == 0 &&
2168 ranges->at(0).IsEverything(max_char)) {
2169 if (cc->is_negated()) {
2170 macro_assembler->GoTo(on_failure);
2171 } else {
2172 // This is a common case hit by non-anchored expressions.
2173 if (check_offset) {
2174 macro_assembler->CheckPosition(cp_offset, on_failure);
2175 }
2176 }
2177 return;
2178 }
Ben Murdoch014dc512016-03-22 12:00:34 +00002179
2180 if (!preloaded) {
2181 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2182 }
2183
2184 if (cc->is_standard(zone) &&
Ben Murdoch109988c2016-05-18 11:27:45 +01002185 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2186 on_failure)) {
Ben Murdoch014dc512016-03-22 12:00:34 +00002187 return;
2188 }
2189
2190
2191 // A new list with ascending entries. Each entry is a code unit
2192 // where there is a boundary between code units that are part of
2193 // the class and code units that are not. Normally we insert an
2194 // entry at zero which goes to the failure label, but if there
2195 // was already one there we fall through for success on that entry.
2196 // Subsequent entries have alternating meaning (success/failure).
2197 ZoneList<int>* range_boundaries =
2198 new(zone) ZoneList<int>(last_valid_range, zone);
2199
2200 bool zeroth_entry_is_failure = !cc->is_negated();
2201
2202 for (int i = 0; i <= last_valid_range; i++) {
2203 CharacterRange& range = ranges->at(i);
2204 if (range.from() == 0) {
2205 DCHECK_EQ(i, 0);
2206 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2207 } else {
2208 range_boundaries->Add(range.from(), zone);
2209 }
2210 range_boundaries->Add(range.to() + 1, zone);
2211 }
2212 int end_index = range_boundaries->length() - 1;
2213 if (range_boundaries->at(end_index) > max_char) {
2214 end_index--;
2215 }
2216
2217 Label fall_through;
2218 GenerateBranches(macro_assembler,
2219 range_boundaries,
2220 0, // start_index.
2221 end_index,
2222 0, // min_char.
2223 max_char,
2224 &fall_through,
2225 zeroth_entry_is_failure ? &fall_through : on_failure,
2226 zeroth_entry_is_failure ? on_failure : &fall_through);
2227 macro_assembler->Bind(&fall_through);
2228}
2229
2230
2231RegExpNode::~RegExpNode() {
2232}
2233
2234
2235RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2236 Trace* trace) {
2237 // If we are generating a greedy loop then don't stop and don't reuse code.
2238 if (trace->stop_node() != NULL) {
2239 return CONTINUE;
2240 }
2241
2242 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2243 if (trace->is_trivial()) {
2244 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2245 // If a generic version is already scheduled to be generated or we have
2246 // recursed too deeply then just generate a jump to that code.
2247 macro_assembler->GoTo(&label_);
2248 // This will queue it up for generation of a generic version if it hasn't
2249 // already been queued.
2250 compiler->AddWork(this);
2251 return DONE;
2252 }
2253 // Generate generic version of the node and bind the label for later use.
2254 macro_assembler->Bind(&label_);
2255 return CONTINUE;
2256 }
2257
2258 // We are being asked to make a non-generic version. Keep track of how many
2259 // non-generic versions we generate so as not to overdo it.
2260 trace_count_++;
2261 if (KeepRecursing(compiler) && compiler->optimize() &&
2262 trace_count_ < kMaxCopiesCodeGenerated) {
2263 return CONTINUE;
2264 }
2265
2266 // If we get here code has been generated for this node too many times or
2267 // recursion is too deep. Time to switch to a generic version. The code for
2268 // generic versions above can handle deep recursion properly.
2269 bool was_limiting = compiler->limiting_recursion();
2270 compiler->set_limiting_recursion(true);
2271 trace->Flush(compiler, this);
2272 compiler->set_limiting_recursion(was_limiting);
2273 return DONE;
2274}
2275
2276
2277bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2278 return !compiler->limiting_recursion() &&
2279 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2280}
2281
2282
2283int ActionNode::EatsAtLeast(int still_to_find,
2284 int budget,
2285 bool not_at_start) {
2286 if (budget <= 0) return 0;
2287 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2288 return on_success()->EatsAtLeast(still_to_find,
2289 budget - 1,
2290 not_at_start);
2291}
2292
2293
2294void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2295 BoyerMooreLookahead* bm, bool not_at_start) {
2296 if (action_type_ == BEGIN_SUBMATCH) {
2297 bm->SetRest(offset);
2298 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2299 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2300 }
2301 SaveBMInfo(bm, not_at_start, offset);
2302}
2303
2304
2305int AssertionNode::EatsAtLeast(int still_to_find,
2306 int budget,
2307 bool not_at_start) {
2308 if (budget <= 0) return 0;
2309 // If we know we are not at the start and we are asked "how many characters
2310 // will you match if you succeed?" then we can answer anything since false
2311 // implies false. So lets just return the max answer (still_to_find) since
2312 // that won't prevent us from preloading a lot of characters for the other
2313 // branches in the node graph.
2314 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2315 return on_success()->EatsAtLeast(still_to_find,
2316 budget - 1,
2317 not_at_start);
2318}
2319
2320
2321void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2322 BoyerMooreLookahead* bm, bool not_at_start) {
2323 // Match the behaviour of EatsAtLeast on this node.
2324 if (assertion_type() == AT_START && not_at_start) return;
2325 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2326 SaveBMInfo(bm, not_at_start, offset);
2327}
2328
2329
2330int BackReferenceNode::EatsAtLeast(int still_to_find,
2331 int budget,
2332 bool not_at_start) {
2333 if (read_backward()) return 0;
2334 if (budget <= 0) return 0;
2335 return on_success()->EatsAtLeast(still_to_find,
2336 budget - 1,
2337 not_at_start);
2338}
2339
2340
2341int TextNode::EatsAtLeast(int still_to_find,
2342 int budget,
2343 bool not_at_start) {
2344 if (read_backward()) return 0;
2345 int answer = Length();
2346 if (answer >= still_to_find) return answer;
2347 if (budget <= 0) return answer;
2348 // We are not at start after this node so we set the last argument to 'true'.
2349 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2350 budget - 1,
2351 true);
2352}
2353
2354
2355int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2356 bool not_at_start) {
2357 if (budget <= 0) return 0;
2358 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2359 // afterwards.
2360 RegExpNode* node = alternatives_->at(1).node();
2361 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2362}
2363
2364
2365void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2366 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2367 bool not_at_start) {
2368 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2369 // afterwards.
2370 RegExpNode* node = alternatives_->at(1).node();
2371 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2372}
2373
2374
2375int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2376 int budget,
2377 RegExpNode* ignore_this_node,
2378 bool not_at_start) {
2379 if (budget <= 0) return 0;
2380 int min = 100;
2381 int choice_count = alternatives_->length();
2382 budget = (budget - 1) / choice_count;
2383 for (int i = 0; i < choice_count; i++) {
2384 RegExpNode* node = alternatives_->at(i).node();
2385 if (node == ignore_this_node) continue;
2386 int node_eats_at_least =
2387 node->EatsAtLeast(still_to_find, budget, not_at_start);
2388 if (node_eats_at_least < min) min = node_eats_at_least;
2389 if (min == 0) return 0;
2390 }
2391 return min;
2392}
2393
2394
2395int LoopChoiceNode::EatsAtLeast(int still_to_find,
2396 int budget,
2397 bool not_at_start) {
2398 return EatsAtLeastHelper(still_to_find,
2399 budget - 1,
2400 loop_node_,
2401 not_at_start);
2402}
2403
2404
2405int ChoiceNode::EatsAtLeast(int still_to_find,
2406 int budget,
2407 bool not_at_start) {
2408 return EatsAtLeastHelper(still_to_find,
2409 budget,
2410 NULL,
2411 not_at_start);
2412}
2413
2414
2415// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2416static inline uint32_t SmearBitsRight(uint32_t v) {
2417 v |= v >> 1;
2418 v |= v >> 2;
2419 v |= v >> 4;
2420 v |= v >> 8;
2421 v |= v >> 16;
2422 return v;
2423}
2424
2425
2426bool QuickCheckDetails::Rationalize(bool asc) {
2427 bool found_useful_op = false;
2428 uint32_t char_mask;
2429 if (asc) {
2430 char_mask = String::kMaxOneByteCharCode;
2431 } else {
2432 char_mask = String::kMaxUtf16CodeUnit;
2433 }
2434 mask_ = 0;
2435 value_ = 0;
2436 int char_shift = 0;
2437 for (int i = 0; i < characters_; i++) {
2438 Position* pos = &positions_[i];
2439 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2440 found_useful_op = true;
2441 }
2442 mask_ |= (pos->mask & char_mask) << char_shift;
2443 value_ |= (pos->value & char_mask) << char_shift;
2444 char_shift += asc ? 8 : 16;
2445 }
2446 return found_useful_op;
2447}
2448
2449
2450bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2451 Trace* bounds_check_trace,
2452 Trace* trace,
2453 bool preload_has_checked_bounds,
2454 Label* on_possible_success,
2455 QuickCheckDetails* details,
2456 bool fall_through_on_failure) {
2457 if (details->characters() == 0) return false;
2458 GetQuickCheckDetails(
2459 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2460 if (details->cannot_match()) return false;
2461 if (!details->Rationalize(compiler->one_byte())) return false;
2462 DCHECK(details->characters() == 1 ||
2463 compiler->macro_assembler()->CanReadUnaligned());
2464 uint32_t mask = details->mask();
2465 uint32_t value = details->value();
2466
2467 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2468
2469 if (trace->characters_preloaded() != details->characters()) {
2470 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2471 // We are attempting to preload the minimum number of characters
2472 // any choice would eat, so if the bounds check fails, then none of the
2473 // choices can succeed, so we can just immediately backtrack, rather
2474 // than go to the next choice.
2475 assembler->LoadCurrentCharacter(trace->cp_offset(),
2476 bounds_check_trace->backtrack(),
2477 !preload_has_checked_bounds,
2478 details->characters());
2479 }
2480
2481
2482 bool need_mask = true;
2483
2484 if (details->characters() == 1) {
2485 // If number of characters preloaded is 1 then we used a byte or 16 bit
2486 // load so the value is already masked down.
2487 uint32_t char_mask;
2488 if (compiler->one_byte()) {
2489 char_mask = String::kMaxOneByteCharCode;
2490 } else {
2491 char_mask = String::kMaxUtf16CodeUnit;
2492 }
2493 if ((mask & char_mask) == char_mask) need_mask = false;
2494 mask &= char_mask;
2495 } else {
2496 // For 2-character preloads in one-byte mode or 1-character preloads in
2497 // two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch109988c2016-05-18 11:27:45 +01002498 static const uint32_t kTwoByteMask = 0xffff;
2499 static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch014dc512016-03-22 12:00:34 +00002500 if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch109988c2016-05-18 11:27:45 +01002501 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch014dc512016-03-22 12:00:34 +00002502 } else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch109988c2016-05-18 11:27:45 +01002503 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch014dc512016-03-22 12:00:34 +00002504 } else {
Ben Murdoch109988c2016-05-18 11:27:45 +01002505 if (mask == kFourByteMask) need_mask = false;
Ben Murdoch014dc512016-03-22 12:00:34 +00002506 }
2507 }
2508
2509 if (fall_through_on_failure) {
2510 if (need_mask) {
2511 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2512 } else {
2513 assembler->CheckCharacter(value, on_possible_success);
2514 }
2515 } else {
2516 if (need_mask) {
2517 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2518 } else {
2519 assembler->CheckNotCharacter(value, trace->backtrack());
2520 }
2521 }
2522 return true;
2523}
2524
2525
2526// Here is the meat of GetQuickCheckDetails (see also the comment on the
2527// super-class in the .h file).
2528//
2529// We iterate along the text object, building up for each character a
2530// mask and value that can be used to test for a quick failure to match.
2531// The masks and values for the positions will be combined into a single
2532// machine word for the current character width in order to be used in
2533// generating a quick check.
2534void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2535 RegExpCompiler* compiler,
2536 int characters_filled_in,
2537 bool not_at_start) {
2538 // Do not collect any quick check details if the text node reads backward,
2539 // since it reads in the opposite direction than we use for quick checks.
2540 if (read_backward()) return;
2541 Isolate* isolate = compiler->macro_assembler()->isolate();
2542 DCHECK(characters_filled_in < details->characters());
2543 int characters = details->characters();
2544 int char_mask;
2545 if (compiler->one_byte()) {
2546 char_mask = String::kMaxOneByteCharCode;
2547 } else {
2548 char_mask = String::kMaxUtf16CodeUnit;
2549 }
2550 for (int k = 0; k < elements()->length(); k++) {
2551 TextElement elm = elements()->at(k);
2552 if (elm.text_type() == TextElement::ATOM) {
2553 Vector<const uc16> quarks = elm.atom()->data();
2554 for (int i = 0; i < characters && i < quarks.length(); i++) {
2555 QuickCheckDetails::Position* pos =
2556 details->positions(characters_filled_in);
2557 uc16 c = quarks[i];
2558 if (compiler->ignore_case()) {
2559 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2560 int length = GetCaseIndependentLetters(isolate, c,
2561 compiler->one_byte(), chars);
2562 if (length == 0) {
2563 // This can happen because all case variants are non-Latin1, but we
2564 // know the input is Latin1.
2565 details->set_cannot_match();
2566 pos->determines_perfectly = false;
2567 return;
2568 }
2569 if (length == 1) {
2570 // This letter has no case equivalents, so it's nice and simple
2571 // and the mask-compare will determine definitely whether we have
2572 // a match at this character position.
2573 pos->mask = char_mask;
2574 pos->value = c;
2575 pos->determines_perfectly = true;
2576 } else {
2577 uint32_t common_bits = char_mask;
2578 uint32_t bits = chars[0];
2579 for (int j = 1; j < length; j++) {
2580 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2581 common_bits ^= differing_bits;
2582 bits &= common_bits;
2583 }
2584 // If length is 2 and common bits has only one zero in it then
2585 // our mask and compare instruction will determine definitely
2586 // whether we have a match at this character position. Otherwise
2587 // it can only be an approximate check.
2588 uint32_t one_zero = (common_bits | ~char_mask);
2589 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2590 pos->determines_perfectly = true;
2591 }
2592 pos->mask = common_bits;
2593 pos->value = bits;
2594 }
2595 } else {
2596 // Don't ignore case. Nice simple case where the mask-compare will
2597 // determine definitely whether we have a match at this character
2598 // position.
2599 if (c > char_mask) {
2600 details->set_cannot_match();
2601 pos->determines_perfectly = false;
2602 return;
2603 }
2604 pos->mask = char_mask;
2605 pos->value = c;
2606 pos->determines_perfectly = true;
2607 }
2608 characters_filled_in++;
2609 DCHECK(characters_filled_in <= details->characters());
2610 if (characters_filled_in == details->characters()) {
2611 return;
2612 }
2613 }
2614 } else {
2615 QuickCheckDetails::Position* pos =
2616 details->positions(characters_filled_in);
2617 RegExpCharacterClass* tree = elm.char_class();
2618 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2619 if (tree->is_negated()) {
2620 // A quick check uses multi-character mask and compare. There is no
2621 // useful way to incorporate a negative char class into this scheme
2622 // so we just conservatively create a mask and value that will always
2623 // succeed.
2624 pos->mask = 0;
2625 pos->value = 0;
2626 } else {
2627 int first_range = 0;
2628 while (ranges->at(first_range).from() > char_mask) {
2629 first_range++;
2630 if (first_range == ranges->length()) {
2631 details->set_cannot_match();
2632 pos->determines_perfectly = false;
2633 return;
2634 }
2635 }
2636 CharacterRange range = ranges->at(first_range);
2637 uc16 from = range.from();
2638 uc16 to = range.to();
2639 if (to > char_mask) {
2640 to = char_mask;
2641 }
2642 uint32_t differing_bits = (from ^ to);
2643 // A mask and compare is only perfect if the differing bits form a
2644 // number like 00011111 with one single block of trailing 1s.
2645 if ((differing_bits & (differing_bits + 1)) == 0 &&
2646 from + differing_bits == to) {
2647 pos->determines_perfectly = true;
2648 }
2649 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2650 uint32_t bits = (from & common_bits);
2651 for (int i = first_range + 1; i < ranges->length(); i++) {
2652 CharacterRange range = ranges->at(i);
2653 uc16 from = range.from();
2654 uc16 to = range.to();
2655 if (from > char_mask) continue;
2656 if (to > char_mask) to = char_mask;
2657 // Here we are combining more ranges into the mask and compare
2658 // value. With each new range the mask becomes more sparse and
2659 // so the chances of a false positive rise. A character class
2660 // with multiple ranges is assumed never to be equivalent to a
2661 // mask and compare operation.
2662 pos->determines_perfectly = false;
2663 uint32_t new_common_bits = (from ^ to);
2664 new_common_bits = ~SmearBitsRight(new_common_bits);
2665 common_bits &= new_common_bits;
2666 bits &= new_common_bits;
2667 uint32_t differing_bits = (from & common_bits) ^ bits;
2668 common_bits ^= differing_bits;
2669 bits &= common_bits;
2670 }
2671 pos->mask = common_bits;
2672 pos->value = bits;
2673 }
2674 characters_filled_in++;
2675 DCHECK(characters_filled_in <= details->characters());
2676 if (characters_filled_in == details->characters()) {
2677 return;
2678 }
2679 }
2680 }
2681 DCHECK(characters_filled_in != details->characters());
2682 if (!details->cannot_match()) {
2683 on_success()-> GetQuickCheckDetails(details,
2684 compiler,
2685 characters_filled_in,
2686 true);
2687 }
2688}
2689
2690
2691void QuickCheckDetails::Clear() {
2692 for (int i = 0; i < characters_; i++) {
2693 positions_[i].mask = 0;
2694 positions_[i].value = 0;
2695 positions_[i].determines_perfectly = false;
2696 }
2697 characters_ = 0;
2698}
2699
2700
2701void QuickCheckDetails::Advance(int by, bool one_byte) {
2702 if (by >= characters_ || by < 0) {
2703 DCHECK_IMPLIES(by < 0, characters_ == 0);
2704 Clear();
2705 return;
2706 }
2707 DCHECK_LE(characters_ - by, 4);
2708 DCHECK_LE(characters_, 4);
2709 for (int i = 0; i < characters_ - by; i++) {
2710 positions_[i] = positions_[by + i];
2711 }
2712 for (int i = characters_ - by; i < characters_; i++) {
2713 positions_[i].mask = 0;
2714 positions_[i].value = 0;
2715 positions_[i].determines_perfectly = false;
2716 }
2717 characters_ -= by;
2718 // We could change mask_ and value_ here but we would never advance unless
2719 // they had already been used in a check and they won't be used again because
2720 // it would gain us nothing. So there's no point.
2721}
2722
2723
2724void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2725 DCHECK(characters_ == other->characters_);
2726 if (other->cannot_match_) {
2727 return;
2728 }
2729 if (cannot_match_) {
2730 *this = *other;
2731 return;
2732 }
2733 for (int i = from_index; i < characters_; i++) {
2734 QuickCheckDetails::Position* pos = positions(i);
2735 QuickCheckDetails::Position* other_pos = other->positions(i);
2736 if (pos->mask != other_pos->mask ||
2737 pos->value != other_pos->value ||
2738 !other_pos->determines_perfectly) {
2739 // Our mask-compare operation will be approximate unless we have the
2740 // exact same operation on both sides of the alternation.
2741 pos->determines_perfectly = false;
2742 }
2743 pos->mask &= other_pos->mask;
2744 pos->value &= pos->mask;
2745 other_pos->value &= pos->mask;
2746 uc16 differing_bits = (pos->value ^ other_pos->value);
2747 pos->mask &= ~differing_bits;
2748 pos->value &= pos->mask;
2749 }
2750}
2751
2752
2753class VisitMarker {
2754 public:
2755 explicit VisitMarker(NodeInfo* info) : info_(info) {
2756 DCHECK(!info->visited);
2757 info->visited = true;
2758 }
2759 ~VisitMarker() {
2760 info_->visited = false;
2761 }
2762 private:
2763 NodeInfo* info_;
2764};
2765
2766
2767RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2768 if (info()->replacement_calculated) return replacement();
2769 if (depth < 0) return this;
2770 DCHECK(!info()->visited);
2771 VisitMarker marker(info());
2772 return FilterSuccessor(depth - 1, ignore_case);
2773}
2774
2775
2776RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2777 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2778 if (next == NULL) return set_replacement(NULL);
2779 on_success_ = next;
2780 return set_replacement(this);
2781}
2782
2783
2784// We need to check for the following characters: 0x39c 0x3bc 0x178.
2785static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2786 // TODO(dcarney): this could be a lot more efficient.
2787 return range.Contains(0x39c) ||
2788 range.Contains(0x3bc) || range.Contains(0x178);
2789}
2790
2791
2792static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2793 for (int i = 0; i < ranges->length(); i++) {
2794 // TODO(dcarney): this could be a lot more efficient.
2795 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2796 }
2797 return false;
2798}
2799
2800
2801RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2802 if (info()->replacement_calculated) return replacement();
2803 if (depth < 0) return this;
2804 DCHECK(!info()->visited);
2805 VisitMarker marker(info());
2806 int element_count = elements()->length();
2807 for (int i = 0; i < element_count; i++) {
2808 TextElement elm = elements()->at(i);
2809 if (elm.text_type() == TextElement::ATOM) {
2810 Vector<const uc16> quarks = elm.atom()->data();
2811 for (int j = 0; j < quarks.length(); j++) {
2812 uint16_t c = quarks[j];
2813 if (c <= String::kMaxOneByteCharCode) continue;
2814 if (!ignore_case) return set_replacement(NULL);
2815 // Here, we need to check for characters whose upper and lower cases
2816 // are outside the Latin-1 range.
2817 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2818 // Character is outside Latin-1 completely
2819 if (converted == 0) return set_replacement(NULL);
2820 // Convert quark to Latin-1 in place.
2821 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2822 copy[j] = converted;
2823 }
2824 } else {
2825 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2826 RegExpCharacterClass* cc = elm.char_class();
2827 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch109988c2016-05-18 11:27:45 +01002828 CharacterRange::Canonicalize(ranges);
Ben Murdoch014dc512016-03-22 12:00:34 +00002829 // Now they are in order so we only need to look at the first.
2830 int range_count = ranges->length();
2831 if (cc->is_negated()) {
2832 if (range_count != 0 &&
2833 ranges->at(0).from() == 0 &&
2834 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2835 // This will be handled in a later filter.
2836 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2837 return set_replacement(NULL);
2838 }
2839 } else {
2840 if (range_count == 0 ||
2841 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2842 // This will be handled in a later filter.
2843 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2844 return set_replacement(NULL);
2845 }
2846 }
2847 }
2848 }
2849 return FilterSuccessor(depth - 1, ignore_case);
2850}
2851
2852
2853RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2854 if (info()->replacement_calculated) return replacement();
2855 if (depth < 0) return this;
2856 if (info()->visited) return this;
2857 {
2858 VisitMarker marker(info());
2859
2860 RegExpNode* continue_replacement =
2861 continue_node_->FilterOneByte(depth - 1, ignore_case);
2862 // If we can't continue after the loop then there is no sense in doing the
2863 // loop.
2864 if (continue_replacement == NULL) return set_replacement(NULL);
2865 }
2866
2867 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2868}
2869
2870
2871RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2872 if (info()->replacement_calculated) return replacement();
2873 if (depth < 0) return this;
2874 if (info()->visited) return this;
2875 VisitMarker marker(info());
2876 int choice_count = alternatives_->length();
2877
2878 for (int i = 0; i < choice_count; i++) {
2879 GuardedAlternative alternative = alternatives_->at(i);
2880 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2881 set_replacement(this);
2882 return this;
2883 }
2884 }
2885
2886 int surviving = 0;
2887 RegExpNode* survivor = NULL;
2888 for (int i = 0; i < choice_count; i++) {
2889 GuardedAlternative alternative = alternatives_->at(i);
2890 RegExpNode* replacement =
2891 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2892 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2893 if (replacement != NULL) {
2894 alternatives_->at(i).set_node(replacement);
2895 surviving++;
2896 survivor = replacement;
2897 }
2898 }
2899 if (surviving < 2) return set_replacement(survivor);
2900
2901 set_replacement(this);
2902 if (surviving == choice_count) {
2903 return this;
2904 }
2905 // Only some of the nodes survived the filtering. We need to rebuild the
2906 // alternatives list.
2907 ZoneList<GuardedAlternative>* new_alternatives =
2908 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2909 for (int i = 0; i < choice_count; i++) {
2910 RegExpNode* replacement =
2911 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2912 if (replacement != NULL) {
2913 alternatives_->at(i).set_node(replacement);
2914 new_alternatives->Add(alternatives_->at(i), zone());
2915 }
2916 }
2917 alternatives_ = new_alternatives;
2918 return this;
2919}
2920
2921
2922RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2923 bool ignore_case) {
2924 if (info()->replacement_calculated) return replacement();
2925 if (depth < 0) return this;
2926 if (info()->visited) return this;
2927 VisitMarker marker(info());
2928 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2929 // afterwards.
2930 RegExpNode* node = alternatives_->at(1).node();
2931 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2932 if (replacement == NULL) return set_replacement(NULL);
2933 alternatives_->at(1).set_node(replacement);
2934
2935 RegExpNode* neg_node = alternatives_->at(0).node();
2936 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2937 // If the negative lookahead is always going to fail then
2938 // we don't need to check it.
2939 if (neg_replacement == NULL) return set_replacement(replacement);
2940 alternatives_->at(0).set_node(neg_replacement);
2941 return set_replacement(this);
2942}
2943
2944
2945void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2946 RegExpCompiler* compiler,
2947 int characters_filled_in,
2948 bool not_at_start) {
2949 if (body_can_be_zero_length_ || info()->visited) return;
2950 VisitMarker marker(info());
2951 return ChoiceNode::GetQuickCheckDetails(details,
2952 compiler,
2953 characters_filled_in,
2954 not_at_start);
2955}
2956
2957
2958void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2959 BoyerMooreLookahead* bm, bool not_at_start) {
2960 if (body_can_be_zero_length_ || budget <= 0) {
2961 bm->SetRest(offset);
2962 SaveBMInfo(bm, not_at_start, offset);
2963 return;
2964 }
2965 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2966 SaveBMInfo(bm, not_at_start, offset);
2967}
2968
2969
2970void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2971 RegExpCompiler* compiler,
2972 int characters_filled_in,
2973 bool not_at_start) {
2974 not_at_start = (not_at_start || not_at_start_);
2975 int choice_count = alternatives_->length();
2976 DCHECK(choice_count > 0);
2977 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2978 compiler,
2979 characters_filled_in,
2980 not_at_start);
2981 for (int i = 1; i < choice_count; i++) {
2982 QuickCheckDetails new_details(details->characters());
2983 RegExpNode* node = alternatives_->at(i).node();
2984 node->GetQuickCheckDetails(&new_details, compiler,
2985 characters_filled_in,
2986 not_at_start);
2987 // Here we merge the quick match details of the two branches.
2988 details->Merge(&new_details, characters_filled_in);
2989 }
2990}
2991
2992
2993// Check for [0-9A-Z_a-z].
2994static void EmitWordCheck(RegExpMacroAssembler* assembler,
2995 Label* word,
2996 Label* non_word,
2997 bool fall_through_on_word) {
2998 if (assembler->CheckSpecialCharacterClass(
2999 fall_through_on_word ? 'w' : 'W',
3000 fall_through_on_word ? non_word : word)) {
3001 // Optimized implementation available.
3002 return;
3003 }
3004 assembler->CheckCharacterGT('z', non_word);
3005 assembler->CheckCharacterLT('0', non_word);
3006 assembler->CheckCharacterGT('a' - 1, word);
3007 assembler->CheckCharacterLT('9' + 1, word);
3008 assembler->CheckCharacterLT('A', non_word);
3009 assembler->CheckCharacterLT('Z' + 1, word);
3010 if (fall_through_on_word) {
3011 assembler->CheckNotCharacter('_', non_word);
3012 } else {
3013 assembler->CheckCharacter('_', word);
3014 }
3015}
3016
3017
3018// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3019// that matches newline or the start of input).
3020static void EmitHat(RegExpCompiler* compiler,
3021 RegExpNode* on_success,
3022 Trace* trace) {
3023 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3024 // We will be loading the previous character into the current character
3025 // register.
3026 Trace new_trace(*trace);
3027 new_trace.InvalidateCurrentCharacter();
3028
3029 Label ok;
3030 if (new_trace.cp_offset() == 0) {
3031 // The start of input counts as a newline in this context, so skip to
3032 // ok if we are at the start.
3033 assembler->CheckAtStart(&ok);
3034 }
3035 // We already checked that we are not at the start of input so it must be
3036 // OK to load the previous character.
3037 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3038 new_trace.backtrack(),
3039 false);
3040 if (!assembler->CheckSpecialCharacterClass('n',
3041 new_trace.backtrack())) {
3042 // Newline means \n, \r, 0x2028 or 0x2029.
3043 if (!compiler->one_byte()) {
3044 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3045 }
3046 assembler->CheckCharacter('\n', &ok);
3047 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3048 }
3049 assembler->Bind(&ok);
3050 on_success->Emit(compiler, &new_trace);
3051}
3052
3053
3054// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3055void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3056 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3057 Isolate* isolate = assembler->isolate();
3058 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3059 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3060 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3061 if (lookahead == NULL) {
3062 int eats_at_least =
3063 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3064 kRecursionBudget,
3065 not_at_start));
3066 if (eats_at_least >= 1) {
3067 BoyerMooreLookahead* bm =
3068 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3069 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3070 if (bm->at(0)->is_non_word())
3071 next_is_word_character = Trace::FALSE_VALUE;
3072 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3073 }
3074 } else {
3075 if (lookahead->at(0)->is_non_word())
3076 next_is_word_character = Trace::FALSE_VALUE;
3077 if (lookahead->at(0)->is_word())
3078 next_is_word_character = Trace::TRUE_VALUE;
3079 }
3080 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3081 if (next_is_word_character == Trace::UNKNOWN) {
3082 Label before_non_word;
3083 Label before_word;
3084 if (trace->characters_preloaded() != 1) {
3085 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3086 }
3087 // Fall through on non-word.
3088 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3089 // Next character is not a word character.
3090 assembler->Bind(&before_non_word);
3091 Label ok;
3092 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3093 assembler->GoTo(&ok);
3094
3095 assembler->Bind(&before_word);
3096 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3097 assembler->Bind(&ok);
3098 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3099 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3100 } else {
3101 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3102 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3103 }
3104}
3105
3106
3107void AssertionNode::BacktrackIfPrevious(
3108 RegExpCompiler* compiler,
3109 Trace* trace,
3110 AssertionNode::IfPrevious backtrack_if_previous) {
3111 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3112 Trace new_trace(*trace);
3113 new_trace.InvalidateCurrentCharacter();
3114
3115 Label fall_through, dummy;
3116
3117 Label* non_word = backtrack_if_previous == kIsNonWord ?
3118 new_trace.backtrack() :
3119 &fall_through;
3120 Label* word = backtrack_if_previous == kIsNonWord ?
3121 &fall_through :
3122 new_trace.backtrack();
3123
3124 if (new_trace.cp_offset() == 0) {
3125 // The start of input counts as a non-word character, so the question is
3126 // decided if we are at the start.
3127 assembler->CheckAtStart(non_word);
3128 }
3129 // We already checked that we are not at the start of input so it must be
3130 // OK to load the previous character.
3131 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3132 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3133
3134 assembler->Bind(&fall_through);
3135 on_success()->Emit(compiler, &new_trace);
3136}
3137
3138
3139void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3140 RegExpCompiler* compiler,
3141 int filled_in,
3142 bool not_at_start) {
3143 if (assertion_type_ == AT_START && not_at_start) {
3144 details->set_cannot_match();
3145 return;
3146 }
3147 return on_success()->GetQuickCheckDetails(details,
3148 compiler,
3149 filled_in,
3150 not_at_start);
3151}
3152
3153
3154void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3155 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3156 switch (assertion_type_) {
3157 case AT_END: {
3158 Label ok;
3159 assembler->CheckPosition(trace->cp_offset(), &ok);
3160 assembler->GoTo(trace->backtrack());
3161 assembler->Bind(&ok);
3162 break;
3163 }
3164 case AT_START: {
3165 if (trace->at_start() == Trace::FALSE_VALUE) {
3166 assembler->GoTo(trace->backtrack());
3167 return;
3168 }
3169 if (trace->at_start() == Trace::UNKNOWN) {
3170 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3171 Trace at_start_trace = *trace;
3172 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3173 on_success()->Emit(compiler, &at_start_trace);
3174 return;
3175 }
3176 }
3177 break;
3178 case AFTER_NEWLINE:
3179 EmitHat(compiler, on_success(), trace);
3180 return;
3181 case AT_BOUNDARY:
3182 case AT_NON_BOUNDARY: {
3183 EmitBoundaryCheck(compiler, trace);
3184 return;
3185 }
3186 }
3187 on_success()->Emit(compiler, trace);
3188}
3189
3190
3191static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3192 if (quick_check == NULL) return false;
3193 if (offset >= quick_check->characters()) return false;
3194 return quick_check->positions(offset)->determines_perfectly;
3195}
3196
3197
3198static void UpdateBoundsCheck(int index, int* checked_up_to) {
3199 if (index > *checked_up_to) {
3200 *checked_up_to = index;
3201 }
3202}
3203
3204
3205// We call this repeatedly to generate code for each pass over the text node.
3206// The passes are in increasing order of difficulty because we hope one
3207// of the first passes will fail in which case we are saved the work of the
3208// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3209// we will check the '%' in the first pass, the case independent 'a' in the
3210// second pass and the character class in the last pass.
3211//
3212// The passes are done from right to left, so for example to test for /bar/
3213// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3214// and then a 'b' with offset 0. This means we can avoid the end-of-input
3215// bounds check most of the time. In the example we only need to check for
3216// end-of-input when loading the putative 'r'.
3217//
3218// A slight complication involves the fact that the first character may already
3219// be fetched into a register by the previous node. In this case we want to
3220// do the test for that character first. We do this in separate passes. The
3221// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3222// pass has been performed then subsequent passes will have true in
3223// first_element_checked to indicate that that character does not need to be
3224// checked again.
3225//
3226// In addition to all this we are passed a Trace, which can
3227// contain an AlternativeGeneration object. In this AlternativeGeneration
3228// object we can see details of any quick check that was already passed in
3229// order to get to the code we are now generating. The quick check can involve
3230// loading characters, which means we do not need to recheck the bounds
3231// up to the limit the quick check already checked. In addition the quick
3232// check can have involved a mask and compare operation which may simplify
3233// or obviate the need for further checks at some character positions.
3234void TextNode::TextEmitPass(RegExpCompiler* compiler,
3235 TextEmitPassType pass,
3236 bool preloaded,
3237 Trace* trace,
3238 bool first_element_checked,
3239 int* checked_up_to) {
3240 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3241 Isolate* isolate = assembler->isolate();
3242 bool one_byte = compiler->one_byte();
3243 Label* backtrack = trace->backtrack();
3244 QuickCheckDetails* quick_check = trace->quick_check_performed();
3245 int element_count = elements()->length();
3246 int backward_offset = read_backward() ? -Length() : 0;
3247 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3248 TextElement elm = elements()->at(i);
3249 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3250 if (elm.text_type() == TextElement::ATOM) {
3251 Vector<const uc16> quarks = elm.atom()->data();
3252 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3253 if (first_element_checked && i == 0 && j == 0) continue;
3254 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3255 EmitCharacterFunction* emit_function = NULL;
3256 switch (pass) {
3257 case NON_LATIN1_MATCH:
3258 DCHECK(one_byte);
3259 if (quarks[j] > String::kMaxOneByteCharCode) {
3260 assembler->GoTo(backtrack);
3261 return;
3262 }
3263 break;
3264 case NON_LETTER_CHARACTER_MATCH:
3265 emit_function = &EmitAtomNonLetter;
3266 break;
3267 case SIMPLE_CHARACTER_MATCH:
3268 emit_function = &EmitSimpleCharacter;
3269 break;
3270 case CASE_CHARACTER_MATCH:
3271 emit_function = &EmitAtomLetter;
3272 break;
3273 default:
3274 break;
3275 }
3276 if (emit_function != NULL) {
3277 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3278 bool bound_checked =
3279 emit_function(isolate, compiler, quarks[j], backtrack,
3280 cp_offset + j, bounds_check, preloaded);
3281 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3282 }
3283 }
3284 } else {
3285 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3286 if (pass == CHARACTER_CLASS_MATCH) {
3287 if (first_element_checked && i == 0) continue;
3288 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3289 RegExpCharacterClass* cc = elm.char_class();
3290 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3291 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3292 bounds_check, preloaded, zone());
3293 UpdateBoundsCheck(cp_offset, checked_up_to);
3294 }
3295 }
3296 }
3297}
3298
3299
3300int TextNode::Length() {
3301 TextElement elm = elements()->last();
3302 DCHECK(elm.cp_offset() >= 0);
3303 return elm.cp_offset() + elm.length();
3304}
3305
3306
3307bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3308 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3309 if (ignore_case) {
3310 return pass == SIMPLE_CHARACTER_MATCH;
3311 } else {
3312 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3313 }
3314}
3315
3316
Ben Murdoch109988c2016-05-18 11:27:45 +01003317TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3318 ZoneList<CharacterRange>* ranges,
3319 bool read_backward,
3320 RegExpNode* on_success) {
3321 DCHECK_NOT_NULL(ranges);
3322 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3323 elms->Add(
3324 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3325 zone);
3326 return new (zone) TextNode(elms, read_backward, on_success);
3327}
3328
3329
3330TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3331 CharacterRange trail,
3332 bool read_backward,
3333 RegExpNode* on_success) {
3334 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3335 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3336 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3337 elms->Add(TextElement::CharClass(
3338 new (zone) RegExpCharacterClass(lead_ranges, false)),
3339 zone);
3340 elms->Add(TextElement::CharClass(
3341 new (zone) RegExpCharacterClass(trail_ranges, false)),
3342 zone);
3343 return new (zone) TextNode(elms, read_backward, on_success);
3344}
3345
3346
Ben Murdoch014dc512016-03-22 12:00:34 +00003347// This generates the code to match a text node. A text node can contain
3348// straight character sequences (possibly to be matched in a case-independent
3349// way) and character classes. For efficiency we do not do this in a single
3350// pass from left to right. Instead we pass over the text node several times,
3351// emitting code for some character positions every time. See the comment on
3352// TextEmitPass for details.
3353void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3354 LimitResult limit_result = LimitVersions(compiler, trace);
3355 if (limit_result == DONE) return;
3356 DCHECK(limit_result == CONTINUE);
3357
3358 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3359 compiler->SetRegExpTooBig();
3360 return;
3361 }
3362
3363 if (compiler->one_byte()) {
3364 int dummy = 0;
3365 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3366 }
3367
3368 bool first_elt_done = false;
3369 int bound_checked_to = trace->cp_offset() - 1;
3370 bound_checked_to += trace->bound_checked_up_to();
3371
3372 // If a character is preloaded into the current character register then
3373 // check that now.
3374 if (trace->characters_preloaded() == 1) {
3375 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3376 if (!SkipPass(pass, compiler->ignore_case())) {
3377 TextEmitPass(compiler,
3378 static_cast<TextEmitPassType>(pass),
3379 true,
3380 trace,
3381 false,
3382 &bound_checked_to);
3383 }
3384 }
3385 first_elt_done = true;
3386 }
3387
3388 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3389 if (!SkipPass(pass, compiler->ignore_case())) {
3390 TextEmitPass(compiler,
3391 static_cast<TextEmitPassType>(pass),
3392 false,
3393 trace,
3394 first_elt_done,
3395 &bound_checked_to);
3396 }
3397 }
3398
3399 Trace successor_trace(*trace);
3400 // If we advance backward, we may end up at the start.
3401 successor_trace.AdvanceCurrentPositionInTrace(
3402 read_backward() ? -Length() : Length(), compiler);
3403 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3404 : Trace::FALSE_VALUE);
3405 RecursionCheck rc(compiler);
3406 on_success()->Emit(compiler, &successor_trace);
3407}
3408
3409
3410void Trace::InvalidateCurrentCharacter() {
3411 characters_preloaded_ = 0;
3412}
3413
3414
3415void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3416 // We don't have an instruction for shifting the current character register
3417 // down or for using a shifted value for anything so lets just forget that
3418 // we preloaded any characters into it.
3419 characters_preloaded_ = 0;
3420 // Adjust the offsets of the quick check performed information. This
3421 // information is used to find out what we already determined about the
3422 // characters by means of mask and compare.
3423 quick_check_performed_.Advance(by, compiler->one_byte());
3424 cp_offset_ += by;
3425 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3426 compiler->SetRegExpTooBig();
3427 cp_offset_ = 0;
3428 }
3429 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3430}
3431
3432
3433void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3434 int element_count = elements()->length();
3435 for (int i = 0; i < element_count; i++) {
3436 TextElement elm = elements()->at(i);
3437 if (elm.text_type() == TextElement::CHAR_CLASS) {
3438 RegExpCharacterClass* cc = elm.char_class();
3439 // None of the standard character classes is different in the case
3440 // independent case and it slows us down if we don't know that.
3441 if (cc->is_standard(zone())) continue;
3442 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch109988c2016-05-18 11:27:45 +01003443 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch014dc512016-03-22 12:00:34 +00003444 }
3445 }
3446}
3447
3448
3449int TextNode::GreedyLoopTextLength() { return Length(); }
3450
3451
3452RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3453 RegExpCompiler* compiler) {
3454 if (read_backward()) return NULL;
3455 if (elements()->length() != 1) return NULL;
3456 TextElement elm = elements()->at(0);
3457 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3458 RegExpCharacterClass* node = elm.char_class();
3459 ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch109988c2016-05-18 11:27:45 +01003460 CharacterRange::Canonicalize(ranges);
Ben Murdoch014dc512016-03-22 12:00:34 +00003461 if (node->is_negated()) {
3462 return ranges->length() == 0 ? on_success() : NULL;
3463 }
3464 if (ranges->length() != 1) return NULL;
3465 uint32_t max_char;
3466 if (compiler->one_byte()) {
3467 max_char = String::kMaxOneByteCharCode;
3468 } else {
3469 max_char = String::kMaxUtf16CodeUnit;
3470 }
3471 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3472}
3473
3474
3475// Finds the fixed match length of a sequence of nodes that goes from
3476// this alternative and back to this choice node. If there are variable
3477// length nodes or other complications in the way then return a sentinel
3478// value indicating that a greedy loop cannot be constructed.
3479int ChoiceNode::GreedyLoopTextLengthForAlternative(
3480 GuardedAlternative* alternative) {
3481 int length = 0;
3482 RegExpNode* node = alternative->node();
3483 // Later we will generate code for all these text nodes using recursion
3484 // so we have to limit the max number.
3485 int recursion_depth = 0;
3486 while (node != this) {
3487 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3488 return kNodeIsTooComplexForGreedyLoops;
3489 }
3490 int node_length = node->GreedyLoopTextLength();
3491 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3492 return kNodeIsTooComplexForGreedyLoops;
3493 }
3494 length += node_length;
3495 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3496 node = seq_node->on_success();
3497 }
3498 return read_backward() ? -length : length;
3499}
3500
3501
3502void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3503 DCHECK_NULL(loop_node_);
3504 AddAlternative(alt);
3505 loop_node_ = alt.node();
3506}
3507
3508
3509void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3510 DCHECK_NULL(continue_node_);
3511 AddAlternative(alt);
3512 continue_node_ = alt.node();
3513}
3514
3515
3516void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3517 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3518 if (trace->stop_node() == this) {
3519 // Back edge of greedy optimized loop node graph.
3520 int text_length =
3521 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3522 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3523 // Update the counter-based backtracking info on the stack. This is an
3524 // optimization for greedy loops (see below).
3525 DCHECK(trace->cp_offset() == text_length);
3526 macro_assembler->AdvanceCurrentPosition(text_length);
3527 macro_assembler->GoTo(trace->loop_label());
3528 return;
3529 }
3530 DCHECK_NULL(trace->stop_node());
3531 if (!trace->is_trivial()) {
3532 trace->Flush(compiler, this);
3533 return;
3534 }
3535 ChoiceNode::Emit(compiler, trace);
3536}
3537
3538
3539int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3540 int eats_at_least) {
3541 int preload_characters = Min(4, eats_at_least);
3542 if (compiler->macro_assembler()->CanReadUnaligned()) {
3543 bool one_byte = compiler->one_byte();
3544 if (one_byte) {
3545 if (preload_characters > 4) preload_characters = 4;
3546 // We can't preload 3 characters because there is no machine instruction
3547 // to do that. We can't just load 4 because we could be reading
3548 // beyond the end of the string, which could cause a memory fault.
3549 if (preload_characters == 3) preload_characters = 2;
3550 } else {
3551 if (preload_characters > 2) preload_characters = 2;
3552 }
3553 } else {
3554 if (preload_characters > 1) preload_characters = 1;
3555 }
3556 return preload_characters;
3557}
3558
3559
3560// This class is used when generating the alternatives in a choice node. It
3561// records the way the alternative is being code generated.
3562class AlternativeGeneration: public Malloced {
3563 public:
3564 AlternativeGeneration()
3565 : possible_success(),
3566 expects_preload(false),
3567 after(),
3568 quick_check_details() { }
3569 Label possible_success;
3570 bool expects_preload;
3571 Label after;
3572 QuickCheckDetails quick_check_details;
3573};
3574
3575
3576// Creates a list of AlternativeGenerations. If the list has a reasonable
3577// size then it is on the stack, otherwise the excess is on the heap.
3578class AlternativeGenerationList {
3579 public:
3580 AlternativeGenerationList(int count, Zone* zone)
3581 : alt_gens_(count, zone) {
3582 for (int i = 0; i < count && i < kAFew; i++) {
3583 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3584 }
3585 for (int i = kAFew; i < count; i++) {
3586 alt_gens_.Add(new AlternativeGeneration(), zone);
3587 }
3588 }
3589 ~AlternativeGenerationList() {
3590 for (int i = kAFew; i < alt_gens_.length(); i++) {
3591 delete alt_gens_[i];
3592 alt_gens_[i] = NULL;
3593 }
3594 }
3595
3596 AlternativeGeneration* at(int i) {
3597 return alt_gens_[i];
3598 }
3599
3600 private:
3601 static const int kAFew = 10;
3602 ZoneList<AlternativeGeneration*> alt_gens_;
3603 AlternativeGeneration a_few_alt_gens_[kAFew];
3604};
3605
3606
Ben Murdoch109988c2016-05-18 11:27:45 +01003607static const uc32 kRangeEndMarker = 0x110000;
3608
Ben Murdoch014dc512016-03-22 12:00:34 +00003609// The '2' variant is has inclusive from and exclusive to.
3610// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3611// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch109988c2016-05-18 11:27:45 +01003612static const int kSpaceRanges[] = {
3613 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3614 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3615 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch014dc512016-03-22 12:00:34 +00003616static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3617
3618static const int kWordRanges[] = {
Ben Murdoch109988c2016-05-18 11:27:45 +01003619 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch014dc512016-03-22 12:00:34 +00003620static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch109988c2016-05-18 11:27:45 +01003621static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch014dc512016-03-22 12:00:34 +00003622static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch109988c2016-05-18 11:27:45 +01003623static const int kSurrogateRanges[] = {
3624 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch014dc512016-03-22 12:00:34 +00003625static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch109988c2016-05-18 11:27:45 +01003626static const int kLineTerminatorRanges[] = {
3627 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch014dc512016-03-22 12:00:34 +00003628static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3629
Ben Murdoch014dc512016-03-22 12:00:34 +00003630void BoyerMoorePositionInfo::Set(int character) {
3631 SetInterval(Interval(character, character));
3632}
3633
3634
3635void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3636 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3637 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3638 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3639 surrogate_ =
3640 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3641 if (interval.to() - interval.from() >= kMapSize - 1) {
3642 if (map_count_ != kMapSize) {
3643 map_count_ = kMapSize;
3644 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3645 }
3646 return;
3647 }
3648 for (int i = interval.from(); i <= interval.to(); i++) {
3649 int mod_character = (i & kMask);
3650 if (!map_->at(mod_character)) {
3651 map_count_++;
3652 map_->at(mod_character) = true;
3653 }
3654 if (map_count_ == kMapSize) return;
3655 }
3656}
3657
3658
3659void BoyerMoorePositionInfo::SetAll() {
3660 s_ = w_ = d_ = kLatticeUnknown;
3661 if (map_count_ != kMapSize) {
3662 map_count_ = kMapSize;
3663 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3664 }
3665}
3666
3667
3668BoyerMooreLookahead::BoyerMooreLookahead(
3669 int length, RegExpCompiler* compiler, Zone* zone)
3670 : length_(length),
3671 compiler_(compiler) {
3672 if (compiler->one_byte()) {
3673 max_char_ = String::kMaxOneByteCharCode;
3674 } else {
3675 max_char_ = String::kMaxUtf16CodeUnit;
3676 }
3677 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3678 for (int i = 0; i < length; i++) {
3679 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3680 }
3681}
3682
3683
3684// Find the longest range of lookahead that has the fewest number of different
3685// characters that can occur at a given position. Since we are optimizing two
3686// different parameters at once this is a tradeoff.
3687bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3688 int biggest_points = 0;
3689 // If more than 32 characters out of 128 can occur it is unlikely that we can
3690 // be lucky enough to step forwards much of the time.
3691 const int kMaxMax = 32;
3692 for (int max_number_of_chars = 4;
3693 max_number_of_chars < kMaxMax;
3694 max_number_of_chars *= 2) {
3695 biggest_points =
3696 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3697 }
3698 if (biggest_points == 0) return false;
3699 return true;
3700}
3701
3702
3703// Find the highest-points range between 0 and length_ where the character
3704// information is not too vague. 'Too vague' means that there are more than
3705// max_number_of_chars that can occur at this position. Calculates the number
3706// of points as the product of width-of-the-range and
3707// probability-of-finding-one-of-the-characters, where the probability is
3708// calculated using the frequency distribution of the sample subject string.
3709int BoyerMooreLookahead::FindBestInterval(
3710 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3711 int biggest_points = old_biggest_points;
3712 static const int kSize = RegExpMacroAssembler::kTableSize;
3713 for (int i = 0; i < length_; ) {
3714 while (i < length_ && Count(i) > max_number_of_chars) i++;
3715 if (i == length_) break;
3716 int remembered_from = i;
3717 bool union_map[kSize];
3718 for (int j = 0; j < kSize; j++) union_map[j] = false;
3719 while (i < length_ && Count(i) <= max_number_of_chars) {
3720 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3721 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3722 i++;
3723 }
3724 int frequency = 0;
3725 for (int j = 0; j < kSize; j++) {
3726 if (union_map[j]) {
3727 // Add 1 to the frequency to give a small per-character boost for
3728 // the cases where our sampling is not good enough and many
3729 // characters have a frequency of zero. This means the frequency
3730 // can theoretically be up to 2*kSize though we treat it mostly as
3731 // a fraction of kSize.
3732 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3733 }
3734 }
3735 // We use the probability of skipping times the distance we are skipping to
3736 // judge the effectiveness of this. Actually we have a cut-off: By
3737 // dividing by 2 we switch off the skipping if the probability of skipping
3738 // is less than 50%. This is because the multibyte mask-and-compare
3739 // skipping in quickcheck is more likely to do well on this case.
3740 bool in_quickcheck_range =
3741 ((i - remembered_from < 4) ||
3742 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3743 // Called 'probability' but it is only a rough estimate and can actually
3744 // be outside the 0-kSize range.
3745 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3746 int points = (i - remembered_from) * probability;
3747 if (points > biggest_points) {
3748 *from = remembered_from;
3749 *to = i - 1;
3750 biggest_points = points;
3751 }
3752 }
3753 return biggest_points;
3754}
3755
3756
3757// Take all the characters that will not prevent a successful match if they
3758// occur in the subject string in the range between min_lookahead and
3759// max_lookahead (inclusive) measured from the current position. If the
3760// character at max_lookahead offset is not one of these characters, then we
3761// can safely skip forwards by the number of characters in the range.
3762int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3763 int max_lookahead,
3764 Handle<ByteArray> boolean_skip_table) {
3765 const int kSize = RegExpMacroAssembler::kTableSize;
3766
3767 const int kSkipArrayEntry = 0;
3768 const int kDontSkipArrayEntry = 1;
3769
3770 for (int i = 0; i < kSize; i++) {
3771 boolean_skip_table->set(i, kSkipArrayEntry);
3772 }
3773 int skip = max_lookahead + 1 - min_lookahead;
3774
3775 for (int i = max_lookahead; i >= min_lookahead; i--) {
3776 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3777 for (int j = 0; j < kSize; j++) {
3778 if (map->at(j)) {
3779 boolean_skip_table->set(j, kDontSkipArrayEntry);
3780 }
3781 }
3782 }
3783
3784 return skip;
3785}
3786
3787
3788// See comment above on the implementation of GetSkipTable.
3789void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3790 const int kSize = RegExpMacroAssembler::kTableSize;
3791
3792 int min_lookahead = 0;
3793 int max_lookahead = 0;
3794
3795 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3796
3797 bool found_single_character = false;
3798 int single_character = 0;
3799 for (int i = max_lookahead; i >= min_lookahead; i--) {
3800 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3801 if (map->map_count() > 1 ||
3802 (found_single_character && map->map_count() != 0)) {
3803 found_single_character = false;
3804 break;
3805 }
3806 for (int j = 0; j < kSize; j++) {
3807 if (map->at(j)) {
3808 found_single_character = true;
3809 single_character = j;
3810 break;
3811 }
3812 }
3813 }
3814
3815 int lookahead_width = max_lookahead + 1 - min_lookahead;
3816
3817 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3818 // The mask-compare can probably handle this better.
3819 return;
3820 }
3821
3822 if (found_single_character) {
3823 Label cont, again;
3824 masm->Bind(&again);
3825 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3826 if (max_char_ > kSize) {
3827 masm->CheckCharacterAfterAnd(single_character,
3828 RegExpMacroAssembler::kTableMask,
3829 &cont);
3830 } else {
3831 masm->CheckCharacter(single_character, &cont);
3832 }
3833 masm->AdvanceCurrentPosition(lookahead_width);
3834 masm->GoTo(&again);
3835 masm->Bind(&cont);
3836 return;
3837 }
3838
3839 Factory* factory = masm->isolate()->factory();
3840 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3841 int skip_distance = GetSkipTable(
3842 min_lookahead, max_lookahead, boolean_skip_table);
3843 DCHECK(skip_distance != 0);
3844
3845 Label cont, again;
3846 masm->Bind(&again);
3847 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3848 masm->CheckBitInTable(boolean_skip_table, &cont);
3849 masm->AdvanceCurrentPosition(skip_distance);
3850 masm->GoTo(&again);
3851 masm->Bind(&cont);
3852}
3853
3854
3855/* Code generation for choice nodes.
3856 *
3857 * We generate quick checks that do a mask and compare to eliminate a
3858 * choice. If the quick check succeeds then it jumps to the continuation to
3859 * do slow checks and check subsequent nodes. If it fails (the common case)
3860 * it falls through to the next choice.
3861 *
3862 * Here is the desired flow graph. Nodes directly below each other imply
3863 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3864 * 3 doesn't have a quick check so we have to call the slow check.
3865 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3866 * regexp continuation is generated directly after the Sn node, up to the
3867 * next GoTo if we decide to reuse some already generated code. Some
3868 * nodes expect preload_characters to be preloaded into the current
3869 * character register. R nodes do this preloading. Vertices are marked
3870 * F for failures and S for success (possible success in the case of quick
3871 * nodes). L, V, < and > are used as arrow heads.
3872 *
3873 * ----------> R
3874 * |
3875 * V
3876 * Q1 -----> S1
3877 * | S /
3878 * F| /
3879 * | F/
3880 * | /
3881 * | R
3882 * | /
3883 * V L
3884 * Q2 -----> S2
3885 * | S /
3886 * F| /
3887 * | F/
3888 * | /
3889 * | R
3890 * | /
3891 * V L
3892 * S3
3893 * |
3894 * F|
3895 * |
3896 * R
3897 * |
3898 * backtrack V
3899 * <----------Q4
3900 * \ F |
3901 * \ |S
3902 * \ F V
3903 * \-----S4
3904 *
3905 * For greedy loops we push the current position, then generate the code that
3906 * eats the input specially in EmitGreedyLoop. The other choice (the
3907 * continuation) is generated by the normal code in EmitChoices, and steps back
3908 * in the input to the starting position when it fails to match. The loop code
3909 * looks like this (U is the unwind code that steps back in the greedy loop).
3910 *
3911 * _____
3912 * / \
3913 * V |
3914 * ----------> S1 |
3915 * /| |
3916 * / |S |
3917 * F/ \_____/
3918 * /
3919 * |<-----
3920 * | \
3921 * V |S
3922 * Q2 ---> U----->backtrack
3923 * | F /
3924 * S| /
3925 * V F /
3926 * S2--/
3927 */
3928
3929GreedyLoopState::GreedyLoopState(bool not_at_start) {
3930 counter_backtrack_trace_.set_backtrack(&label_);
3931 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3932}
3933
3934
3935void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3936#ifdef DEBUG
3937 int choice_count = alternatives_->length();
3938 for (int i = 0; i < choice_count - 1; i++) {
3939 GuardedAlternative alternative = alternatives_->at(i);
3940 ZoneList<Guard*>* guards = alternative.guards();
3941 int guard_count = (guards == NULL) ? 0 : guards->length();
3942 for (int j = 0; j < guard_count; j++) {
3943 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3944 }
3945 }
3946#endif
3947}
3948
3949
3950void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3951 Trace* current_trace,
3952 PreloadState* state) {
3953 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3954 // Save some time by looking at most one machine word ahead.
3955 state->eats_at_least_ =
3956 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3957 current_trace->at_start() == Trace::FALSE_VALUE);
3958 }
3959 state->preload_characters_ =
3960 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3961
3962 state->preload_is_current_ =
3963 (current_trace->characters_preloaded() == state->preload_characters_);
3964 state->preload_has_checked_bounds_ = state->preload_is_current_;
3965}
3966
3967
3968void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3969 int choice_count = alternatives_->length();
3970
Ben Murdoch109988c2016-05-18 11:27:45 +01003971 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3972 alternatives_->at(0).node()->Emit(compiler, trace);
3973 return;
3974 }
3975
Ben Murdoch014dc512016-03-22 12:00:34 +00003976 AssertGuardsMentionRegisters(trace);
3977
3978 LimitResult limit_result = LimitVersions(compiler, trace);
3979 if (limit_result == DONE) return;
3980 DCHECK(limit_result == CONTINUE);
3981
3982 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3983 // other choice nodes we only flush if we are out of code size budget.
3984 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3985 trace->Flush(compiler, this);
3986 return;
3987 }
3988
3989 RecursionCheck rc(compiler);
3990
3991 PreloadState preload;
3992 preload.init();
3993 GreedyLoopState greedy_loop_state(not_at_start());
3994
3995 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3996 AlternativeGenerationList alt_gens(choice_count, zone());
3997
3998 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3999 trace = EmitGreedyLoop(compiler,
4000 trace,
4001 &alt_gens,
4002 &preload,
4003 &greedy_loop_state,
4004 text_length);
4005 } else {
4006 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4007 // match the traces produced pre-cleanup.
4008 Label second_choice;
4009 compiler->macro_assembler()->Bind(&second_choice);
4010
4011 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4012
4013 EmitChoices(compiler,
4014 &alt_gens,
4015 0,
4016 trace,
4017 &preload);
4018 }
4019
4020 // At this point we need to generate slow checks for the alternatives where
4021 // the quick check was inlined. We can recognize these because the associated
4022 // label was bound.
4023 int new_flush_budget = trace->flush_budget() / choice_count;
4024 for (int i = 0; i < choice_count; i++) {
4025 AlternativeGeneration* alt_gen = alt_gens.at(i);
4026 Trace new_trace(*trace);
4027 // If there are actions to be flushed we have to limit how many times
4028 // they are flushed. Take the budget of the parent trace and distribute
4029 // it fairly amongst the children.
4030 if (new_trace.actions() != NULL) {
4031 new_trace.set_flush_budget(new_flush_budget);
4032 }
4033 bool next_expects_preload =
4034 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4035 EmitOutOfLineContinuation(compiler,
4036 &new_trace,
4037 alternatives_->at(i),
4038 alt_gen,
4039 preload.preload_characters_,
4040 next_expects_preload);
4041 }
4042}
4043
4044
4045Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4046 Trace* trace,
4047 AlternativeGenerationList* alt_gens,
4048 PreloadState* preload,
4049 GreedyLoopState* greedy_loop_state,
4050 int text_length) {
4051 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4052 // Here we have special handling for greedy loops containing only text nodes
4053 // and other simple nodes. These are handled by pushing the current
4054 // position on the stack and then incrementing the current position each
4055 // time around the switch. On backtrack we decrement the current position
4056 // and check it against the pushed value. This avoids pushing backtrack
4057 // information for each iteration of the loop, which could take up a lot of
4058 // space.
4059 DCHECK(trace->stop_node() == NULL);
4060 macro_assembler->PushCurrentPosition();
4061 Label greedy_match_failed;
4062 Trace greedy_match_trace;
4063 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4064 greedy_match_trace.set_backtrack(&greedy_match_failed);
4065 Label loop_label;
4066 macro_assembler->Bind(&loop_label);
4067 greedy_match_trace.set_stop_node(this);
4068 greedy_match_trace.set_loop_label(&loop_label);
4069 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4070 macro_assembler->Bind(&greedy_match_failed);
4071
4072 Label second_choice; // For use in greedy matches.
4073 macro_assembler->Bind(&second_choice);
4074
4075 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4076
4077 EmitChoices(compiler,
4078 alt_gens,
4079 1,
4080 new_trace,
4081 preload);
4082
4083 macro_assembler->Bind(greedy_loop_state->label());
4084 // If we have unwound to the bottom then backtrack.
4085 macro_assembler->CheckGreedyLoop(trace->backtrack());
4086 // Otherwise try the second priority at an earlier position.
4087 macro_assembler->AdvanceCurrentPosition(-text_length);
4088 macro_assembler->GoTo(&second_choice);
4089 return new_trace;
4090}
4091
4092int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4093 Trace* trace) {
4094 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4095 if (alternatives_->length() != 2) return eats_at_least;
4096
4097 GuardedAlternative alt1 = alternatives_->at(1);
4098 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4099 return eats_at_least;
4100 }
4101 RegExpNode* eats_anything_node = alt1.node();
4102 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4103 return eats_at_least;
4104 }
4105
4106 // Really we should be creating a new trace when we execute this function,
4107 // but there is no need, because the code it generates cannot backtrack, and
4108 // we always arrive here with a trivial trace (since it's the entry to a
4109 // loop. That also implies that there are no preloaded characters, which is
4110 // good, because it means we won't be violating any assumptions by
4111 // overwriting those characters with new load instructions.
4112 DCHECK(trace->is_trivial());
4113
4114 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4115 Isolate* isolate = macro_assembler->isolate();
4116 // At this point we know that we are at a non-greedy loop that will eat
4117 // any character one at a time. Any non-anchored regexp has such a
4118 // loop prepended to it in order to find where it starts. We look for
4119 // a pattern of the form ...abc... where we can look 6 characters ahead
4120 // and step forwards 3 if the character is not one of abc. Abc need
4121 // not be atoms, they can be any reasonably limited character class or
4122 // small alternation.
4123 BoyerMooreLookahead* bm = bm_info(false);
4124 if (bm == NULL) {
4125 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4126 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4127 kRecursionBudget,
4128 false));
4129 if (eats_at_least >= 1) {
4130 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4131 compiler,
4132 zone());
4133 GuardedAlternative alt0 = alternatives_->at(0);
4134 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4135 }
4136 }
4137 if (bm != NULL) {
4138 bm->EmitSkipInstructions(macro_assembler);
4139 }
4140 return eats_at_least;
4141}
4142
4143
4144void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4145 AlternativeGenerationList* alt_gens,
4146 int first_choice,
4147 Trace* trace,
4148 PreloadState* preload) {
4149 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4150 SetUpPreLoad(compiler, trace, preload);
4151
4152 // For now we just call all choices one after the other. The idea ultimately
4153 // is to use the Dispatch table to try only the relevant ones.
4154 int choice_count = alternatives_->length();
4155
4156 int new_flush_budget = trace->flush_budget() / choice_count;
4157
4158 for (int i = first_choice; i < choice_count; i++) {
4159 bool is_last = i == choice_count - 1;
4160 bool fall_through_on_failure = !is_last;
4161 GuardedAlternative alternative = alternatives_->at(i);
4162 AlternativeGeneration* alt_gen = alt_gens->at(i);
4163 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4164 ZoneList<Guard*>* guards = alternative.guards();
4165 int guard_count = (guards == NULL) ? 0 : guards->length();
4166 Trace new_trace(*trace);
4167 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4168 preload->preload_characters_ :
4169 0);
4170 if (preload->preload_has_checked_bounds_) {
4171 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4172 }
4173 new_trace.quick_check_performed()->Clear();
4174 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4175 if (!is_last) {
4176 new_trace.set_backtrack(&alt_gen->after);
4177 }
4178 alt_gen->expects_preload = preload->preload_is_current_;
4179 bool generate_full_check_inline = false;
4180 if (compiler->optimize() &&
4181 try_to_emit_quick_check_for_alternative(i == 0) &&
4182 alternative.node()->EmitQuickCheck(
4183 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4184 &alt_gen->possible_success, &alt_gen->quick_check_details,
4185 fall_through_on_failure)) {
4186 // Quick check was generated for this choice.
4187 preload->preload_is_current_ = true;
4188 preload->preload_has_checked_bounds_ = true;
4189 // If we generated the quick check to fall through on possible success,
4190 // we now need to generate the full check inline.
4191 if (!fall_through_on_failure) {
4192 macro_assembler->Bind(&alt_gen->possible_success);
4193 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4194 new_trace.set_characters_preloaded(preload->preload_characters_);
4195 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4196 generate_full_check_inline = true;
4197 }
4198 } else if (alt_gen->quick_check_details.cannot_match()) {
4199 if (!fall_through_on_failure) {
4200 macro_assembler->GoTo(trace->backtrack());
4201 }
4202 continue;
4203 } else {
4204 // No quick check was generated. Put the full code here.
4205 // If this is not the first choice then there could be slow checks from
4206 // previous cases that go here when they fail. There's no reason to
4207 // insist that they preload characters since the slow check we are about
4208 // to generate probably can't use it.
4209 if (i != first_choice) {
4210 alt_gen->expects_preload = false;
4211 new_trace.InvalidateCurrentCharacter();
4212 }
4213 generate_full_check_inline = true;
4214 }
4215 if (generate_full_check_inline) {
4216 if (new_trace.actions() != NULL) {
4217 new_trace.set_flush_budget(new_flush_budget);
4218 }
4219 for (int j = 0; j < guard_count; j++) {
4220 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4221 }
4222 alternative.node()->Emit(compiler, &new_trace);
4223 preload->preload_is_current_ = false;
4224 }
4225 macro_assembler->Bind(&alt_gen->after);
4226 }
4227}
4228
4229
4230void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4231 Trace* trace,
4232 GuardedAlternative alternative,
4233 AlternativeGeneration* alt_gen,
4234 int preload_characters,
4235 bool next_expects_preload) {
4236 if (!alt_gen->possible_success.is_linked()) return;
4237
4238 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4239 macro_assembler->Bind(&alt_gen->possible_success);
4240 Trace out_of_line_trace(*trace);
4241 out_of_line_trace.set_characters_preloaded(preload_characters);
4242 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4243 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4244 ZoneList<Guard*>* guards = alternative.guards();
4245 int guard_count = (guards == NULL) ? 0 : guards->length();
4246 if (next_expects_preload) {
4247 Label reload_current_char;
4248 out_of_line_trace.set_backtrack(&reload_current_char);
4249 for (int j = 0; j < guard_count; j++) {
4250 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4251 }
4252 alternative.node()->Emit(compiler, &out_of_line_trace);
4253 macro_assembler->Bind(&reload_current_char);
4254 // Reload the current character, since the next quick check expects that.
4255 // We don't need to check bounds here because we only get into this
4256 // code through a quick check which already did the checked load.
4257 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4258 NULL,
4259 false,
4260 preload_characters);
4261 macro_assembler->GoTo(&(alt_gen->after));
4262 } else {
4263 out_of_line_trace.set_backtrack(&(alt_gen->after));
4264 for (int j = 0; j < guard_count; j++) {
4265 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4266 }
4267 alternative.node()->Emit(compiler, &out_of_line_trace);
4268 }
4269}
4270
4271
4272void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4273 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4274 LimitResult limit_result = LimitVersions(compiler, trace);
4275 if (limit_result == DONE) return;
4276 DCHECK(limit_result == CONTINUE);
4277
4278 RecursionCheck rc(compiler);
4279
4280 switch (action_type_) {
4281 case STORE_POSITION: {
4282 Trace::DeferredCapture
4283 new_capture(data_.u_position_register.reg,
4284 data_.u_position_register.is_capture,
4285 trace);
4286 Trace new_trace = *trace;
4287 new_trace.add_action(&new_capture);
4288 on_success()->Emit(compiler, &new_trace);
4289 break;
4290 }
4291 case INCREMENT_REGISTER: {
4292 Trace::DeferredIncrementRegister
4293 new_increment(data_.u_increment_register.reg);
4294 Trace new_trace = *trace;
4295 new_trace.add_action(&new_increment);
4296 on_success()->Emit(compiler, &new_trace);
4297 break;
4298 }
4299 case SET_REGISTER: {
4300 Trace::DeferredSetRegister
4301 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4302 Trace new_trace = *trace;
4303 new_trace.add_action(&new_set);
4304 on_success()->Emit(compiler, &new_trace);
4305 break;
4306 }
4307 case CLEAR_CAPTURES: {
4308 Trace::DeferredClearCaptures
4309 new_capture(Interval(data_.u_clear_captures.range_from,
4310 data_.u_clear_captures.range_to));
4311 Trace new_trace = *trace;
4312 new_trace.add_action(&new_capture);
4313 on_success()->Emit(compiler, &new_trace);
4314 break;
4315 }
4316 case BEGIN_SUBMATCH:
4317 if (!trace->is_trivial()) {
4318 trace->Flush(compiler, this);
4319 } else {
4320 assembler->WriteCurrentPositionToRegister(
4321 data_.u_submatch.current_position_register, 0);
4322 assembler->WriteStackPointerToRegister(
4323 data_.u_submatch.stack_pointer_register);
4324 on_success()->Emit(compiler, trace);
4325 }
4326 break;
4327 case EMPTY_MATCH_CHECK: {
4328 int start_pos_reg = data_.u_empty_match_check.start_register;
4329 int stored_pos = 0;
4330 int rep_reg = data_.u_empty_match_check.repetition_register;
4331 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4332 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4333 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4334 // If we know we haven't advanced and there is no minimum we
4335 // can just backtrack immediately.
4336 assembler->GoTo(trace->backtrack());
4337 } else if (know_dist && stored_pos < trace->cp_offset()) {
4338 // If we know we've advanced we can generate the continuation
4339 // immediately.
4340 on_success()->Emit(compiler, trace);
4341 } else if (!trace->is_trivial()) {
4342 trace->Flush(compiler, this);
4343 } else {
4344 Label skip_empty_check;
4345 // If we have a minimum number of repetitions we check the current
4346 // number first and skip the empty check if it's not enough.
4347 if (has_minimum) {
4348 int limit = data_.u_empty_match_check.repetition_limit;
4349 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4350 }
4351 // If the match is empty we bail out, otherwise we fall through
4352 // to the on-success continuation.
4353 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4354 trace->backtrack());
4355 assembler->Bind(&skip_empty_check);
4356 on_success()->Emit(compiler, trace);
4357 }
4358 break;
4359 }
4360 case POSITIVE_SUBMATCH_SUCCESS: {
4361 if (!trace->is_trivial()) {
4362 trace->Flush(compiler, this);
4363 return;
4364 }
4365 assembler->ReadCurrentPositionFromRegister(
4366 data_.u_submatch.current_position_register);
4367 assembler->ReadStackPointerFromRegister(
4368 data_.u_submatch.stack_pointer_register);
4369 int clear_register_count = data_.u_submatch.clear_register_count;
4370 if (clear_register_count == 0) {
4371 on_success()->Emit(compiler, trace);
4372 return;
4373 }
4374 int clear_registers_from = data_.u_submatch.clear_register_from;
4375 Label clear_registers_backtrack;
4376 Trace new_trace = *trace;
4377 new_trace.set_backtrack(&clear_registers_backtrack);
4378 on_success()->Emit(compiler, &new_trace);
4379
4380 assembler->Bind(&clear_registers_backtrack);
4381 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4382 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4383
4384 DCHECK(trace->backtrack() == NULL);
4385 assembler->Backtrack();
4386 return;
4387 }
4388 default:
4389 UNREACHABLE();
4390 }
4391}
4392
4393
4394void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4395 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4396 if (!trace->is_trivial()) {
4397 trace->Flush(compiler, this);
4398 return;
4399 }
4400
4401 LimitResult limit_result = LimitVersions(compiler, trace);
4402 if (limit_result == DONE) return;
4403 DCHECK(limit_result == CONTINUE);
4404
4405 RecursionCheck rc(compiler);
4406
4407 DCHECK_EQ(start_reg_ + 1, end_reg_);
4408 if (compiler->ignore_case()) {
Ben Murdoch109988c2016-05-18 11:27:45 +01004409 assembler->CheckNotBackReferenceIgnoreCase(
4410 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch014dc512016-03-22 12:00:34 +00004411 } else {
4412 assembler->CheckNotBackReference(start_reg_, read_backward(),
4413 trace->backtrack());
4414 }
4415 // We are going to advance backward, so we may end up at the start.
4416 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch109988c2016-05-18 11:27:45 +01004417
4418 // Check that the back reference does not end inside a surrogate pair.
4419 if (compiler->unicode() && !compiler->one_byte()) {
4420 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4421 }
Ben Murdoch014dc512016-03-22 12:00:34 +00004422 on_success()->Emit(compiler, trace);
4423}
4424
4425
4426// -------------------------------------------------------------------
4427// Dot/dotty output
4428
4429
4430#ifdef DEBUG
4431
4432
4433class DotPrinter: public NodeVisitor {
4434 public:
4435 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4436 : os_(os),
4437 ignore_case_(ignore_case) {}
4438 void PrintNode(const char* label, RegExpNode* node);
4439 void Visit(RegExpNode* node);
4440 void PrintAttributes(RegExpNode* from);
4441 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4442#define DECLARE_VISIT(Type) \
4443 virtual void Visit##Type(Type##Node* that);
4444FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4445#undef DECLARE_VISIT
4446 private:
4447 std::ostream& os_;
4448 bool ignore_case_;
4449};
4450
4451
4452void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4453 os_ << "digraph G {\n graph [label=\"";
4454 for (int i = 0; label[i]; i++) {
4455 switch (label[i]) {
4456 case '\\':
4457 os_ << "\\\\";
4458 break;
4459 case '"':
4460 os_ << "\"";
4461 break;
4462 default:
4463 os_ << label[i];
4464 break;
4465 }
4466 }
4467 os_ << "\"];\n";
4468 Visit(node);
4469 os_ << "}" << std::endl;
4470}
4471
4472
4473void DotPrinter::Visit(RegExpNode* node) {
4474 if (node->info()->visited) return;
4475 node->info()->visited = true;
4476 node->Accept(this);
4477}
4478
4479
4480void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4481 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4482 Visit(on_failure);
4483}
4484
4485
4486class TableEntryBodyPrinter {
4487 public:
4488 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4489 : os_(os),
4490 choice_(choice) {}
4491 void Call(uc16 from, DispatchTable::Entry entry) {
4492 OutSet* out_set = entry.out_set();
4493 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4494 if (out_set->Get(i)) {
4495 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4496 << choice()->alternatives()->at(i).node() << ";\n";
4497 }
4498 }
4499 }
4500 private:
4501 ChoiceNode* choice() { return choice_; }
4502 std::ostream& os_;
4503 ChoiceNode* choice_;
4504};
4505
4506
4507class TableEntryHeaderPrinter {
4508 public:
4509 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4510 : first_(true),
4511 os_(os) {}
4512 void Call(uc16 from, DispatchTable::Entry entry) {
4513 if (first_) {
4514 first_ = false;
4515 } else {
4516 os_ << "|";
4517 }
4518 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4519 OutSet* out_set = entry.out_set();
4520 int priority = 0;
4521 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4522 if (out_set->Get(i)) {
4523 if (priority > 0) os_ << "|";
4524 os_ << "<s" << from << "o" << i << "> " << priority;
4525 priority++;
4526 }
4527 }
4528 os_ << "}}";
4529 }
4530
4531 private:
4532 bool first_;
4533 std::ostream& os_;
4534};
4535
4536
4537class AttributePrinter {
4538 public:
4539 explicit AttributePrinter(std::ostream& os) // NOLINT
4540 : os_(os),
4541 first_(true) {}
4542 void PrintSeparator() {
4543 if (first_) {
4544 first_ = false;
4545 } else {
4546 os_ << "|";
4547 }
4548 }
4549 void PrintBit(const char* name, bool value) {
4550 if (!value) return;
4551 PrintSeparator();
4552 os_ << "{" << name << "}";
4553 }
4554 void PrintPositive(const char* name, int value) {
4555 if (value < 0) return;
4556 PrintSeparator();
4557 os_ << "{" << name << "|" << value << "}";
4558 }
4559
4560 private:
4561 std::ostream& os_;
4562 bool first_;
4563};
4564
4565
4566void DotPrinter::PrintAttributes(RegExpNode* that) {
4567 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4568 << "margin=0.1, fontsize=10, label=\"{";
4569 AttributePrinter printer(os_);
4570 NodeInfo* info = that->info();
4571 printer.PrintBit("NI", info->follows_newline_interest);
4572 printer.PrintBit("WI", info->follows_word_interest);
4573 printer.PrintBit("SI", info->follows_start_interest);
4574 Label* label = that->label();
4575 if (label->is_bound())
4576 printer.PrintPositive("@", label->pos());
4577 os_ << "}\"];\n"
4578 << " a" << that << " -> n" << that
4579 << " [style=dashed, color=grey, arrowhead=none];\n";
4580}
4581
4582
4583static const bool kPrintDispatchTable = false;
4584void DotPrinter::VisitChoice(ChoiceNode* that) {
4585 if (kPrintDispatchTable) {
4586 os_ << " n" << that << " [shape=Mrecord, label=\"";
4587 TableEntryHeaderPrinter header_printer(os_);
4588 that->GetTable(ignore_case_)->ForEach(&header_printer);
4589 os_ << "\"]\n";
4590 PrintAttributes(that);
4591 TableEntryBodyPrinter body_printer(os_, that);
4592 that->GetTable(ignore_case_)->ForEach(&body_printer);
4593 } else {
4594 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4595 for (int i = 0; i < that->alternatives()->length(); i++) {
4596 GuardedAlternative alt = that->alternatives()->at(i);
4597 os_ << " n" << that << " -> n" << alt.node();
4598 }
4599 }
4600 for (int i = 0; i < that->alternatives()->length(); i++) {
4601 GuardedAlternative alt = that->alternatives()->at(i);
4602 alt.node()->Accept(this);
4603 }
4604}
4605
4606
4607void DotPrinter::VisitText(TextNode* that) {
4608 Zone* zone = that->zone();
4609 os_ << " n" << that << " [label=\"";
4610 for (int i = 0; i < that->elements()->length(); i++) {
4611 if (i > 0) os_ << " ";
4612 TextElement elm = that->elements()->at(i);
4613 switch (elm.text_type()) {
4614 case TextElement::ATOM: {
4615 Vector<const uc16> data = elm.atom()->data();
4616 for (int i = 0; i < data.length(); i++) {
4617 os_ << static_cast<char>(data[i]);
4618 }
4619 break;
4620 }
4621 case TextElement::CHAR_CLASS: {
4622 RegExpCharacterClass* node = elm.char_class();
4623 os_ << "[";
4624 if (node->is_negated()) os_ << "^";
4625 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4626 CharacterRange range = node->ranges(zone)->at(j);
4627 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4628 }
4629 os_ << "]";
4630 break;
4631 }
4632 default:
4633 UNREACHABLE();
4634 }
4635 }
4636 os_ << "\", shape=box, peripheries=2];\n";
4637 PrintAttributes(that);
4638 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4639 Visit(that->on_success());
4640}
4641
4642
4643void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4644 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4645 << that->end_register() << "\", shape=doubleoctagon];\n";
4646 PrintAttributes(that);
4647 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4648 Visit(that->on_success());
4649}
4650
4651
4652void DotPrinter::VisitEnd(EndNode* that) {
4653 os_ << " n" << that << " [style=bold, shape=point];\n";
4654 PrintAttributes(that);
4655}
4656
4657
4658void DotPrinter::VisitAssertion(AssertionNode* that) {
4659 os_ << " n" << that << " [";
4660 switch (that->assertion_type()) {
4661 case AssertionNode::AT_END:
4662 os_ << "label=\"$\", shape=septagon";
4663 break;
4664 case AssertionNode::AT_START:
4665 os_ << "label=\"^\", shape=septagon";
4666 break;
4667 case AssertionNode::AT_BOUNDARY:
4668 os_ << "label=\"\\b\", shape=septagon";
4669 break;
4670 case AssertionNode::AT_NON_BOUNDARY:
4671 os_ << "label=\"\\B\", shape=septagon";
4672 break;
4673 case AssertionNode::AFTER_NEWLINE:
4674 os_ << "label=\"(?<=\\n)\", shape=septagon";
4675 break;
4676 }
4677 os_ << "];\n";
4678 PrintAttributes(that);
4679 RegExpNode* successor = that->on_success();
4680 os_ << " n" << that << " -> n" << successor << ";\n";
4681 Visit(successor);
4682}
4683
4684
4685void DotPrinter::VisitAction(ActionNode* that) {
4686 os_ << " n" << that << " [";
4687 switch (that->action_type_) {
4688 case ActionNode::SET_REGISTER:
4689 os_ << "label=\"$" << that->data_.u_store_register.reg
4690 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4691 break;
4692 case ActionNode::INCREMENT_REGISTER:
4693 os_ << "label=\"$" << that->data_.u_increment_register.reg
4694 << "++\", shape=octagon";
4695 break;
4696 case ActionNode::STORE_POSITION:
4697 os_ << "label=\"$" << that->data_.u_position_register.reg
4698 << ":=$pos\", shape=octagon";
4699 break;
4700 case ActionNode::BEGIN_SUBMATCH:
4701 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4702 << ":=$pos,begin\", shape=septagon";
4703 break;
4704 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4705 os_ << "label=\"escape\", shape=septagon";
4706 break;
4707 case ActionNode::EMPTY_MATCH_CHECK:
4708 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4709 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4710 << "<" << that->data_.u_empty_match_check.repetition_limit
4711 << "?\", shape=septagon";
4712 break;
4713 case ActionNode::CLEAR_CAPTURES: {
4714 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4715 << " to $" << that->data_.u_clear_captures.range_to
4716 << "\", shape=septagon";
4717 break;
4718 }
4719 }
4720 os_ << "];\n";
4721 PrintAttributes(that);
4722 RegExpNode* successor = that->on_success();
4723 os_ << " n" << that << " -> n" << successor << ";\n";
4724 Visit(successor);
4725}
4726
4727
4728class DispatchTableDumper {
4729 public:
4730 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4731 void Call(uc16 key, DispatchTable::Entry entry);
4732 private:
4733 std::ostream& os_;
4734};
4735
4736
4737void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4738 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4739 OutSet* set = entry.out_set();
4740 bool first = true;
4741 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4742 if (set->Get(i)) {
4743 if (first) {
4744 first = false;
4745 } else {
4746 os_ << ", ";
4747 }
4748 os_ << i;
4749 }
4750 }
4751 os_ << "}\n";
4752}
4753
4754
4755void DispatchTable::Dump() {
4756 OFStream os(stderr);
4757 DispatchTableDumper dumper(os);
4758 tree()->ForEach(&dumper);
4759}
4760
4761
4762void RegExpEngine::DotPrint(const char* label,
4763 RegExpNode* node,
4764 bool ignore_case) {
4765 OFStream os(stdout);
4766 DotPrinter printer(os, ignore_case);
4767 printer.PrintNode(label, node);
4768}
4769
4770
4771#endif // DEBUG
4772
4773
4774// -------------------------------------------------------------------
4775// Tree to graph conversion
4776
4777RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4778 RegExpNode* on_success) {
4779 ZoneList<TextElement>* elms =
4780 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4781 elms->Add(TextElement::Atom(this), compiler->zone());
4782 return new (compiler->zone())
4783 TextNode(elms, compiler->read_backward(), on_success);
4784}
4785
4786
4787RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4788 RegExpNode* on_success) {
4789 return new (compiler->zone())
4790 TextNode(elements(), compiler->read_backward(), on_success);
4791}
4792
4793
4794static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4795 const int* special_class,
4796 int length) {
Ben Murdoch109988c2016-05-18 11:27:45 +01004797 length--; // Remove final marker.
4798 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch014dc512016-03-22 12:00:34 +00004799 DCHECK(ranges->length() != 0);
4800 DCHECK(length != 0);
4801 DCHECK(special_class[0] != 0);
4802 if (ranges->length() != (length >> 1) + 1) {
4803 return false;
4804 }
4805 CharacterRange range = ranges->at(0);
4806 if (range.from() != 0) {
4807 return false;
4808 }
4809 for (int i = 0; i < length; i += 2) {
4810 if (special_class[i] != (range.to() + 1)) {
4811 return false;
4812 }
4813 range = ranges->at((i >> 1) + 1);
4814 if (special_class[i+1] != range.from()) {
4815 return false;
4816 }
4817 }
Ben Murdoch109988c2016-05-18 11:27:45 +01004818 if (range.to() != String::kMaxCodePoint) {
Ben Murdoch014dc512016-03-22 12:00:34 +00004819 return false;
4820 }
4821 return true;
4822}
4823
4824
4825static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4826 const int* special_class,
4827 int length) {
Ben Murdoch109988c2016-05-18 11:27:45 +01004828 length--; // Remove final marker.
4829 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch014dc512016-03-22 12:00:34 +00004830 if (ranges->length() * 2 != length) {
4831 return false;
4832 }
4833 for (int i = 0; i < length; i += 2) {
4834 CharacterRange range = ranges->at(i >> 1);
4835 if (range.from() != special_class[i] ||
4836 range.to() != special_class[i + 1] - 1) {
4837 return false;
4838 }
4839 }
4840 return true;
4841}
4842
4843
4844bool RegExpCharacterClass::is_standard(Zone* zone) {
4845 // TODO(lrn): Remove need for this function, by not throwing away information
4846 // along the way.
4847 if (is_negated_) {
4848 return false;
4849 }
4850 if (set_.is_standard()) {
4851 return true;
4852 }
4853 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4854 set_.set_standard_set_type('s');
4855 return true;
4856 }
4857 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4858 set_.set_standard_set_type('S');
4859 return true;
4860 }
4861 if (CompareInverseRanges(set_.ranges(zone),
4862 kLineTerminatorRanges,
4863 kLineTerminatorRangeCount)) {
4864 set_.set_standard_set_type('.');
4865 return true;
4866 }
4867 if (CompareRanges(set_.ranges(zone),
4868 kLineTerminatorRanges,
4869 kLineTerminatorRangeCount)) {
4870 set_.set_standard_set_type('n');
4871 return true;
4872 }
4873 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4874 set_.set_standard_set_type('w');
4875 return true;
4876 }
4877 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4878 set_.set_standard_set_type('W');
4879 return true;
4880 }
4881 return false;
4882}
4883
4884
Ben Murdoch109988c2016-05-18 11:27:45 +01004885UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4886 ZoneList<CharacterRange>* base)
4887 : zone_(zone),
4888 table_(zone),
4889 bmp_(nullptr),
4890 lead_surrogates_(nullptr),
4891 trail_surrogates_(nullptr),
4892 non_bmp_(nullptr) {
4893 // The unicode range splitter categorizes given character ranges into:
4894 // - Code points from the BMP representable by one code unit.
4895 // - Code points outside the BMP that need to be split into surrogate pairs.
4896 // - Lone lead surrogates.
4897 // - Lone trail surrogates.
4898 // Lone surrogates are valid code points, even though no actual characters.
4899 // They require special matching to make sure we do not split surrogate pairs.
4900 // We use the dispatch table to accomplish this. The base range is split up
4901 // by the table by the overlay ranges, and the Call callback is used to
4902 // filter and collect ranges for each category.
4903 for (int i = 0; i < base->length(); i++) {
4904 table_.AddRange(base->at(i), kBase, zone_);
4905 }
4906 // Add overlay ranges.
4907 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4908 kBmpCodePoints, zone_);
4909 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4910 kLeadSurrogates, zone_);
4911 table_.AddRange(
4912 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4913 kTrailSurrogates, zone_);
4914 table_.AddRange(
4915 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4916 kBmpCodePoints, zone_);
4917 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4918 kNonBmpCodePoints, zone_);
4919 table_.ForEach(this);
4920}
4921
4922
4923void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4924 OutSet* outset = entry.out_set();
4925 if (!outset->Get(kBase)) return;
4926 ZoneList<CharacterRange>** target = NULL;
4927 if (outset->Get(kBmpCodePoints)) {
4928 target = &bmp_;
4929 } else if (outset->Get(kLeadSurrogates)) {
4930 target = &lead_surrogates_;
4931 } else if (outset->Get(kTrailSurrogates)) {
4932 target = &trail_surrogates_;
4933 } else {
4934 DCHECK(outset->Get(kNonBmpCodePoints));
4935 target = &non_bmp_;
4936 }
4937 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4938 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4939}
4940
4941
4942void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4943 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4944 ZoneList<CharacterRange>* bmp = splitter->bmp();
4945 if (bmp == nullptr) return;
4946 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4947 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4948}
4949
4950
4951void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4952 RegExpNode* on_success,
4953 UnicodeRangeSplitter* splitter) {
4954 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4955 if (non_bmp == nullptr) return;
4956 DCHECK(compiler->unicode());
4957 DCHECK(!compiler->one_byte());
4958 Zone* zone = compiler->zone();
4959 CharacterRange::Canonicalize(non_bmp);
4960 for (int i = 0; i < non_bmp->length(); i++) {
4961 // Match surrogate pair.
4962 // E.g. [\u10005-\u11005] becomes
4963 // \ud800[\udc05-\udfff]|
4964 // [\ud801-\ud803][\udc00-\udfff]|
4965 // \ud804[\udc00-\udc05]
4966 uc32 from = non_bmp->at(i).from();
4967 uc32 to = non_bmp->at(i).to();
4968 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4969 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4970 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4971 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4972 if (from_l == to_l) {
4973 // The lead surrogate is the same.
4974 result->AddAlternative(
4975 GuardedAlternative(TextNode::CreateForSurrogatePair(
4976 zone, CharacterRange::Singleton(from_l),
4977 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4978 on_success)));
4979 } else {
4980 if (from_t != kTrailSurrogateStart) {
4981 // Add [from_l][from_t-\udfff]
4982 result->AddAlternative(
4983 GuardedAlternative(TextNode::CreateForSurrogatePair(
4984 zone, CharacterRange::Singleton(from_l),
4985 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4986 compiler->read_backward(), on_success)));
4987 from_l++;
4988 }
4989 if (to_t != kTrailSurrogateEnd) {
4990 // Add [to_l][\udc00-to_t]
4991 result->AddAlternative(
4992 GuardedAlternative(TextNode::CreateForSurrogatePair(
4993 zone, CharacterRange::Singleton(to_l),
4994 CharacterRange::Range(kTrailSurrogateStart, to_t),
4995 compiler->read_backward(), on_success)));
4996 to_l--;
4997 }
4998 if (from_l <= to_l) {
4999 // Add [from_l-to_l][\udc00-\udfff]
5000 result->AddAlternative(
5001 GuardedAlternative(TextNode::CreateForSurrogatePair(
5002 zone, CharacterRange::Range(from_l, to_l),
5003 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5004 compiler->read_backward(), on_success)));
5005 }
5006 }
5007 }
5008}
5009
5010
5011RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5012 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5013 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5014 bool read_backward) {
5015 Zone* zone = compiler->zone();
5016 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5017 zone, match, read_backward, on_success);
5018 int stack_register = compiler->UnicodeLookaroundStackRegister();
5019 int position_register = compiler->UnicodeLookaroundPositionRegister();
5020 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5021 position_register);
5022 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5023 zone, lookbehind, !read_backward, lookaround.on_match_success());
5024 return lookaround.ForMatch(negative_match);
5025}
5026
5027
5028RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5029 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5030 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5031 bool read_backward) {
5032 Zone* zone = compiler->zone();
5033 int stack_register = compiler->UnicodeLookaroundStackRegister();
5034 int position_register = compiler->UnicodeLookaroundPositionRegister();
5035 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5036 position_register);
5037 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5038 zone, lookahead, read_backward, lookaround.on_match_success());
5039 return TextNode::CreateForCharacterRanges(
5040 zone, match, read_backward, lookaround.ForMatch(negative_match));
5041}
5042
5043
5044void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5045 RegExpNode* on_success,
5046 UnicodeRangeSplitter* splitter) {
5047 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5048 if (lead_surrogates == nullptr) return;
5049 Zone* zone = compiler->zone();
5050 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5051 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5052 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5053
5054 RegExpNode* match;
5055 if (compiler->read_backward()) {
5056 // Reading backward. Assert that reading forward, there is no trail
5057 // surrogate, and then backward match the lead surrogate.
5058 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5059 compiler, trail_surrogates, lead_surrogates, on_success, true);
5060 } else {
5061 // Reading forward. Forward match the lead surrogate and assert that
5062 // no trail surrogate follows.
5063 match = MatchAndNegativeLookaroundInReadDirection(
5064 compiler, lead_surrogates, trail_surrogates, on_success, false);
5065 }
5066 result->AddAlternative(GuardedAlternative(match));
5067}
5068
5069
5070void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5071 RegExpNode* on_success,
5072 UnicodeRangeSplitter* splitter) {
5073 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5074 if (trail_surrogates == nullptr) return;
5075 Zone* zone = compiler->zone();
5076 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5077 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5078 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5079
5080 RegExpNode* match;
5081 if (compiler->read_backward()) {
5082 // Reading backward. Backward match the trail surrogate and assert that no
5083 // lead surrogate precedes it.
5084 match = MatchAndNegativeLookaroundInReadDirection(
5085 compiler, trail_surrogates, lead_surrogates, on_success, true);
5086 } else {
5087 // Reading forward. Assert that reading backward, there is no lead
5088 // surrogate, and then forward match the trail surrogate.
5089 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5090 compiler, lead_surrogates, trail_surrogates, on_success, false);
5091 }
5092 result->AddAlternative(GuardedAlternative(match));
5093}
5094
5095RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5096 RegExpNode* on_success) {
5097 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5098 DCHECK(!compiler->read_backward());
5099 Zone* zone = compiler->zone();
5100 // Advance any character. If the character happens to be a lead surrogate and
5101 // we advanced into the middle of a surrogate pair, it will work out, as
5102 // nothing will match from there. We will have to advance again, consuming
5103 // the associated trail surrogate.
5104 ZoneList<CharacterRange>* range = CharacterRange::List(
5105 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5106 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5107}
5108
5109
5110void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5111 ZoneList<CharacterRange>* ranges) {
5112#ifdef V8_I18N_SUPPORT
5113 // Use ICU to compute the case fold closure over the ranges.
5114 DCHECK(compiler->unicode());
5115 DCHECK(compiler->ignore_case());
5116 USet* set = uset_openEmpty();
5117 for (int i = 0; i < ranges->length(); i++) {
5118 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5119 }
5120 ranges->Clear();
5121 uset_closeOver(set, USET_CASE_INSENSITIVE);
5122 // Full case mapping map single characters to multiple characters.
5123 // Those are represented as strings in the set. Remove them so that
5124 // we end up with only simple and common case mappings.
5125 uset_removeAllStrings(set);
5126 int item_count = uset_getItemCount(set);
5127 int item_result = 0;
5128 UErrorCode ec = U_ZERO_ERROR;
5129 Zone* zone = compiler->zone();
5130 for (int i = 0; i < item_count; i++) {
5131 uc32 start = 0;
5132 uc32 end = 0;
5133 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5134 ranges->Add(CharacterRange::Range(start, end), zone);
5135 }
5136 // No errors and everything we collected have been ranges.
5137 DCHECK_EQ(U_ZERO_ERROR, ec);
5138 DCHECK_EQ(0, item_result);
5139 uset_close(set);
5140#else
5141 // Fallback if ICU is not included.
5142 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5143 ranges, compiler->one_byte());
5144#endif // V8_I18N_SUPPORT
5145 CharacterRange::Canonicalize(ranges);
5146}
5147
5148
Ben Murdoch014dc512016-03-22 12:00:34 +00005149RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5150 RegExpNode* on_success) {
Ben Murdoch109988c2016-05-18 11:27:45 +01005151 set_.Canonicalize();
5152 Zone* zone = compiler->zone();
5153 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5154 if (compiler->unicode() && compiler->ignore_case()) {
5155 AddUnicodeCaseEquivalents(compiler, ranges);
5156 }
5157 if (compiler->unicode() && !compiler->one_byte()) {
5158 if (is_negated()) {
5159 ZoneList<CharacterRange>* negated =
5160 new (zone) ZoneList<CharacterRange>(2, zone);
5161 CharacterRange::Negate(ranges, negated, zone);
5162 ranges = negated;
5163 }
5164 if (ranges->length() == 0) {
Ben Murdochbcf72ee2016-08-08 18:44:38 +01005165 ranges->Add(CharacterRange::Everything(), zone);
5166 RegExpCharacterClass* fail =
5167 new (zone) RegExpCharacterClass(ranges, true);
5168 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
Ben Murdoch109988c2016-05-18 11:27:45 +01005169 }
5170 if (standard_type() == '*') {
5171 return UnanchoredAdvance(compiler, on_success);
5172 } else {
5173 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5174 UnicodeRangeSplitter splitter(zone, ranges);
5175 AddBmpCharacters(compiler, result, on_success, &splitter);
5176 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5177 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5178 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5179 return result;
5180 }
5181 } else {
5182 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5183 }
Ben Murdoch014dc512016-03-22 12:00:34 +00005184}
5185
5186
5187int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5188 RegExpAtom* atom1 = (*a)->AsAtom();
5189 RegExpAtom* atom2 = (*b)->AsAtom();
5190 uc16 character1 = atom1->data().at(0);
5191 uc16 character2 = atom2->data().at(0);
5192 if (character1 < character2) return -1;
5193 if (character1 > character2) return 1;
5194 return 0;
5195}
5196
5197
5198static unibrow::uchar Canonical(
5199 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5200 unibrow::uchar c) {
5201 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5202 int length = canonicalize->get(c, '\0', chars);
5203 DCHECK_LE(length, 1);
5204 unibrow::uchar canonical = c;
5205 if (length == 1) canonical = chars[0];
5206 return canonical;
5207}
5208
5209
5210int CompareFirstCharCaseIndependent(
5211 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5212 RegExpTree* const* a, RegExpTree* const* b) {
5213 RegExpAtom* atom1 = (*a)->AsAtom();
5214 RegExpAtom* atom2 = (*b)->AsAtom();
5215 unibrow::uchar character1 = atom1->data().at(0);
5216 unibrow::uchar character2 = atom2->data().at(0);
5217 if (character1 == character2) return 0;
5218 if (character1 >= 'a' || character2 >= 'a') {
5219 character1 = Canonical(canonicalize, character1);
5220 character2 = Canonical(canonicalize, character2);
5221 }
5222 return static_cast<int>(character1) - static_cast<int>(character2);
5223}
5224
5225
5226// We can stable sort runs of atoms, since the order does not matter if they
5227// start with different characters.
5228// Returns true if any consecutive atoms were found.
5229bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5230 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5231 int length = alternatives->length();
5232 bool found_consecutive_atoms = false;
5233 for (int i = 0; i < length; i++) {
5234 while (i < length) {
5235 RegExpTree* alternative = alternatives->at(i);
5236 if (alternative->IsAtom()) break;
5237 i++;
5238 }
5239 // i is length or it is the index of an atom.
5240 if (i == length) break;
5241 int first_atom = i;
5242 i++;
5243 while (i < length) {
5244 RegExpTree* alternative = alternatives->at(i);
5245 if (!alternative->IsAtom()) break;
5246 i++;
5247 }
5248 // Sort atoms to get ones with common prefixes together.
5249 // This step is more tricky if we are in a case-independent regexp,
5250 // because it would change /is|I/ to /I|is/, and order matters when
5251 // the regexp parts don't match only disjoint starting points. To fix
5252 // this we have a version of CompareFirstChar that uses case-
5253 // independent character classes for comparison.
5254 DCHECK_LT(first_atom, alternatives->length());
5255 DCHECK_LE(i, alternatives->length());
5256 DCHECK_LE(first_atom, i);
5257 if (compiler->ignore_case()) {
5258 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5259 compiler->isolate()->regexp_macro_assembler_canonicalize();
5260 auto compare_closure =
5261 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5262 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5263 };
5264 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5265 } else {
5266 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5267 }
5268 if (i - first_atom > 1) found_consecutive_atoms = true;
5269 }
5270 return found_consecutive_atoms;
5271}
5272
5273
5274// Optimizes ab|ac|az to a(?:b|c|d).
5275void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5276 Zone* zone = compiler->zone();
5277 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5278 int length = alternatives->length();
5279
5280 int write_posn = 0;
5281 int i = 0;
5282 while (i < length) {
5283 RegExpTree* alternative = alternatives->at(i);
5284 if (!alternative->IsAtom()) {
5285 alternatives->at(write_posn++) = alternatives->at(i);
5286 i++;
5287 continue;
5288 }
5289 RegExpAtom* atom = alternative->AsAtom();
5290 unibrow::uchar common_prefix = atom->data().at(0);
5291 int first_with_prefix = i;
5292 int prefix_length = atom->length();
5293 i++;
5294 while (i < length) {
5295 alternative = alternatives->at(i);
5296 if (!alternative->IsAtom()) break;
5297 atom = alternative->AsAtom();
5298 unibrow::uchar new_prefix = atom->data().at(0);
5299 if (new_prefix != common_prefix) {
5300 if (!compiler->ignore_case()) break;
5301 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5302 compiler->isolate()->regexp_macro_assembler_canonicalize();
5303 new_prefix = Canonical(canonicalize, new_prefix);
5304 common_prefix = Canonical(canonicalize, common_prefix);
5305 if (new_prefix != common_prefix) break;
5306 }
5307 prefix_length = Min(prefix_length, atom->length());
5308 i++;
5309 }
5310 if (i > first_with_prefix + 2) {
5311 // Found worthwhile run of alternatives with common prefix of at least one
5312 // character. The sorting function above did not sort on more than one
5313 // character for reasons of correctness, but there may still be a longer
5314 // common prefix if the terms were similar or presorted in the input.
5315 // Find out how long the common prefix is.
5316 int run_length = i - first_with_prefix;
5317 atom = alternatives->at(first_with_prefix)->AsAtom();
5318 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5319 RegExpAtom* old_atom =
5320 alternatives->at(j + first_with_prefix)->AsAtom();
5321 for (int k = 1; k < prefix_length; k++) {
5322 if (atom->data().at(k) != old_atom->data().at(k)) {
5323 prefix_length = k;
5324 break;
5325 }
5326 }
5327 }
5328 RegExpAtom* prefix =
5329 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5330 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5331 pair->Add(prefix, zone);
5332 ZoneList<RegExpTree*>* suffixes =
5333 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5334 for (int j = 0; j < run_length; j++) {
5335 RegExpAtom* old_atom =
5336 alternatives->at(j + first_with_prefix)->AsAtom();
5337 int len = old_atom->length();
5338 if (len == prefix_length) {
5339 suffixes->Add(new (zone) RegExpEmpty(), zone);
5340 } else {
5341 RegExpTree* suffix = new (zone) RegExpAtom(
5342 old_atom->data().SubVector(prefix_length, old_atom->length()));
5343 suffixes->Add(suffix, zone);
5344 }
5345 }
5346 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5347 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5348 } else {
5349 // Just copy any non-worthwhile alternatives.
5350 for (int j = first_with_prefix; j < i; j++) {
5351 alternatives->at(write_posn++) = alternatives->at(j);
5352 }
5353 }
5354 }
5355 alternatives->Rewind(write_posn); // Trim end of array.
5356}
5357
5358
5359// Optimizes b|c|z to [bcz].
5360void RegExpDisjunction::FixSingleCharacterDisjunctions(
5361 RegExpCompiler* compiler) {
5362 Zone* zone = compiler->zone();
5363 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5364 int length = alternatives->length();
5365
5366 int write_posn = 0;
5367 int i = 0;
5368 while (i < length) {
5369 RegExpTree* alternative = alternatives->at(i);
5370 if (!alternative->IsAtom()) {
5371 alternatives->at(write_posn++) = alternatives->at(i);
5372 i++;
5373 continue;
5374 }
5375 RegExpAtom* atom = alternative->AsAtom();
5376 if (atom->length() != 1) {
5377 alternatives->at(write_posn++) = alternatives->at(i);
5378 i++;
5379 continue;
5380 }
5381 int first_in_run = i;
5382 i++;
5383 while (i < length) {
5384 alternative = alternatives->at(i);
5385 if (!alternative->IsAtom()) break;
5386 atom = alternative->AsAtom();
5387 if (atom->length() != 1) break;
5388 i++;
5389 }
5390 if (i > first_in_run + 1) {
5391 // Found non-trivial run of single-character alternatives.
5392 int run_length = i - first_in_run;
5393 ZoneList<CharacterRange>* ranges =
5394 new (zone) ZoneList<CharacterRange>(2, zone);
5395 for (int j = 0; j < run_length; j++) {
5396 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5397 DCHECK_EQ(old_atom->length(), 1);
5398 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5399 }
5400 alternatives->at(write_posn++) =
5401 new (zone) RegExpCharacterClass(ranges, false);
5402 } else {
5403 // Just copy any trivial alternatives.
5404 for (int j = first_in_run; j < i; j++) {
5405 alternatives->at(write_posn++) = alternatives->at(j);
5406 }
5407 }
5408 }
5409 alternatives->Rewind(write_posn); // Trim end of array.
5410}
5411
5412
5413RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5414 RegExpNode* on_success) {
5415 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5416
5417 if (alternatives->length() > 2) {
5418 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5419 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5420 FixSingleCharacterDisjunctions(compiler);
5421 if (alternatives->length() == 1) {
5422 return alternatives->at(0)->ToNode(compiler, on_success);
5423 }
5424 }
5425
5426 int length = alternatives->length();
5427
5428 ChoiceNode* result =
5429 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5430 for (int i = 0; i < length; i++) {
5431 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5432 on_success));
5433 result->AddAlternative(alternative);
5434 }
5435 return result;
5436}
5437
5438
5439RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5440 RegExpNode* on_success) {
5441 return ToNode(min(),
5442 max(),
5443 is_greedy(),
5444 body(),
5445 compiler,
5446 on_success);
5447}
5448
5449
5450// Scoped object to keep track of how much we unroll quantifier loops in the
5451// regexp graph generator.
5452class RegExpExpansionLimiter {
5453 public:
5454 static const int kMaxExpansionFactor = 6;
5455 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5456 : compiler_(compiler),
5457 saved_expansion_factor_(compiler->current_expansion_factor()),
5458 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5459 DCHECK(factor > 0);
5460 if (ok_to_expand_) {
5461 if (factor > kMaxExpansionFactor) {
5462 // Avoid integer overflow of the current expansion factor.
5463 ok_to_expand_ = false;
5464 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5465 } else {
5466 int new_factor = saved_expansion_factor_ * factor;
5467 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5468 compiler->set_current_expansion_factor(new_factor);
5469 }
5470 }
5471 }
5472
5473 ~RegExpExpansionLimiter() {
5474 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5475 }
5476
5477 bool ok_to_expand() { return ok_to_expand_; }
5478
5479 private:
5480 RegExpCompiler* compiler_;
5481 int saved_expansion_factor_;
5482 bool ok_to_expand_;
5483
5484 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5485};
5486
5487
5488RegExpNode* RegExpQuantifier::ToNode(int min,
5489 int max,
5490 bool is_greedy,
5491 RegExpTree* body,
5492 RegExpCompiler* compiler,
5493 RegExpNode* on_success,
5494 bool not_at_start) {
5495 // x{f, t} becomes this:
5496 //
5497 // (r++)<-.
5498 // | `
5499 // | (x)
5500 // v ^
5501 // (r=0)-->(?)---/ [if r < t]
5502 // |
5503 // [if r >= f] \----> ...
5504 //
5505
5506 // 15.10.2.5 RepeatMatcher algorithm.
5507 // The parser has already eliminated the case where max is 0. In the case
5508 // where max_match is zero the parser has removed the quantifier if min was
5509 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5510
5511 // If we know that we cannot match zero length then things are a little
5512 // simpler since we don't need to make the special zero length match check
5513 // from step 2.1. If the min and max are small we can unroll a little in
5514 // this case.
5515 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5516 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5517 if (max == 0) return on_success; // This can happen due to recursion.
5518 bool body_can_be_empty = (body->min_match() == 0);
5519 int body_start_reg = RegExpCompiler::kNoRegister;
5520 Interval capture_registers = body->CaptureRegisters();
5521 bool needs_capture_clearing = !capture_registers.is_empty();
5522 Zone* zone = compiler->zone();
5523
5524 if (body_can_be_empty) {
5525 body_start_reg = compiler->AllocateRegister();
5526 } else if (compiler->optimize() && !needs_capture_clearing) {
5527 // Only unroll if there are no captures and the body can't be
5528 // empty.
5529 {
5530 RegExpExpansionLimiter limiter(
5531 compiler, min + ((max != min) ? 1 : 0));
5532 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5533 int new_max = (max == kInfinity) ? max : max - min;
5534 // Recurse once to get the loop or optional matches after the fixed
5535 // ones.
5536 RegExpNode* answer = ToNode(
5537 0, new_max, is_greedy, body, compiler, on_success, true);
5538 // Unroll the forced matches from 0 to min. This can cause chains of
5539 // TextNodes (which the parser does not generate). These should be
5540 // combined if it turns out they hinder good code generation.
5541 for (int i = 0; i < min; i++) {
5542 answer = body->ToNode(compiler, answer);
5543 }
5544 return answer;
5545 }
5546 }
5547 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5548 DCHECK(max > 0); // Due to the 'if' above.
5549 RegExpExpansionLimiter limiter(compiler, max);
5550 if (limiter.ok_to_expand()) {
5551 // Unroll the optional matches up to max.
5552 RegExpNode* answer = on_success;
5553 for (int i = 0; i < max; i++) {
5554 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5555 if (is_greedy) {
5556 alternation->AddAlternative(
5557 GuardedAlternative(body->ToNode(compiler, answer)));
5558 alternation->AddAlternative(GuardedAlternative(on_success));
5559 } else {
5560 alternation->AddAlternative(GuardedAlternative(on_success));
5561 alternation->AddAlternative(
5562 GuardedAlternative(body->ToNode(compiler, answer)));
5563 }
5564 answer = alternation;
5565 if (not_at_start && !compiler->read_backward()) {
5566 alternation->set_not_at_start();
5567 }
5568 }
5569 return answer;
5570 }
5571 }
5572 }
5573 bool has_min = min > 0;
5574 bool has_max = max < RegExpTree::kInfinity;
5575 bool needs_counter = has_min || has_max;
5576 int reg_ctr = needs_counter
5577 ? compiler->AllocateRegister()
5578 : RegExpCompiler::kNoRegister;
5579 LoopChoiceNode* center = new (zone)
5580 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5581 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5582 RegExpNode* loop_return = needs_counter
5583 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5584 : static_cast<RegExpNode*>(center);
5585 if (body_can_be_empty) {
5586 // If the body can be empty we need to check if it was and then
5587 // backtrack.
5588 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5589 reg_ctr,
5590 min,
5591 loop_return);
5592 }
5593 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5594 if (body_can_be_empty) {
5595 // If the body can be empty we need to store the start position
5596 // so we can bail out if it was empty.
5597 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5598 }
5599 if (needs_capture_clearing) {
5600 // Before entering the body of this loop we need to clear captures.
5601 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5602 }
5603 GuardedAlternative body_alt(body_node);
5604 if (has_max) {
5605 Guard* body_guard =
5606 new(zone) Guard(reg_ctr, Guard::LT, max);
5607 body_alt.AddGuard(body_guard, zone);
5608 }
5609 GuardedAlternative rest_alt(on_success);
5610 if (has_min) {
5611 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5612 rest_alt.AddGuard(rest_guard, zone);
5613 }
5614 if (is_greedy) {
5615 center->AddLoopAlternative(body_alt);
5616 center->AddContinueAlternative(rest_alt);
5617 } else {
5618 center->AddContinueAlternative(rest_alt);
5619 center->AddLoopAlternative(body_alt);
5620 }
5621 if (needs_counter) {
5622 return ActionNode::SetRegister(reg_ctr, 0, center);
5623 } else {
5624 return center;
5625 }
5626}
5627
5628
5629RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5630 RegExpNode* on_success) {
5631 NodeInfo info;
5632 Zone* zone = compiler->zone();
5633
5634 switch (assertion_type()) {
5635 case START_OF_LINE:
5636 return AssertionNode::AfterNewline(on_success);
5637 case START_OF_INPUT:
5638 return AssertionNode::AtStart(on_success);
5639 case BOUNDARY:
5640 return AssertionNode::AtBoundary(on_success);
5641 case NON_BOUNDARY:
5642 return AssertionNode::AtNonBoundary(on_success);
5643 case END_OF_INPUT:
5644 return AssertionNode::AtEnd(on_success);
5645 case END_OF_LINE: {
5646 // Compile $ in multiline regexps as an alternation with a positive
5647 // lookahead in one side and an end-of-input on the other side.
5648 // We need two registers for the lookahead.
5649 int stack_pointer_register = compiler->AllocateRegister();
5650 int position_register = compiler->AllocateRegister();
5651 // The ChoiceNode to distinguish between a newline and end-of-input.
5652 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5653 // Create a newline atom.
5654 ZoneList<CharacterRange>* newline_ranges =
5655 new(zone) ZoneList<CharacterRange>(3, zone);
5656 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5657 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5658 TextNode* newline_matcher = new (zone) TextNode(
5659 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5660 stack_pointer_register, position_register,
5661 0, // No captures inside.
5662 -1, // Ignored if no captures.
5663 on_success));
5664 // Create an end-of-input matcher.
5665 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5666 stack_pointer_register,
5667 position_register,
5668 newline_matcher);
5669 // Add the two alternatives to the ChoiceNode.
5670 GuardedAlternative eol_alternative(end_of_line);
5671 result->AddAlternative(eol_alternative);
5672 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5673 result->AddAlternative(end_alternative);
5674 return result;
5675 }
5676 default:
5677 UNREACHABLE();
5678 }
5679 return on_success;
5680}
5681
5682
5683RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5684 RegExpNode* on_success) {
5685 return new (compiler->zone())
5686 BackReferenceNode(RegExpCapture::StartRegister(index()),
5687 RegExpCapture::EndRegister(index()),
5688 compiler->read_backward(), on_success);
5689}
5690
5691
5692RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5693 RegExpNode* on_success) {
5694 return on_success;
5695}
5696
5697
Ben Murdoch109988c2016-05-18 11:27:45 +01005698RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5699 int stack_pointer_register,
5700 int position_register,
5701 int capture_register_count,
5702 int capture_register_start)
5703 : is_positive_(is_positive),
5704 on_success_(on_success),
5705 stack_pointer_register_(stack_pointer_register),
5706 position_register_(position_register) {
5707 if (is_positive_) {
5708 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5709 stack_pointer_register, position_register, capture_register_count,
5710 capture_register_start, on_success_);
5711 } else {
5712 Zone* zone = on_success_->zone();
5713 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5714 stack_pointer_register, position_register, capture_register_count,
5715 capture_register_start, zone);
5716 }
5717}
5718
5719
5720RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5721 if (is_positive_) {
5722 return ActionNode::BeginSubmatch(stack_pointer_register_,
5723 position_register_, match);
5724 } else {
5725 Zone* zone = on_success_->zone();
5726 // We use a ChoiceNode to represent the negative lookaround. The first
5727 // alternative is the negative match. On success, the end node backtracks.
5728 // On failure, the second alternative is tried and leads to success.
5729 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5730 // first exit when calculating quick checks.
5731 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5732 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5733 return ActionNode::BeginSubmatch(stack_pointer_register_,
5734 position_register_, choice_node);
5735 }
5736}
5737
5738
Ben Murdoch014dc512016-03-22 12:00:34 +00005739RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5740 RegExpNode* on_success) {
5741 int stack_pointer_register = compiler->AllocateRegister();
5742 int position_register = compiler->AllocateRegister();
5743
5744 const int registers_per_capture = 2;
5745 const int register_of_first_capture = 2;
5746 int register_count = capture_count_ * registers_per_capture;
5747 int register_start =
5748 register_of_first_capture + capture_from_ * registers_per_capture;
5749
5750 RegExpNode* result;
5751 bool was_reading_backward = compiler->read_backward();
5752 compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch109988c2016-05-18 11:27:45 +01005753 Builder builder(is_positive(), on_success, stack_pointer_register,
5754 position_register, register_count, register_start);
5755 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5756 result = builder.ForMatch(match);
Ben Murdoch014dc512016-03-22 12:00:34 +00005757 compiler->set_read_backward(was_reading_backward);
5758 return result;
5759}
5760
5761
5762RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5763 RegExpNode* on_success) {
5764 return ToNode(body(), index(), compiler, on_success);
5765}
5766
5767
5768RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5769 int index,
5770 RegExpCompiler* compiler,
5771 RegExpNode* on_success) {
5772 DCHECK_NOT_NULL(body);
5773 int start_reg = RegExpCapture::StartRegister(index);
5774 int end_reg = RegExpCapture::EndRegister(index);
5775 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5776 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5777 RegExpNode* body_node = body->ToNode(compiler, store_end);
5778 return ActionNode::StorePosition(start_reg, true, body_node);
5779}
5780
5781
5782RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5783 RegExpNode* on_success) {
5784 ZoneList<RegExpTree*>* children = nodes();
5785 RegExpNode* current = on_success;
5786 if (compiler->read_backward()) {
5787 for (int i = 0; i < children->length(); i++) {
5788 current = children->at(i)->ToNode(compiler, current);
5789 }
5790 } else {
5791 for (int i = children->length() - 1; i >= 0; i--) {
5792 current = children->at(i)->ToNode(compiler, current);
5793 }
5794 }
5795 return current;
5796}
5797
5798
5799static void AddClass(const int* elmv,
5800 int elmc,
5801 ZoneList<CharacterRange>* ranges,
5802 Zone* zone) {
5803 elmc--;
Ben Murdoch109988c2016-05-18 11:27:45 +01005804 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch014dc512016-03-22 12:00:34 +00005805 for (int i = 0; i < elmc; i += 2) {
5806 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch109988c2016-05-18 11:27:45 +01005807 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch014dc512016-03-22 12:00:34 +00005808 }
5809}
5810
5811
5812static void AddClassNegated(const int *elmv,
5813 int elmc,
5814 ZoneList<CharacterRange>* ranges,
5815 Zone* zone) {
5816 elmc--;
Ben Murdoch109988c2016-05-18 11:27:45 +01005817 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch014dc512016-03-22 12:00:34 +00005818 DCHECK(elmv[0] != 0x0000);
Ben Murdoch109988c2016-05-18 11:27:45 +01005819 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch014dc512016-03-22 12:00:34 +00005820 uc16 last = 0x0000;
5821 for (int i = 0; i < elmc; i += 2) {
5822 DCHECK(last <= elmv[i] - 1);
5823 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch109988c2016-05-18 11:27:45 +01005824 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch014dc512016-03-22 12:00:34 +00005825 last = elmv[i + 1];
5826 }
Ben Murdoch109988c2016-05-18 11:27:45 +01005827 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch014dc512016-03-22 12:00:34 +00005828}
5829
5830
5831void CharacterRange::AddClassEscape(uc16 type,
5832 ZoneList<CharacterRange>* ranges,
5833 Zone* zone) {
5834 switch (type) {
5835 case 's':
5836 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5837 break;
5838 case 'S':
5839 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5840 break;
5841 case 'w':
5842 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5843 break;
5844 case 'W':
5845 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5846 break;
5847 case 'd':
5848 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5849 break;
5850 case 'D':
5851 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5852 break;
5853 case '.':
5854 AddClassNegated(kLineTerminatorRanges,
5855 kLineTerminatorRangeCount,
5856 ranges,
5857 zone);
5858 break;
5859 // This is not a character range as defined by the spec but a
5860 // convenient shorthand for a character class that matches any
5861 // character.
5862 case '*':
5863 ranges->Add(CharacterRange::Everything(), zone);
5864 break;
5865 // This is the set of characters matched by the $ and ^ symbols
5866 // in multiline mode.
5867 case 'n':
5868 AddClass(kLineTerminatorRanges,
5869 kLineTerminatorRangeCount,
5870 ranges,
5871 zone);
5872 break;
5873 default:
5874 UNREACHABLE();
5875 }
5876}
5877
5878
5879Vector<const int> CharacterRange::GetWordBounds() {
5880 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5881}
5882
5883
Ben Murdoch014dc512016-03-22 12:00:34 +00005884void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5885 ZoneList<CharacterRange>* ranges,
5886 bool is_one_byte) {
Ben Murdochbcf72ee2016-08-08 18:44:38 +01005887 CharacterRange::Canonicalize(ranges);
Ben Murdoch109988c2016-05-18 11:27:45 +01005888 int range_count = ranges->length();
5889 for (int i = 0; i < range_count; i++) {
5890 CharacterRange range = ranges->at(i);
5891 uc32 bottom = range.from();
5892 if (bottom > String::kMaxUtf16CodeUnit) return;
5893 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5894 // Nothing to be done for surrogates.
5895 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5896 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5897 if (bottom > String::kMaxOneByteCharCode) return;
5898 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch014dc512016-03-22 12:00:34 +00005899 }
Ben Murdoch109988c2016-05-18 11:27:45 +01005900 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5901 if (top == bottom) {
5902 // If this is a singleton we just expand the one character.
5903 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch014dc512016-03-22 12:00:34 +00005904 for (int i = 0; i < length; i++) {
Ben Murdoch109988c2016-05-18 11:27:45 +01005905 uc32 chr = chars[i];
5906 if (chr != bottom) {
5907 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch014dc512016-03-22 12:00:34 +00005908 }
5909 }
Ben Murdoch109988c2016-05-18 11:27:45 +01005910 } else {
5911 // If this is a range we expand the characters block by block, expanding
5912 // contiguous subranges (blocks) one at a time. The approach is as
5913 // follows. For a given start character we look up the remainder of the
5914 // block that contains it (represented by the end point), for instance we
5915 // find 'z' if the character is 'c'. A block is characterized by the
5916 // property that all characters uncanonicalize in the same way, except
5917 // that each entry in the result is incremented by the distance from the
5918 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5919 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5920 // we've found the end point we look up its uncanonicalization and
5921 // produce a range for each element. For instance for [c-f] we look up
5922 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5923 // it is not already contained in the input, so [c-f] will be skipped but
5924 // [C-F] will be added. If this range is not completely contained in a
5925 // block we do this for all the blocks covered by the range (handling
5926 // characters that is not in a block as a "singleton block").
5927 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5928 int pos = bottom;
5929 while (pos <= top) {
5930 int length =
5931 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5932 uc32 block_end;
5933 if (length == 0) {
5934 block_end = pos;
5935 } else {
5936 DCHECK_EQ(1, length);
5937 block_end = equivalents[0];
5938 }
5939 int end = (block_end > top) ? top : block_end;
5940 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5941 equivalents);
5942 for (int i = 0; i < length; i++) {
5943 uc32 c = equivalents[i];
5944 uc32 range_from = c - (block_end - pos);
5945 uc32 range_to = c - (block_end - end);
5946 if (!(bottom <= range_from && range_to <= top)) {
5947 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5948 }
5949 }
5950 pos = end + 1;
5951 }
Ben Murdoch014dc512016-03-22 12:00:34 +00005952 }
5953 }
5954}
5955
5956
5957bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5958 DCHECK_NOT_NULL(ranges);
5959 int n = ranges->length();
5960 if (n <= 1) return true;
5961 int max = ranges->at(0).to();
5962 for (int i = 1; i < n; i++) {
5963 CharacterRange next_range = ranges->at(i);
5964 if (next_range.from() <= max + 1) return false;
5965 max = next_range.to();
5966 }
5967 return true;
5968}
5969
5970
5971ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5972 if (ranges_ == NULL) {
5973 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5974 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5975 }
5976 return ranges_;
5977}
5978
5979
5980// Move a number of elements in a zonelist to another position
5981// in the same list. Handles overlapping source and target areas.
5982static void MoveRanges(ZoneList<CharacterRange>* list,
5983 int from,
5984 int to,
5985 int count) {
5986 // Ranges are potentially overlapping.
5987 if (from < to) {
5988 for (int i = count - 1; i >= 0; i--) {
5989 list->at(to + i) = list->at(from + i);
5990 }
5991 } else {
5992 for (int i = 0; i < count; i++) {
5993 list->at(to + i) = list->at(from + i);
5994 }
5995 }
5996}
5997
5998
5999static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6000 int count,
6001 CharacterRange insert) {
6002 // Inserts a range into list[0..count[, which must be sorted
6003 // by from value and non-overlapping and non-adjacent, using at most
6004 // list[0..count] for the result. Returns the number of resulting
6005 // canonicalized ranges. Inserting a range may collapse existing ranges into
6006 // fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch109988c2016-05-18 11:27:45 +01006007 uc32 from = insert.from();
6008 uc32 to = insert.to();
Ben Murdoch014dc512016-03-22 12:00:34 +00006009 int start_pos = 0;
6010 int end_pos = count;
6011 for (int i = count - 1; i >= 0; i--) {
6012 CharacterRange current = list->at(i);
6013 if (current.from() > to + 1) {
6014 end_pos = i;
6015 } else if (current.to() + 1 < from) {
6016 start_pos = i + 1;
6017 break;
6018 }
6019 }
6020
6021 // Inserted range overlaps, or is adjacent to, ranges at positions
6022 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6023 // not affected by the insertion.
6024 // If start_pos == end_pos, the range must be inserted before start_pos.
6025 // if start_pos < end_pos, the entire range from start_pos to end_pos
6026 // must be merged with the insert range.
6027
6028 if (start_pos == end_pos) {
6029 // Insert between existing ranges at position start_pos.
6030 if (start_pos < count) {
6031 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6032 }
6033 list->at(start_pos) = insert;
6034 return count + 1;
6035 }
6036 if (start_pos + 1 == end_pos) {
6037 // Replace single existing range at position start_pos.
6038 CharacterRange to_replace = list->at(start_pos);
6039 int new_from = Min(to_replace.from(), from);
6040 int new_to = Max(to_replace.to(), to);
Ben Murdoch109988c2016-05-18 11:27:45 +01006041 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch014dc512016-03-22 12:00:34 +00006042 return count;
6043 }
6044 // Replace a number of existing ranges from start_pos to end_pos - 1.
6045 // Move the remaining ranges down.
6046
6047 int new_from = Min(list->at(start_pos).from(), from);
6048 int new_to = Max(list->at(end_pos - 1).to(), to);
6049 if (end_pos < count) {
6050 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6051 }
Ben Murdoch109988c2016-05-18 11:27:45 +01006052 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch014dc512016-03-22 12:00:34 +00006053 return count - (end_pos - start_pos) + 1;
6054}
6055
6056
6057void CharacterSet::Canonicalize() {
6058 // Special/default classes are always considered canonical. The result
6059 // of calling ranges() will be sorted.
6060 if (ranges_ == NULL) return;
6061 CharacterRange::Canonicalize(ranges_);
6062}
6063
6064
6065void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6066 if (character_ranges->length() <= 1) return;
6067 // Check whether ranges are already canonical (increasing, non-overlapping,
6068 // non-adjacent).
6069 int n = character_ranges->length();
6070 int max = character_ranges->at(0).to();
6071 int i = 1;
6072 while (i < n) {
6073 CharacterRange current = character_ranges->at(i);
6074 if (current.from() <= max + 1) {
6075 break;
6076 }
6077 max = current.to();
6078 i++;
6079 }
6080 // Canonical until the i'th range. If that's all of them, we are done.
6081 if (i == n) return;
6082
6083 // The ranges at index i and forward are not canonicalized. Make them so by
6084 // doing the equivalent of insertion sort (inserting each into the previous
6085 // list, in order).
6086 // Notice that inserting a range can reduce the number of ranges in the
6087 // result due to combining of adjacent and overlapping ranges.
6088 int read = i; // Range to insert.
6089 int num_canonical = i; // Length of canonicalized part of list.
6090 do {
6091 num_canonical = InsertRangeInCanonicalList(character_ranges,
6092 num_canonical,
6093 character_ranges->at(read));
6094 read++;
6095 } while (read < n);
6096 character_ranges->Rewind(num_canonical);
6097
6098 DCHECK(CharacterRange::IsCanonical(character_ranges));
6099}
6100
6101
6102void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6103 ZoneList<CharacterRange>* negated_ranges,
6104 Zone* zone) {
6105 DCHECK(CharacterRange::IsCanonical(ranges));
6106 DCHECK_EQ(0, negated_ranges->length());
6107 int range_count = ranges->length();
Ben Murdoch109988c2016-05-18 11:27:45 +01006108 uc32 from = 0;
Ben Murdoch014dc512016-03-22 12:00:34 +00006109 int i = 0;
6110 if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch109988c2016-05-18 11:27:45 +01006111 from = ranges->at(0).to() + 1;
Ben Murdoch014dc512016-03-22 12:00:34 +00006112 i = 1;
6113 }
6114 while (i < range_count) {
6115 CharacterRange range = ranges->at(i);
Ben Murdoch109988c2016-05-18 11:27:45 +01006116 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6117 from = range.to() + 1;
Ben Murdoch014dc512016-03-22 12:00:34 +00006118 i++;
6119 }
Ben Murdoch109988c2016-05-18 11:27:45 +01006120 if (from < String::kMaxCodePoint) {
6121 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch014dc512016-03-22 12:00:34 +00006122 zone);
6123 }
6124}
6125
6126
6127// -------------------------------------------------------------------
6128// Splay tree
6129
6130
6131OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6132 if (Get(value))
6133 return this;
6134 if (successors(zone) != NULL) {
6135 for (int i = 0; i < successors(zone)->length(); i++) {
6136 OutSet* successor = successors(zone)->at(i);
6137 if (successor->Get(value))
6138 return successor;
6139 }
6140 } else {
6141 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6142 }
6143 OutSet* result = new(zone) OutSet(first_, remaining_);
6144 result->Set(value, zone);
6145 successors(zone)->Add(result, zone);
6146 return result;
6147}
6148
6149
6150void OutSet::Set(unsigned value, Zone *zone) {
6151 if (value < kFirstLimit) {
6152 first_ |= (1 << value);
6153 } else {
6154 if (remaining_ == NULL)
6155 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6156 if (remaining_->is_empty() || !remaining_->Contains(value))
6157 remaining_->Add(value, zone);
6158 }
6159}
6160
6161
6162bool OutSet::Get(unsigned value) const {
6163 if (value < kFirstLimit) {
6164 return (first_ & (1 << value)) != 0;
6165 } else if (remaining_ == NULL) {
6166 return false;
6167 } else {
6168 return remaining_->Contains(value);
6169 }
6170}
6171
6172
Ben Murdoch109988c2016-05-18 11:27:45 +01006173const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch014dc512016-03-22 12:00:34 +00006174
6175
6176void DispatchTable::AddRange(CharacterRange full_range, int value,
6177 Zone* zone) {
6178 CharacterRange current = full_range;
6179 if (tree()->is_empty()) {
6180 // If this is the first range we just insert into the table.
6181 ZoneSplayTree<Config>::Locator loc;
6182 bool inserted = tree()->Insert(current.from(), &loc);
6183 DCHECK(inserted);
6184 USE(inserted);
6185 loc.set_value(Entry(current.from(), current.to(),
6186 empty()->Extend(value, zone)));
6187 return;
6188 }
6189 // First see if there is a range to the left of this one that
6190 // overlaps.
6191 ZoneSplayTree<Config>::Locator loc;
6192 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6193 Entry* entry = &loc.value();
6194 // If we've found a range that overlaps with this one, and it
6195 // starts strictly to the left of this one, we have to fix it
6196 // because the following code only handles ranges that start on
6197 // or after the start point of the range we're adding.
6198 if (entry->from() < current.from() && entry->to() >= current.from()) {
6199 // Snap the overlapping range in half around the start point of
6200 // the range we're adding.
Ben Murdoch109988c2016-05-18 11:27:45 +01006201 CharacterRange left =
6202 CharacterRange::Range(entry->from(), current.from() - 1);
6203 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch014dc512016-03-22 12:00:34 +00006204 // The left part of the overlapping range doesn't overlap.
6205 // Truncate the whole entry to be just the left part.
6206 entry->set_to(left.to());
6207 // The right part is the one that overlaps. We add this part
6208 // to the map and let the next step deal with merging it with
6209 // the range we're adding.
6210 ZoneSplayTree<Config>::Locator loc;
6211 bool inserted = tree()->Insert(right.from(), &loc);
6212 DCHECK(inserted);
6213 USE(inserted);
6214 loc.set_value(Entry(right.from(),
6215 right.to(),
6216 entry->out_set()));
6217 }
6218 }
6219 while (current.is_valid()) {
6220 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6221 (loc.value().from() <= current.to()) &&
6222 (loc.value().to() >= current.from())) {
6223 Entry* entry = &loc.value();
6224 // We have overlap. If there is space between the start point of
6225 // the range we're adding and where the overlapping range starts
6226 // then we have to add a range covering just that space.
6227 if (current.from() < entry->from()) {
6228 ZoneSplayTree<Config>::Locator ins;
6229 bool inserted = tree()->Insert(current.from(), &ins);
6230 DCHECK(inserted);
6231 USE(inserted);
6232 ins.set_value(Entry(current.from(),
6233 entry->from() - 1,
6234 empty()->Extend(value, zone)));
6235 current.set_from(entry->from());
6236 }
6237 DCHECK_EQ(current.from(), entry->from());
6238 // If the overlapping range extends beyond the one we want to add
6239 // we have to snap the right part off and add it separately.
6240 if (entry->to() > current.to()) {
6241 ZoneSplayTree<Config>::Locator ins;
6242 bool inserted = tree()->Insert(current.to() + 1, &ins);
6243 DCHECK(inserted);
6244 USE(inserted);
6245 ins.set_value(Entry(current.to() + 1,
6246 entry->to(),
6247 entry->out_set()));
6248 entry->set_to(current.to());
6249 }
6250 DCHECK(entry->to() <= current.to());
6251 // The overlapping range is now completely contained by the range
6252 // we're adding so we can just update it and move the start point
6253 // of the range we're adding just past it.
6254 entry->AddValue(value, zone);
Ben Murdoch014dc512016-03-22 12:00:34 +00006255 DCHECK(entry->to() + 1 > current.from());
6256 current.set_from(entry->to() + 1);
6257 } else {
6258 // There is no overlap so we can just add the range
6259 ZoneSplayTree<Config>::Locator ins;
6260 bool inserted = tree()->Insert(current.from(), &ins);
6261 DCHECK(inserted);
6262 USE(inserted);
6263 ins.set_value(Entry(current.from(),
6264 current.to(),
6265 empty()->Extend(value, zone)));
6266 break;
6267 }
6268 }
6269}
6270
6271
Ben Murdoch109988c2016-05-18 11:27:45 +01006272OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch014dc512016-03-22 12:00:34 +00006273 ZoneSplayTree<Config>::Locator loc;
6274 if (!tree()->FindGreatestLessThan(value, &loc))
6275 return empty();
6276 Entry* entry = &loc.value();
6277 if (value <= entry->to())
6278 return entry->out_set();
6279 else
6280 return empty();
6281}
6282
6283
6284// -------------------------------------------------------------------
6285// Analysis
6286
6287
6288void Analysis::EnsureAnalyzed(RegExpNode* that) {
6289 StackLimitCheck check(isolate());
6290 if (check.HasOverflowed()) {
6291 fail("Stack overflow");
6292 return;
6293 }
6294 if (that->info()->been_analyzed || that->info()->being_analyzed)
6295 return;
6296 that->info()->being_analyzed = true;
6297 that->Accept(this);
6298 that->info()->being_analyzed = false;
6299 that->info()->been_analyzed = true;
6300}
6301
6302
6303void Analysis::VisitEnd(EndNode* that) {
6304 // nothing to do
6305}
6306
6307
6308void TextNode::CalculateOffsets() {
6309 int element_count = elements()->length();
6310 // Set up the offsets of the elements relative to the start. This is a fixed
6311 // quantity since a TextNode can only contain fixed-width things.
6312 int cp_offset = 0;
6313 for (int i = 0; i < element_count; i++) {
6314 TextElement& elm = elements()->at(i);
6315 elm.set_cp_offset(cp_offset);
6316 cp_offset += elm.length();
6317 }
6318}
6319
6320
6321void Analysis::VisitText(TextNode* that) {
Ben Murdoch109988c2016-05-18 11:27:45 +01006322 if (ignore_case()) {
Ben Murdoch014dc512016-03-22 12:00:34 +00006323 that->MakeCaseIndependent(isolate(), is_one_byte_);
6324 }
6325 EnsureAnalyzed(that->on_success());
6326 if (!has_failed()) {
6327 that->CalculateOffsets();
6328 }
6329}
6330
6331
6332void Analysis::VisitAction(ActionNode* that) {
6333 RegExpNode* target = that->on_success();
6334 EnsureAnalyzed(target);
6335 if (!has_failed()) {
6336 // If the next node is interested in what it follows then this node
6337 // has to be interested too so it can pass the information on.
6338 that->info()->AddFromFollowing(target->info());
6339 }
6340}
6341
6342
6343void Analysis::VisitChoice(ChoiceNode* that) {
6344 NodeInfo* info = that->info();
6345 for (int i = 0; i < that->alternatives()->length(); i++) {
6346 RegExpNode* node = that->alternatives()->at(i).node();
6347 EnsureAnalyzed(node);
6348 if (has_failed()) return;
6349 // Anything the following nodes need to know has to be known by
6350 // this node also, so it can pass it on.
6351 info->AddFromFollowing(node->info());
6352 }
6353}
6354
6355
6356void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6357 NodeInfo* info = that->info();
6358 for (int i = 0; i < that->alternatives()->length(); i++) {
6359 RegExpNode* node = that->alternatives()->at(i).node();
6360 if (node != that->loop_node()) {
6361 EnsureAnalyzed(node);
6362 if (has_failed()) return;
6363 info->AddFromFollowing(node->info());
6364 }
6365 }
6366 // Check the loop last since it may need the value of this node
6367 // to get a correct result.
6368 EnsureAnalyzed(that->loop_node());
6369 if (!has_failed()) {
6370 info->AddFromFollowing(that->loop_node()->info());
6371 }
6372}
6373
6374
6375void Analysis::VisitBackReference(BackReferenceNode* that) {
6376 EnsureAnalyzed(that->on_success());
6377}
6378
6379
6380void Analysis::VisitAssertion(AssertionNode* that) {
6381 EnsureAnalyzed(that->on_success());
6382}
6383
6384
6385void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6386 BoyerMooreLookahead* bm,
6387 bool not_at_start) {
6388 // Working out the set of characters that a backreference can match is too
6389 // hard, so we just say that any character can match.
6390 bm->SetRest(offset);
6391 SaveBMInfo(bm, not_at_start, offset);
6392}
6393
6394
6395STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6396 RegExpMacroAssembler::kTableSize);
6397
6398
6399void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6400 BoyerMooreLookahead* bm, bool not_at_start) {
6401 ZoneList<GuardedAlternative>* alts = alternatives();
6402 budget = (budget - 1) / alts->length();
6403 for (int i = 0; i < alts->length(); i++) {
6404 GuardedAlternative& alt = alts->at(i);
6405 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6406 bm->SetRest(offset); // Give up trying to fill in info.
6407 SaveBMInfo(bm, not_at_start, offset);
6408 return;
6409 }
6410 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6411 }
6412 SaveBMInfo(bm, not_at_start, offset);
6413}
6414
6415
6416void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6417 BoyerMooreLookahead* bm, bool not_at_start) {
6418 if (initial_offset >= bm->length()) return;
6419 int offset = initial_offset;
6420 int max_char = bm->max_char();
6421 for (int i = 0; i < elements()->length(); i++) {
6422 if (offset >= bm->length()) {
6423 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6424 return;
6425 }
6426 TextElement text = elements()->at(i);
6427 if (text.text_type() == TextElement::ATOM) {
6428 RegExpAtom* atom = text.atom();
6429 for (int j = 0; j < atom->length(); j++, offset++) {
6430 if (offset >= bm->length()) {
6431 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6432 return;
6433 }
6434 uc16 character = atom->data()[j];
6435 if (bm->compiler()->ignore_case()) {
6436 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6437 int length = GetCaseIndependentLetters(
6438 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6439 chars);
6440 for (int j = 0; j < length; j++) {
6441 bm->Set(offset, chars[j]);
6442 }
6443 } else {
6444 if (character <= max_char) bm->Set(offset, character);
6445 }
6446 }
6447 } else {
6448 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6449 RegExpCharacterClass* char_class = text.char_class();
6450 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6451 if (char_class->is_negated()) {
6452 bm->SetAll(offset);
6453 } else {
6454 for (int k = 0; k < ranges->length(); k++) {
6455 CharacterRange& range = ranges->at(k);
6456 if (range.from() > max_char) continue;
6457 int to = Min(max_char, static_cast<int>(range.to()));
6458 bm->SetInterval(offset, Interval(range.from(), to));
6459 }
6460 }
6461 offset++;
6462 }
6463 }
6464 if (offset >= bm->length()) {
6465 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6466 return;
6467 }
6468 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6469 true); // Not at start after a text node.
6470 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6471}
6472
6473
6474// -------------------------------------------------------------------
6475// Dispatch table construction
6476
6477
6478void DispatchTableConstructor::VisitEnd(EndNode* that) {
6479 AddRange(CharacterRange::Everything());
6480}
6481
6482
6483void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6484 node->set_being_calculated(true);
6485 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6486 for (int i = 0; i < alternatives->length(); i++) {
6487 set_choice_index(i);
6488 alternatives->at(i).node()->Accept(this);
6489 }
6490 node->set_being_calculated(false);
6491}
6492
6493
6494class AddDispatchRange {
6495 public:
6496 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6497 : constructor_(constructor) { }
6498 void Call(uc32 from, DispatchTable::Entry entry);
6499 private:
6500 DispatchTableConstructor* constructor_;
6501};
6502
6503
6504void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch109988c2016-05-18 11:27:45 +01006505 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch014dc512016-03-22 12:00:34 +00006506}
6507
6508
6509void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6510 if (node->being_calculated())
6511 return;
6512 DispatchTable* table = node->GetTable(ignore_case_);
6513 AddDispatchRange adder(this);
6514 table->ForEach(&adder);
6515}
6516
6517
6518void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6519 // TODO(160): Find the node that we refer back to and propagate its start
6520 // set back to here. For now we just accept anything.
6521 AddRange(CharacterRange::Everything());
6522}
6523
6524
6525void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6526 RegExpNode* target = that->on_success();
6527 target->Accept(this);
6528}
6529
6530
6531static int CompareRangeByFrom(const CharacterRange* a,
6532 const CharacterRange* b) {
6533 return Compare<uc16>(a->from(), b->from());
6534}
6535
6536
6537void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6538 ranges->Sort(CompareRangeByFrom);
6539 uc16 last = 0;
6540 for (int i = 0; i < ranges->length(); i++) {
6541 CharacterRange range = ranges->at(i);
6542 if (last < range.from())
Ben Murdoch109988c2016-05-18 11:27:45 +01006543 AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch014dc512016-03-22 12:00:34 +00006544 if (range.to() >= last) {
Ben Murdoch109988c2016-05-18 11:27:45 +01006545 if (range.to() == String::kMaxCodePoint) {
Ben Murdoch014dc512016-03-22 12:00:34 +00006546 return;
6547 } else {
6548 last = range.to() + 1;
6549 }
6550 }
6551 }
Ben Murdoch109988c2016-05-18 11:27:45 +01006552 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch014dc512016-03-22 12:00:34 +00006553}
6554
6555
6556void DispatchTableConstructor::VisitText(TextNode* that) {
6557 TextElement elm = that->elements()->at(0);
6558 switch (elm.text_type()) {
6559 case TextElement::ATOM: {
6560 uc16 c = elm.atom()->data()[0];
Ben Murdoch109988c2016-05-18 11:27:45 +01006561 AddRange(CharacterRange::Range(c, c));
Ben Murdoch014dc512016-03-22 12:00:34 +00006562 break;
6563 }
6564 case TextElement::CHAR_CLASS: {
6565 RegExpCharacterClass* tree = elm.char_class();
6566 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6567 if (tree->is_negated()) {
6568 AddInverse(ranges);
6569 } else {
6570 for (int i = 0; i < ranges->length(); i++)
6571 AddRange(ranges->at(i));
6572 }
6573 break;
6574 }
6575 default: {
6576 UNIMPLEMENTED();
6577 }
6578 }
6579}
6580
6581
6582void DispatchTableConstructor::VisitAction(ActionNode* that) {
6583 RegExpNode* target = that->on_success();
6584 target->Accept(this);
6585}
6586
6587
Ben Murdoch109988c2016-05-18 11:27:45 +01006588RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6589 RegExpNode* on_success) {
6590 // If the regexp matching starts within a surrogate pair, step back
6591 // to the lead surrogate and start matching from there.
6592 DCHECK(!compiler->read_backward());
6593 Zone* zone = compiler->zone();
6594 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6595 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6596 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6597 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6598
6599 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6600
6601 int stack_register = compiler->UnicodeLookaroundStackRegister();
6602 int position_register = compiler->UnicodeLookaroundPositionRegister();
6603 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6604 zone, lead_surrogates, true, on_success);
6605 RegExpLookaround::Builder builder(true, step_back, stack_register,
6606 position_register);
6607 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6608 zone, trail_surrogates, false, builder.on_match_success());
6609
6610 optional_step_back->AddAlternative(
6611 GuardedAlternative(builder.ForMatch(match_trail)));
6612 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6613
6614 return optional_step_back;
6615}
6616
6617
Ben Murdoch014dc512016-03-22 12:00:34 +00006618RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch109988c2016-05-18 11:27:45 +01006619 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6620 JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch014dc512016-03-22 12:00:34 +00006621 Handle<String> sample_subject, bool is_one_byte) {
6622 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6623 return IrregexpRegExpTooBig(isolate);
6624 }
Ben Murdoch109988c2016-05-18 11:27:45 +01006625 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6626 bool is_sticky = flags & JSRegExp::kSticky;
6627 bool is_global = flags & JSRegExp::kGlobal;
6628 bool is_unicode = flags & JSRegExp::kUnicode;
6629 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch014dc512016-03-22 12:00:34 +00006630 is_one_byte);
6631
6632 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6633
6634 // Sample some characters from the middle of the string.
6635 static const int kSampleSize = 128;
6636
6637 sample_subject = String::Flatten(sample_subject);
6638 int chars_sampled = 0;
6639 int half_way = (sample_subject->length() - kSampleSize) / 2;
6640 for (int i = Max(0, half_way);
6641 i < sample_subject->length() && chars_sampled < kSampleSize;
6642 i++, chars_sampled++) {
6643 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6644 }
6645
6646 // Wrap the body of the regexp in capture #0.
6647 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6648 0,
6649 &compiler,
6650 compiler.accept());
6651 RegExpNode* node = captured_body;
6652 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6653 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6654 int max_length = data->tree->max_match();
6655 if (!is_start_anchored && !is_sticky) {
6656 // Add a .*? at the beginning, outside the body capture, unless
6657 // this expression is anchored at the beginning or sticky.
6658 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6659 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6660 &compiler, captured_body, data->contains_anchor);
6661
6662 if (data->contains_anchor) {
6663 // Unroll loop once, to take care of the case that might start
6664 // at the start of input.
6665 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6666 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6667 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6668 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6669 node = first_step_node;
6670 } else {
6671 node = loop_node;
6672 }
6673 }
6674 if (is_one_byte) {
6675 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6676 // Do it again to propagate the new nodes to places where they were not
6677 // put because they had not been calculated yet.
6678 if (node != NULL) {
6679 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6680 }
Ben Murdoch109988c2016-05-18 11:27:45 +01006681 } else if (compiler.unicode() && (is_global || is_sticky)) {
6682 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch014dc512016-03-22 12:00:34 +00006683 }
6684
6685 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6686 data->node = node;
Ben Murdoch109988c2016-05-18 11:27:45 +01006687 Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch014dc512016-03-22 12:00:34 +00006688 analysis.EnsureAnalyzed(node);
6689 if (analysis.has_failed()) {
6690 const char* error_message = analysis.error_message();
6691 return CompilationResult(isolate, error_message);
6692 }
6693
6694 // Create the correct assembler for the architecture.
6695#ifndef V8_INTERPRETED_REGEXP
6696 // Native regexp implementation.
6697
6698 NativeRegExpMacroAssembler::Mode mode =
6699 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6700 : NativeRegExpMacroAssembler::UC16;
6701
6702#if V8_TARGET_ARCH_IA32
6703 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6704 (data->capture_count + 1) * 2);
6705#elif V8_TARGET_ARCH_X64
6706 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6707 (data->capture_count + 1) * 2);
6708#elif V8_TARGET_ARCH_ARM
6709 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6710 (data->capture_count + 1) * 2);
6711#elif V8_TARGET_ARCH_ARM64
6712 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6713 (data->capture_count + 1) * 2);
Ben Murdoch3b9bc312016-06-02 14:46:10 +01006714#elif V8_TARGET_ARCH_S390
6715 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6716 (data->capture_count + 1) * 2);
Ben Murdoch014dc512016-03-22 12:00:34 +00006717#elif V8_TARGET_ARCH_PPC
6718 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6719 (data->capture_count + 1) * 2);
6720#elif V8_TARGET_ARCH_MIPS
6721 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6722 (data->capture_count + 1) * 2);
6723#elif V8_TARGET_ARCH_MIPS64
6724 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6725 (data->capture_count + 1) * 2);
6726#elif V8_TARGET_ARCH_X87
6727 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6728 (data->capture_count + 1) * 2);
6729#else
6730#error "Unsupported architecture"
6731#endif
6732
6733#else // V8_INTERPRETED_REGEXP
6734 // Interpreted regexp implementation.
6735 EmbeddedVector<byte, 1024> codes;
6736 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6737#endif // V8_INTERPRETED_REGEXP
6738
6739 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6740
6741 // Inserted here, instead of in Assembler, because it depends on information
6742 // in the AST that isn't replicated in the Node structure.
6743 static const int kMaxBacksearchLimit = 1024;
6744 if (is_end_anchored &&
6745 !is_start_anchored &&
6746 max_length < kMaxBacksearchLimit) {
6747 macro_assembler.SetCurrentPositionFromEnd(max_length);
6748 }
6749
6750 if (is_global) {
Ben Murdoch109988c2016-05-18 11:27:45 +01006751 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6752 if (data->tree->min_match() > 0) {
6753 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6754 } else if (is_unicode) {
6755 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6756 }
6757 macro_assembler.set_global_mode(mode);
Ben Murdoch014dc512016-03-22 12:00:34 +00006758 }
6759
6760 return compiler.Assemble(&macro_assembler,
6761 node,
6762 data->capture_count,
6763 pattern);
6764}
6765
6766
6767bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6768 Heap* heap = pattern->GetHeap();
6769 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6770 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
Ben Murdochbcf72ee2016-08-08 18:44:38 +01006771 heap->memory_allocator()->SizeExecutable() >
Ben Murdoch014dc512016-03-22 12:00:34 +00006772 RegExpImpl::kRegExpExecutableMemoryLimit) {
6773 too_much = true;
6774 }
6775 return too_much;
6776}
6777
6778
6779Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6780 Object* key_pattern,
6781 FixedArray** last_match_cache,
6782 ResultsCacheType type) {
6783 FixedArray* cache;
6784 if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6785 if (type == STRING_SPLIT_SUBSTRINGS) {
6786 DCHECK(key_pattern->IsString());
6787 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6788 cache = heap->string_split_cache();
6789 } else {
6790 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6791 DCHECK(key_pattern->IsFixedArray());
6792 cache = heap->regexp_multiple_cache();
6793 }
6794
6795 uint32_t hash = key_string->Hash();
6796 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6797 ~(kArrayEntriesPerCacheEntry - 1));
6798 if (cache->get(index + kStringOffset) != key_string ||
6799 cache->get(index + kPatternOffset) != key_pattern) {
6800 index =
6801 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6802 if (cache->get(index + kStringOffset) != key_string ||
6803 cache->get(index + kPatternOffset) != key_pattern) {
6804 return Smi::FromInt(0);
6805 }
6806 }
6807
6808 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6809 return cache->get(index + kArrayOffset);
6810}
6811
6812
6813void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6814 Handle<Object> key_pattern,
6815 Handle<FixedArray> value_array,
6816 Handle<FixedArray> last_match_cache,
6817 ResultsCacheType type) {
6818 Factory* factory = isolate->factory();
6819 Handle<FixedArray> cache;
6820 if (!key_string->IsInternalizedString()) return;
6821 if (type == STRING_SPLIT_SUBSTRINGS) {
6822 DCHECK(key_pattern->IsString());
6823 if (!key_pattern->IsInternalizedString()) return;
6824 cache = factory->string_split_cache();
6825 } else {
6826 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6827 DCHECK(key_pattern->IsFixedArray());
6828 cache = factory->regexp_multiple_cache();
6829 }
6830
6831 uint32_t hash = key_string->Hash();
6832 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6833 ~(kArrayEntriesPerCacheEntry - 1));
6834 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6835 cache->set(index + kStringOffset, *key_string);
6836 cache->set(index + kPatternOffset, *key_pattern);
6837 cache->set(index + kArrayOffset, *value_array);
6838 cache->set(index + kLastMatchOffset, *last_match_cache);
6839 } else {
6840 uint32_t index2 =
6841 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6842 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6843 cache->set(index2 + kStringOffset, *key_string);
6844 cache->set(index2 + kPatternOffset, *key_pattern);
6845 cache->set(index2 + kArrayOffset, *value_array);
6846 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6847 } else {
6848 cache->set(index2 + kStringOffset, Smi::FromInt(0));
6849 cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6850 cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6851 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6852 cache->set(index + kStringOffset, *key_string);
6853 cache->set(index + kPatternOffset, *key_pattern);
6854 cache->set(index + kArrayOffset, *value_array);
6855 cache->set(index + kLastMatchOffset, *last_match_cache);
6856 }
6857 }
6858 // If the array is a reasonably short list of substrings, convert it into a
6859 // list of internalized strings.
6860 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6861 for (int i = 0; i < value_array->length(); i++) {
6862 Handle<String> str(String::cast(value_array->get(i)), isolate);
6863 Handle<String> internalized_str = factory->InternalizeString(str);
6864 value_array->set(i, *internalized_str);
6865 }
6866 }
6867 // Convert backing store to a copy-on-write array.
6868 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6869}
6870
6871
6872void RegExpResultsCache::Clear(FixedArray* cache) {
6873 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6874 cache->set(i, Smi::FromInt(0));
6875 }
6876}
6877
6878} // namespace internal
6879} // namespace v8