blob: 80f48ca1a9fd34318de2c638889b58fed4f3dc7d [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/jsregexp.h"
6
7#include "src/ast/ast.h"
8#include "src/base/platform/platform.h"
9#include "src/compilation-cache.h"
10#include "src/compiler.h"
11#include "src/execution.h"
12#include "src/factory.h"
13#include "src/isolate-inl.h"
14#include "src/messages.h"
15#include "src/ostreams.h"
16#include "src/regexp/interpreter-irregexp.h"
17#include "src/regexp/jsregexp-inl.h"
18#include "src/regexp/regexp-macro-assembler.h"
19#include "src/regexp/regexp-macro-assembler-irregexp.h"
20#include "src/regexp/regexp-macro-assembler-tracer.h"
21#include "src/regexp/regexp-parser.h"
22#include "src/regexp/regexp-stack.h"
23#include "src/runtime/runtime.h"
24#include "src/splay-tree-inl.h"
25#include "src/string-search.h"
26#include "src/unicode-decoder.h"
27
Ben Murdoch097c5b22016-05-18 11:27:45 +010028#ifdef V8_I18N_SUPPORT
29#include "unicode/uset.h"
30#include "unicode/utypes.h"
31#endif // V8_I18N_SUPPORT
32
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033#ifndef V8_INTERPRETED_REGEXP
34#if V8_TARGET_ARCH_IA32
35#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
36#elif V8_TARGET_ARCH_X64
37#include "src/regexp/x64/regexp-macro-assembler-x64.h"
38#elif V8_TARGET_ARCH_ARM64
39#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
40#elif V8_TARGET_ARCH_ARM
41#include "src/regexp/arm/regexp-macro-assembler-arm.h"
42#elif V8_TARGET_ARCH_PPC
43#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
44#elif V8_TARGET_ARCH_MIPS
45#include "src/regexp/mips/regexp-macro-assembler-mips.h"
46#elif V8_TARGET_ARCH_MIPS64
47#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
48#elif V8_TARGET_ARCH_X87
49#include "src/regexp/x87/regexp-macro-assembler-x87.h"
50#else
51#error Unsupported target architecture.
52#endif
53#endif
54
55
56namespace v8 {
57namespace internal {
58
59MUST_USE_RESULT
60static inline MaybeHandle<Object> ThrowRegExpException(
61 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
62 Isolate* isolate = re->GetIsolate();
63 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
64 pattern, error_text),
65 Object);
66}
67
68
69inline void ThrowRegExpException(Handle<JSRegExp> re,
70 Handle<String> error_text) {
71 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
72}
73
74
75ContainedInLattice AddRange(ContainedInLattice containment,
76 const int* ranges,
77 int ranges_length,
78 Interval new_range) {
79 DCHECK((ranges_length & 1) == 1);
Ben Murdoch097c5b22016-05-18 11:27:45 +010080 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000081 if (containment == kLatticeUnknown) return containment;
82 bool inside = false;
83 int last = 0;
84 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
85 // Consider the range from last to ranges[i].
86 // We haven't got to the new range yet.
87 if (ranges[i] <= new_range.from()) continue;
88 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
89 // inclusive, but the values in ranges are not.
90 if (last <= new_range.from() && new_range.to() < ranges[i]) {
91 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
92 }
93 return kLatticeUnknown;
94 }
95 return containment;
96}
97
98
99// More makes code generation slower, less makes V8 benchmark score lower.
100const int kMaxLookaheadForBoyerMoore = 8;
101// In a 3-character pattern you can maximally step forwards 3 characters
102// at a time, which is not always enough to pay for the extra logic.
103const int kPatternTooShortForBoyerMoore = 2;
104
105
106// Identifies the sort of regexps where the regexp engine is faster
107// than the code used for atom matches.
108static bool HasFewDifferentCharacters(Handle<String> pattern) {
109 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
110 if (length <= kPatternTooShortForBoyerMoore) return false;
111 const int kMod = 128;
112 bool character_found[kMod];
113 int different = 0;
114 memset(&character_found[0], 0, sizeof(character_found));
115 for (int i = 0; i < length; i++) {
116 int ch = (pattern->Get(i) & (kMod - 1));
117 if (!character_found[ch]) {
118 character_found[ch] = true;
119 different++;
120 // We declare a regexp low-alphabet if it has at least 3 times as many
121 // characters as it has different characters.
122 if (different * 3 > length) return false;
123 }
124 }
125 return true;
126}
127
128
129// Generic RegExp methods. Dispatches to implementation specific methods.
130
131
132MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
133 Handle<String> pattern,
134 JSRegExp::Flags flags) {
135 Isolate* isolate = re->GetIsolate();
136 Zone zone;
137 CompilationCache* compilation_cache = isolate->compilation_cache();
138 MaybeHandle<FixedArray> maybe_cached =
139 compilation_cache->LookupRegExp(pattern, flags);
140 Handle<FixedArray> cached;
141 bool in_cache = maybe_cached.ToHandle(&cached);
142 LOG(isolate, RegExpCompileEvent(re, in_cache));
143
144 Handle<Object> result;
145 if (in_cache) {
146 re->set_data(*cached);
147 return re;
148 }
149 pattern = String::Flatten(pattern);
150 PostponeInterruptsScope postpone(isolate);
151 RegExpCompileData parse_result;
152 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100153 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
154 &parse_result)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000155 // Throw an exception if we fail to parse the pattern.
156 return ThrowRegExpException(re, pattern, parse_result.error);
157 }
158
159 bool has_been_compiled = false;
160
161 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
162 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
163 // Parse-tree is a single atom that is equal to the pattern.
164 AtomCompile(re, pattern, flags, pattern);
165 has_been_compiled = true;
166 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
167 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
168 RegExpAtom* atom = parse_result.tree->AsAtom();
169 Vector<const uc16> atom_pattern = atom->data();
170 Handle<String> atom_string;
171 ASSIGN_RETURN_ON_EXCEPTION(
172 isolate, atom_string,
173 isolate->factory()->NewStringFromTwoByte(atom_pattern),
174 Object);
175 if (!HasFewDifferentCharacters(atom_string)) {
176 AtomCompile(re, pattern, flags, atom_string);
177 has_been_compiled = true;
178 }
179 }
180 if (!has_been_compiled) {
181 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
182 }
183 DCHECK(re->data()->IsFixedArray());
184 // Compilation succeeded so the data is set on the regexp
185 // and we can store it in the cache.
186 Handle<FixedArray> data(FixedArray::cast(re->data()));
187 compilation_cache->PutRegExp(pattern, flags, data);
188
189 return re;
190}
191
192
193MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
194 Handle<String> subject,
195 int index,
196 Handle<JSArray> last_match_info) {
197 switch (regexp->TypeTag()) {
198 case JSRegExp::ATOM:
199 return AtomExec(regexp, subject, index, last_match_info);
200 case JSRegExp::IRREGEXP: {
201 return IrregexpExec(regexp, subject, index, last_match_info);
202 }
203 default:
204 UNREACHABLE();
205 return MaybeHandle<Object>();
206 }
207}
208
209
210// RegExp Atom implementation: Simple string search using indexOf.
211
212
213void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
214 Handle<String> pattern,
215 JSRegExp::Flags flags,
216 Handle<String> match_pattern) {
217 re->GetIsolate()->factory()->SetRegExpAtomData(re,
218 JSRegExp::ATOM,
219 pattern,
220 flags,
221 match_pattern);
222}
223
224
225static void SetAtomLastCapture(FixedArray* array,
226 String* subject,
227 int from,
228 int to) {
229 SealHandleScope shs(array->GetIsolate());
230 RegExpImpl::SetLastCaptureCount(array, 2);
231 RegExpImpl::SetLastSubject(array, subject);
232 RegExpImpl::SetLastInput(array, subject);
233 RegExpImpl::SetCapture(array, 0, from);
234 RegExpImpl::SetCapture(array, 1, to);
235}
236
237
238int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
239 Handle<String> subject,
240 int index,
241 int32_t* output,
242 int output_size) {
243 Isolate* isolate = regexp->GetIsolate();
244
245 DCHECK(0 <= index);
246 DCHECK(index <= subject->length());
247
248 subject = String::Flatten(subject);
249 DisallowHeapAllocation no_gc; // ensure vectors stay valid
250
251 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
252 int needle_len = needle->length();
253 DCHECK(needle->IsFlat());
254 DCHECK_LT(0, needle_len);
255
256 if (index + needle_len > subject->length()) {
257 return RegExpImpl::RE_FAILURE;
258 }
259
260 for (int i = 0; i < output_size; i += 2) {
261 String::FlatContent needle_content = needle->GetFlatContent();
262 String::FlatContent subject_content = subject->GetFlatContent();
263 DCHECK(needle_content.IsFlat());
264 DCHECK(subject_content.IsFlat());
265 // dispatch on type of strings
266 index =
267 (needle_content.IsOneByte()
268 ? (subject_content.IsOneByte()
269 ? SearchString(isolate, subject_content.ToOneByteVector(),
270 needle_content.ToOneByteVector(), index)
271 : SearchString(isolate, subject_content.ToUC16Vector(),
272 needle_content.ToOneByteVector(), index))
273 : (subject_content.IsOneByte()
274 ? SearchString(isolate, subject_content.ToOneByteVector(),
275 needle_content.ToUC16Vector(), index)
276 : SearchString(isolate, subject_content.ToUC16Vector(),
277 needle_content.ToUC16Vector(), index)));
278 if (index == -1) {
279 return i / 2; // Return number of matches.
280 } else {
281 output[i] = index;
282 output[i+1] = index + needle_len;
283 index += needle_len;
284 }
285 }
286 return output_size / 2;
287}
288
289
290Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
291 Handle<String> subject,
292 int index,
293 Handle<JSArray> last_match_info) {
294 Isolate* isolate = re->GetIsolate();
295
296 static const int kNumRegisters = 2;
297 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
298 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
299
300 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
301
302 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
303
304 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
305 SealHandleScope shs(isolate);
306 FixedArray* array = FixedArray::cast(last_match_info->elements());
307 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
308 return last_match_info;
309}
310
311
312// Irregexp implementation.
313
314// Ensures that the regexp object contains a compiled version of the
315// source for either one-byte or two-byte subject strings.
316// If the compiled version doesn't already exist, it is compiled
317// from the source pattern.
318// If compilation fails, an exception is thrown and this function
319// returns false.
320bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
321 Handle<String> sample_subject,
322 bool is_one_byte) {
323 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
324#ifdef V8_INTERPRETED_REGEXP
325 if (compiled_code->IsByteArray()) return true;
326#else // V8_INTERPRETED_REGEXP (RegExp native code)
327 if (compiled_code->IsCode()) return true;
328#endif
329 // We could potentially have marked this as flushable, but have kept
330 // a saved version if we did not flush it yet.
331 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
332 if (saved_code->IsCode()) {
333 // Reinstate the code in the original place.
334 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
335 DCHECK(compiled_code->IsSmi());
336 return true;
337 }
338 return CompileIrregexp(re, sample_subject, is_one_byte);
339}
340
341
342bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
343 Handle<String> sample_subject,
344 bool is_one_byte) {
345 // Compile the RegExp.
346 Isolate* isolate = re->GetIsolate();
347 Zone zone;
348 PostponeInterruptsScope postpone(isolate);
349 // If we had a compilation error the last time this is saved at the
350 // saved code index.
351 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
352 // When arriving here entry can only be a smi, either representing an
353 // uncompiled regexp, a previous compilation error, or code that has
354 // been flushed.
355 DCHECK(entry->IsSmi());
356 int entry_value = Smi::cast(entry)->value();
357 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
358 entry_value == JSRegExp::kCompilationErrorValue ||
359 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
360
361 if (entry_value == JSRegExp::kCompilationErrorValue) {
362 // A previous compilation failed and threw an error which we store in
363 // the saved code index (we store the error message, not the actual
364 // error). Recreate the error object and throw it.
365 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
366 DCHECK(error_string->IsString());
367 Handle<String> error_message(String::cast(error_string));
368 ThrowRegExpException(re, error_message);
369 return false;
370 }
371
372 JSRegExp::Flags flags = re->GetFlags();
373
374 Handle<String> pattern(re->Pattern());
375 pattern = String::Flatten(pattern);
376 RegExpCompileData compile_data;
377 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100378 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
379 &compile_data)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000380 // Throw an exception if we fail to parse the pattern.
381 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
382 USE(ThrowRegExpException(re, pattern, compile_data.error));
383 return false;
384 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100385 RegExpEngine::CompilationResult result =
386 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
387 sample_subject, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000388 if (result.error_message != NULL) {
389 // Unable to compile regexp.
390 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
391 CStrVector(result.error_message)).ToHandleChecked();
392 ThrowRegExpException(re, error_message);
393 return false;
394 }
395
396 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
397 data->set(JSRegExp::code_index(is_one_byte), result.code);
398 int register_max = IrregexpMaxRegisterCount(*data);
399 if (result.num_registers > register_max) {
400 SetIrregexpMaxRegisterCount(*data, result.num_registers);
401 }
402
403 return true;
404}
405
406
407int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
408 return Smi::cast(
409 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
410}
411
412
413void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
414 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
415}
416
417
418int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
419 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
420}
421
422
423int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
424 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
425}
426
427
428ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
429 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
430}
431
432
433Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
434 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
435}
436
437
438void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
439 Handle<String> pattern,
440 JSRegExp::Flags flags,
441 int capture_count) {
442 // Initialize compiled code entries to null.
443 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
444 JSRegExp::IRREGEXP,
445 pattern,
446 flags,
447 capture_count);
448}
449
450
451int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
452 Handle<String> subject) {
453 subject = String::Flatten(subject);
454
455 // Check representation of the underlying storage.
456 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
457 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
458
459#ifdef V8_INTERPRETED_REGEXP
460 // Byte-code regexp needs space allocated for all its registers.
461 // The result captures are copied to the start of the registers array
462 // if the match succeeds. This way those registers are not clobbered
463 // when we set the last match info from last successful match.
464 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
465 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
466#else // V8_INTERPRETED_REGEXP
467 // Native regexp only needs room to output captures. Registers are handled
468 // internally.
469 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
470#endif // V8_INTERPRETED_REGEXP
471}
472
473
474int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
475 Handle<String> subject,
476 int index,
477 int32_t* output,
478 int output_size) {
479 Isolate* isolate = regexp->GetIsolate();
480
481 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
482
483 DCHECK(index >= 0);
484 DCHECK(index <= subject->length());
485 DCHECK(subject->IsFlat());
486
487 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
488
489#ifndef V8_INTERPRETED_REGEXP
490 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
491 do {
492 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
493 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
494 // The stack is used to allocate registers for the compiled regexp code.
495 // This means that in case of failure, the output registers array is left
496 // untouched and contains the capture results from the previous successful
497 // match. We can use that to set the last match info lazily.
498 NativeRegExpMacroAssembler::Result res =
499 NativeRegExpMacroAssembler::Match(code,
500 subject,
501 output,
502 output_size,
503 index,
504 isolate);
505 if (res != NativeRegExpMacroAssembler::RETRY) {
506 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
507 isolate->has_pending_exception());
508 STATIC_ASSERT(
509 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
510 STATIC_ASSERT(
511 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
512 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
513 == RE_EXCEPTION);
514 return static_cast<IrregexpResult>(res);
515 }
516 // If result is RETRY, the string has changed representation, and we
517 // must restart from scratch.
518 // In this case, it means we must make sure we are prepared to handle
519 // the, potentially, different subject (the string can switch between
520 // being internal and external, and even between being Latin1 and UC16,
521 // but the characters are always the same).
522 IrregexpPrepare(regexp, subject);
523 is_one_byte = subject->IsOneByteRepresentationUnderneath();
524 } while (true);
525 UNREACHABLE();
526 return RE_EXCEPTION;
527#else // V8_INTERPRETED_REGEXP
528
529 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
530 // We must have done EnsureCompiledIrregexp, so we can get the number of
531 // registers.
532 int number_of_capture_registers =
533 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
534 int32_t* raw_output = &output[number_of_capture_registers];
535 // We do not touch the actual capture result registers until we know there
536 // has been a match so that we can use those capture results to set the
537 // last match info.
538 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
539 raw_output[i] = -1;
540 }
541 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
542 isolate);
543
544 IrregexpResult result = IrregexpInterpreter::Match(isolate,
545 byte_codes,
546 subject,
547 raw_output,
548 index);
549 if (result == RE_SUCCESS) {
550 // Copy capture results to the start of the registers array.
551 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
552 }
553 if (result == RE_EXCEPTION) {
554 DCHECK(!isolate->has_pending_exception());
555 isolate->StackOverflow();
556 }
557 return result;
558#endif // V8_INTERPRETED_REGEXP
559}
560
561
562MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
563 Handle<String> subject,
564 int previous_index,
565 Handle<JSArray> last_match_info) {
566 Isolate* isolate = regexp->GetIsolate();
567 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
568
569 // Prepare space for the return values.
570#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
571 if (FLAG_trace_regexp_bytecodes) {
572 String* pattern = regexp->Pattern();
573 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
574 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
575 }
576#endif
577 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
578 if (required_registers < 0) {
579 // Compiling failed with an exception.
580 DCHECK(isolate->has_pending_exception());
581 return MaybeHandle<Object>();
582 }
583
584 int32_t* output_registers = NULL;
585 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
586 output_registers = NewArray<int32_t>(required_registers);
587 }
588 base::SmartArrayPointer<int32_t> auto_release(output_registers);
589 if (output_registers == NULL) {
590 output_registers = isolate->jsregexp_static_offsets_vector();
591 }
592
593 int res = RegExpImpl::IrregexpExecRaw(
594 regexp, subject, previous_index, output_registers, required_registers);
595 if (res == RE_SUCCESS) {
596 int capture_count =
597 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
598 return SetLastMatchInfo(
599 last_match_info, subject, capture_count, output_registers);
600 }
601 if (res == RE_EXCEPTION) {
602 DCHECK(isolate->has_pending_exception());
603 return MaybeHandle<Object>();
604 }
605 DCHECK(res == RE_FAILURE);
606 return isolate->factory()->null_value();
607}
608
609
610static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
611 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
612 JSArray::SetLength(array, minimum_size);
613 }
614}
615
616
617Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
618 Handle<String> subject,
619 int capture_count,
620 int32_t* match) {
621 DCHECK(last_match_info->HasFastObjectElements());
622 int capture_register_count = (capture_count + 1) * 2;
623 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
624 DisallowHeapAllocation no_allocation;
625 FixedArray* array = FixedArray::cast(last_match_info->elements());
626 if (match != NULL) {
627 for (int i = 0; i < capture_register_count; i += 2) {
628 SetCapture(array, i, match[i]);
629 SetCapture(array, i + 1, match[i + 1]);
630 }
631 }
632 SetLastCaptureCount(array, capture_register_count);
633 SetLastSubject(array, *subject);
634 SetLastInput(array, *subject);
635 return last_match_info;
636}
637
638
639RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
640 Handle<String> subject,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000641 Isolate* isolate)
642 : register_array_(NULL),
643 register_array_size_(0),
644 regexp_(regexp),
645 subject_(subject) {
646#ifdef V8_INTERPRETED_REGEXP
647 bool interpreted = true;
648#else
649 bool interpreted = false;
650#endif // V8_INTERPRETED_REGEXP
651
652 if (regexp_->TypeTag() == JSRegExp::ATOM) {
653 static const int kAtomRegistersPerMatch = 2;
654 registers_per_match_ = kAtomRegistersPerMatch;
655 // There is no distinction between interpreted and native for atom regexps.
656 interpreted = false;
657 } else {
658 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
659 if (registers_per_match_ < 0) {
660 num_matches_ = -1; // Signal exception.
661 return;
662 }
663 }
664
Ben Murdoch097c5b22016-05-18 11:27:45 +0100665 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
666 if (!interpreted) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000667 register_array_size_ =
668 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
669 max_matches_ = register_array_size_ / registers_per_match_;
670 } else {
671 // Global loop in interpreted regexp is not implemented. We choose
672 // the size of the offsets vector so that it can only store one match.
673 register_array_size_ = registers_per_match_;
674 max_matches_ = 1;
675 }
676
677 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
678 register_array_ = NewArray<int32_t>(register_array_size_);
679 } else {
680 register_array_ = isolate->jsregexp_static_offsets_vector();
681 }
682
683 // Set state so that fetching the results the first time triggers a call
684 // to the compiled regexp.
685 current_match_index_ = max_matches_ - 1;
686 num_matches_ = max_matches_;
687 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
688 DCHECK_GE(register_array_size_, registers_per_match_);
689 int32_t* last_match =
690 &register_array_[current_match_index_ * registers_per_match_];
691 last_match[0] = -1;
692 last_match[1] = 0;
693}
694
Ben Murdoch097c5b22016-05-18 11:27:45 +0100695int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
696 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
697 last_index + 1 < subject_->length() &&
698 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
699 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
700 // Advance over the surrogate pair.
701 return last_index + 2;
702 }
703 return last_index + 1;
704}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000705
706// -------------------------------------------------------------------
707// Implementation of the Irregexp regular expression engine.
708//
709// The Irregexp regular expression engine is intended to be a complete
710// implementation of ECMAScript regular expressions. It generates either
711// bytecodes or native code.
712
713// The Irregexp regexp engine is structured in three steps.
714// 1) The parser generates an abstract syntax tree. See ast.cc.
715// 2) From the AST a node network is created. The nodes are all
716// subclasses of RegExpNode. The nodes represent states when
717// executing a regular expression. Several optimizations are
718// performed on the node network.
719// 3) From the nodes we generate either byte codes or native code
720// that can actually execute the regular expression (perform
721// the search). The code generation step is described in more
722// detail below.
723
724// Code generation.
725//
726// The nodes are divided into four main categories.
727// * Choice nodes
728// These represent places where the regular expression can
729// match in more than one way. For example on entry to an
730// alternation (foo|bar) or a repetition (*, +, ? or {}).
731// * Action nodes
732// These represent places where some action should be
733// performed. Examples include recording the current position
734// in the input string to a register (in order to implement
735// captures) or other actions on register for example in order
736// to implement the counters needed for {} repetitions.
737// * Matching nodes
738// These attempt to match some element part of the input string.
739// Examples of elements include character classes, plain strings
740// or back references.
741// * End nodes
742// These are used to implement the actions required on finding
743// a successful match or failing to find a match.
744//
745// The code generated (whether as byte codes or native code) maintains
746// some state as it runs. This consists of the following elements:
747//
748// * The capture registers. Used for string captures.
749// * Other registers. Used for counters etc.
750// * The current position.
751// * The stack of backtracking information. Used when a matching node
752// fails to find a match and needs to try an alternative.
753//
754// Conceptual regular expression execution model:
755//
756// There is a simple conceptual model of regular expression execution
757// which will be presented first. The actual code generated is a more
758// efficient simulation of the simple conceptual model:
759//
760// * Choice nodes are implemented as follows:
761// For each choice except the last {
762// push current position
763// push backtrack code location
764// <generate code to test for choice>
765// backtrack code location:
766// pop current position
767// }
768// <generate code to test for last choice>
769//
770// * Actions nodes are generated as follows
771// <push affected registers on backtrack stack>
772// <generate code to perform action>
773// push backtrack code location
774// <generate code to test for following nodes>
775// backtrack code location:
776// <pop affected registers to restore their state>
777// <pop backtrack location from stack and go to it>
778//
779// * Matching nodes are generated as follows:
780// if input string matches at current position
781// update current position
782// <generate code to test for following nodes>
783// else
784// <pop backtrack location from stack and go to it>
785//
786// Thus it can be seen that the current position is saved and restored
787// by the choice nodes, whereas the registers are saved and restored by
788// by the action nodes that manipulate them.
789//
790// The other interesting aspect of this model is that nodes are generated
791// at the point where they are needed by a recursive call to Emit(). If
792// the node has already been code generated then the Emit() call will
793// generate a jump to the previously generated code instead. In order to
794// limit recursion it is possible for the Emit() function to put the node
795// on a work list for later generation and instead generate a jump. The
796// destination of the jump is resolved later when the code is generated.
797//
798// Actual regular expression code generation.
799//
800// Code generation is actually more complicated than the above. In order
801// to improve the efficiency of the generated code some optimizations are
802// performed
803//
804// * Choice nodes have 1-character lookahead.
805// A choice node looks at the following character and eliminates some of
806// the choices immediately based on that character. This is not yet
807// implemented.
808// * Simple greedy loops store reduced backtracking information.
809// A quantifier like /.*foo/m will greedily match the whole input. It will
810// then need to backtrack to a point where it can match "foo". The naive
811// implementation of this would push each character position onto the
812// backtracking stack, then pop them off one by one. This would use space
813// proportional to the length of the input string. However since the "."
814// can only match in one way and always has a constant length (in this case
815// of 1) it suffices to store the current position on the top of the stack
816// once. Matching now becomes merely incrementing the current position and
817// backtracking becomes decrementing the current position and checking the
818// result against the stored current position. This is faster and saves
819// space.
820// * The current state is virtualized.
821// This is used to defer expensive operations until it is clear that they
822// are needed and to generate code for a node more than once, allowing
823// specialized an efficient versions of the code to be created. This is
824// explained in the section below.
825//
826// Execution state virtualization.
827//
828// Instead of emitting code, nodes that manipulate the state can record their
829// manipulation in an object called the Trace. The Trace object can record a
830// current position offset, an optional backtrack code location on the top of
831// the virtualized backtrack stack and some register changes. When a node is
832// to be emitted it can flush the Trace or update it. Flushing the Trace
833// will emit code to bring the actual state into line with the virtual state.
834// Avoiding flushing the state can postpone some work (e.g. updates of capture
835// registers). Postponing work can save time when executing the regular
836// expression since it may be found that the work never has to be done as a
837// failure to match can occur. In addition it is much faster to jump to a
838// known backtrack code location than it is to pop an unknown backtrack
839// location from the stack and jump there.
840//
841// The virtual state found in the Trace affects code generation. For example
842// the virtual state contains the difference between the actual current
843// position and the virtual current position, and matching code needs to use
844// this offset to attempt a match in the correct location of the input
845// string. Therefore code generated for a non-trivial trace is specialized
846// to that trace. The code generator therefore has the ability to generate
847// code for each node several times. In order to limit the size of the
848// generated code there is an arbitrary limit on how many specialized sets of
849// code may be generated for a given node. If the limit is reached, the
850// trace is flushed and a generic version of the code for a node is emitted.
851// This is subsequently used for that node. The code emitted for non-generic
852// trace is not recorded in the node and so it cannot currently be reused in
853// the event that code generation is requested for an identical trace.
854
855
856void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
857 UNREACHABLE();
858}
859
860
861void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
862 text->AddElement(TextElement::Atom(this), zone);
863}
864
865
866void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
867 text->AddElement(TextElement::CharClass(this), zone);
868}
869
870
871void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
872 for (int i = 0; i < elements()->length(); i++)
873 text->AddElement(elements()->at(i), zone);
874}
875
876
877TextElement TextElement::Atom(RegExpAtom* atom) {
878 return TextElement(ATOM, atom);
879}
880
881
882TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
883 return TextElement(CHAR_CLASS, char_class);
884}
885
886
887int TextElement::length() const {
888 switch (text_type()) {
889 case ATOM:
890 return atom()->length();
891
892 case CHAR_CLASS:
893 return 1;
894 }
895 UNREACHABLE();
896 return 0;
897}
898
899
900DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
901 if (table_ == NULL) {
902 table_ = new(zone()) DispatchTable(zone());
903 DispatchTableConstructor cons(table_, ignore_case, zone());
904 cons.BuildTable(this);
905 }
906 return table_;
907}
908
909
910class FrequencyCollator {
911 public:
912 FrequencyCollator() : total_samples_(0) {
913 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
914 frequencies_[i] = CharacterFrequency(i);
915 }
916 }
917
918 void CountCharacter(int character) {
919 int index = (character & RegExpMacroAssembler::kTableMask);
920 frequencies_[index].Increment();
921 total_samples_++;
922 }
923
924 // Does not measure in percent, but rather per-128 (the table size from the
925 // regexp macro assembler).
926 int Frequency(int in_character) {
927 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
928 if (total_samples_ < 1) return 1; // Division by zero.
929 int freq_in_per128 =
930 (frequencies_[in_character].counter() * 128) / total_samples_;
931 return freq_in_per128;
932 }
933
934 private:
935 class CharacterFrequency {
936 public:
937 CharacterFrequency() : counter_(0), character_(-1) { }
938 explicit CharacterFrequency(int character)
939 : counter_(0), character_(character) { }
940
941 void Increment() { counter_++; }
942 int counter() { return counter_; }
943 int character() { return character_; }
944
945 private:
946 int counter_;
947 int character_;
948 };
949
950
951 private:
952 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
953 int total_samples_;
954};
955
956
957class RegExpCompiler {
958 public:
959 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100960 JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000961
962 int AllocateRegister() {
963 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
964 reg_exp_too_big_ = true;
965 return next_register_;
966 }
967 return next_register_++;
968 }
969
Ben Murdoch097c5b22016-05-18 11:27:45 +0100970 // Lookarounds to match lone surrogates for unicode character class matches
971 // are never nested. We can therefore reuse registers.
972 int UnicodeLookaroundStackRegister() {
973 if (unicode_lookaround_stack_register_ == kNoRegister) {
974 unicode_lookaround_stack_register_ = AllocateRegister();
975 }
976 return unicode_lookaround_stack_register_;
977 }
978
979 int UnicodeLookaroundPositionRegister() {
980 if (unicode_lookaround_position_register_ == kNoRegister) {
981 unicode_lookaround_position_register_ = AllocateRegister();
982 }
983 return unicode_lookaround_position_register_;
984 }
985
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000986 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
987 RegExpNode* start,
988 int capture_count,
989 Handle<String> pattern);
990
991 inline void AddWork(RegExpNode* node) {
992 if (!node->on_work_list() && !node->label()->is_bound()) {
993 node->set_on_work_list(true);
994 work_list_->Add(node);
995 }
996 }
997
998 static const int kImplementationOffset = 0;
999 static const int kNumberOfRegistersOffset = 0;
1000 static const int kCodeOffset = 1;
1001
1002 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1003 EndNode* accept() { return accept_; }
1004
1005 static const int kMaxRecursion = 100;
1006 inline int recursion_depth() { return recursion_depth_; }
1007 inline void IncrementRecursionDepth() { recursion_depth_++; }
1008 inline void DecrementRecursionDepth() { recursion_depth_--; }
1009
1010 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1011
Ben Murdoch097c5b22016-05-18 11:27:45 +01001012 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1013 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001014 inline bool one_byte() { return one_byte_; }
1015 inline bool optimize() { return optimize_; }
1016 inline void set_optimize(bool value) { optimize_ = value; }
1017 inline bool limiting_recursion() { return limiting_recursion_; }
1018 inline void set_limiting_recursion(bool value) {
1019 limiting_recursion_ = value;
1020 }
1021 bool read_backward() { return read_backward_; }
1022 void set_read_backward(bool value) { read_backward_ = value; }
1023 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1024
1025 int current_expansion_factor() { return current_expansion_factor_; }
1026 void set_current_expansion_factor(int value) {
1027 current_expansion_factor_ = value;
1028 }
1029
1030 Isolate* isolate() const { return isolate_; }
1031 Zone* zone() const { return zone_; }
1032
1033 static const int kNoRegister = -1;
1034
1035 private:
1036 EndNode* accept_;
1037 int next_register_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001038 int unicode_lookaround_stack_register_;
1039 int unicode_lookaround_position_register_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001040 List<RegExpNode*>* work_list_;
1041 int recursion_depth_;
1042 RegExpMacroAssembler* macro_assembler_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001043 JSRegExp::Flags flags_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001044 bool one_byte_;
1045 bool reg_exp_too_big_;
1046 bool limiting_recursion_;
1047 bool optimize_;
1048 bool read_backward_;
1049 int current_expansion_factor_;
1050 FrequencyCollator frequency_collator_;
1051 Isolate* isolate_;
1052 Zone* zone_;
1053};
1054
1055
1056class RecursionCheck {
1057 public:
1058 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1059 compiler->IncrementRecursionDepth();
1060 }
1061 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1062 private:
1063 RegExpCompiler* compiler_;
1064};
1065
1066
1067static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1068 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1069}
1070
1071
1072// Attempts to compile the regexp using an Irregexp code generator. Returns
1073// a fixed array or a null handle depending on whether it succeeded.
1074RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001075 JSRegExp::Flags flags, bool one_byte)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001076 : next_register_(2 * (capture_count + 1)),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001077 unicode_lookaround_stack_register_(kNoRegister),
1078 unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001079 work_list_(NULL),
1080 recursion_depth_(0),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001081 flags_(flags),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001082 one_byte_(one_byte),
1083 reg_exp_too_big_(false),
1084 limiting_recursion_(false),
1085 optimize_(FLAG_regexp_optimization),
1086 read_backward_(false),
1087 current_expansion_factor_(1),
1088 frequency_collator_(),
1089 isolate_(isolate),
1090 zone_(zone) {
1091 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1092 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1093}
1094
1095
1096RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1097 RegExpMacroAssembler* macro_assembler,
1098 RegExpNode* start,
1099 int capture_count,
1100 Handle<String> pattern) {
1101 Heap* heap = pattern->GetHeap();
1102
1103#ifdef DEBUG
1104 if (FLAG_trace_regexp_assembler)
1105 macro_assembler_ =
1106 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1107 else
1108#endif
1109 macro_assembler_ = macro_assembler;
1110
1111 List <RegExpNode*> work_list(0);
1112 work_list_ = &work_list;
1113 Label fail;
1114 macro_assembler_->PushBacktrack(&fail);
1115 Trace new_trace;
1116 start->Emit(this, &new_trace);
1117 macro_assembler_->Bind(&fail);
1118 macro_assembler_->Fail();
1119 while (!work_list.is_empty()) {
1120 RegExpNode* node = work_list.RemoveLast();
1121 node->set_on_work_list(false);
1122 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1123 }
1124 if (reg_exp_too_big_) {
1125 macro_assembler_->AbortedCodeGeneration();
1126 return IrregexpRegExpTooBig(isolate_);
1127 }
1128
1129 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1130 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1131 work_list_ = NULL;
1132#ifdef ENABLE_DISASSEMBLER
1133 if (FLAG_print_code) {
1134 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1135 OFStream os(trace_scope.file());
1136 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1137 }
1138#endif
1139#ifdef DEBUG
1140 if (FLAG_trace_regexp_assembler) {
1141 delete macro_assembler_;
1142 }
1143#endif
1144 return RegExpEngine::CompilationResult(*code, next_register_);
1145}
1146
1147
1148bool Trace::DeferredAction::Mentions(int that) {
1149 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1150 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1151 return range.Contains(that);
1152 } else {
1153 return reg() == that;
1154 }
1155}
1156
1157
1158bool Trace::mentions_reg(int reg) {
1159 for (DeferredAction* action = actions_;
1160 action != NULL;
1161 action = action->next()) {
1162 if (action->Mentions(reg))
1163 return true;
1164 }
1165 return false;
1166}
1167
1168
1169bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1170 DCHECK_EQ(0, *cp_offset);
1171 for (DeferredAction* action = actions_;
1172 action != NULL;
1173 action = action->next()) {
1174 if (action->Mentions(reg)) {
1175 if (action->action_type() == ActionNode::STORE_POSITION) {
1176 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1177 return true;
1178 } else {
1179 return false;
1180 }
1181 }
1182 }
1183 return false;
1184}
1185
1186
1187int Trace::FindAffectedRegisters(OutSet* affected_registers,
1188 Zone* zone) {
1189 int max_register = RegExpCompiler::kNoRegister;
1190 for (DeferredAction* action = actions_;
1191 action != NULL;
1192 action = action->next()) {
1193 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1194 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1195 for (int i = range.from(); i <= range.to(); i++)
1196 affected_registers->Set(i, zone);
1197 if (range.to() > max_register) max_register = range.to();
1198 } else {
1199 affected_registers->Set(action->reg(), zone);
1200 if (action->reg() > max_register) max_register = action->reg();
1201 }
1202 }
1203 return max_register;
1204}
1205
1206
1207void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1208 int max_register,
1209 const OutSet& registers_to_pop,
1210 const OutSet& registers_to_clear) {
1211 for (int reg = max_register; reg >= 0; reg--) {
1212 if (registers_to_pop.Get(reg)) {
1213 assembler->PopRegister(reg);
1214 } else if (registers_to_clear.Get(reg)) {
1215 int clear_to = reg;
1216 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1217 reg--;
1218 }
1219 assembler->ClearRegisters(reg, clear_to);
1220 }
1221 }
1222}
1223
1224
1225void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1226 int max_register,
1227 const OutSet& affected_registers,
1228 OutSet* registers_to_pop,
1229 OutSet* registers_to_clear,
1230 Zone* zone) {
1231 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1232 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1233
1234 // Count pushes performed to force a stack limit check occasionally.
1235 int pushes = 0;
1236
1237 for (int reg = 0; reg <= max_register; reg++) {
1238 if (!affected_registers.Get(reg)) {
1239 continue;
1240 }
1241
1242 // The chronologically first deferred action in the trace
1243 // is used to infer the action needed to restore a register
1244 // to its previous state (or not, if it's safe to ignore it).
1245 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1246 DeferredActionUndoType undo_action = IGNORE;
1247
1248 int value = 0;
1249 bool absolute = false;
1250 bool clear = false;
1251 static const int kNoStore = kMinInt;
1252 int store_position = kNoStore;
1253 // This is a little tricky because we are scanning the actions in reverse
1254 // historical order (newest first).
1255 for (DeferredAction* action = actions_;
1256 action != NULL;
1257 action = action->next()) {
1258 if (action->Mentions(reg)) {
1259 switch (action->action_type()) {
1260 case ActionNode::SET_REGISTER: {
1261 Trace::DeferredSetRegister* psr =
1262 static_cast<Trace::DeferredSetRegister*>(action);
1263 if (!absolute) {
1264 value += psr->value();
1265 absolute = true;
1266 }
1267 // SET_REGISTER is currently only used for newly introduced loop
1268 // counters. They can have a significant previous value if they
1269 // occour in a loop. TODO(lrn): Propagate this information, so
1270 // we can set undo_action to IGNORE if we know there is no value to
1271 // restore.
1272 undo_action = RESTORE;
1273 DCHECK_EQ(store_position, kNoStore);
1274 DCHECK(!clear);
1275 break;
1276 }
1277 case ActionNode::INCREMENT_REGISTER:
1278 if (!absolute) {
1279 value++;
1280 }
1281 DCHECK_EQ(store_position, kNoStore);
1282 DCHECK(!clear);
1283 undo_action = RESTORE;
1284 break;
1285 case ActionNode::STORE_POSITION: {
1286 Trace::DeferredCapture* pc =
1287 static_cast<Trace::DeferredCapture*>(action);
1288 if (!clear && store_position == kNoStore) {
1289 store_position = pc->cp_offset();
1290 }
1291
1292 // For captures we know that stores and clears alternate.
1293 // Other register, are never cleared, and if the occur
1294 // inside a loop, they might be assigned more than once.
1295 if (reg <= 1) {
1296 // Registers zero and one, aka "capture zero", is
1297 // always set correctly if we succeed. There is no
1298 // need to undo a setting on backtrack, because we
1299 // will set it again or fail.
1300 undo_action = IGNORE;
1301 } else {
1302 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1303 }
1304 DCHECK(!absolute);
1305 DCHECK_EQ(value, 0);
1306 break;
1307 }
1308 case ActionNode::CLEAR_CAPTURES: {
1309 // Since we're scanning in reverse order, if we've already
1310 // set the position we have to ignore historically earlier
1311 // clearing operations.
1312 if (store_position == kNoStore) {
1313 clear = true;
1314 }
1315 undo_action = RESTORE;
1316 DCHECK(!absolute);
1317 DCHECK_EQ(value, 0);
1318 break;
1319 }
1320 default:
1321 UNREACHABLE();
1322 break;
1323 }
1324 }
1325 }
1326 // Prepare for the undo-action (e.g., push if it's going to be popped).
1327 if (undo_action == RESTORE) {
1328 pushes++;
1329 RegExpMacroAssembler::StackCheckFlag stack_check =
1330 RegExpMacroAssembler::kNoStackLimitCheck;
1331 if (pushes == push_limit) {
1332 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1333 pushes = 0;
1334 }
1335
1336 assembler->PushRegister(reg, stack_check);
1337 registers_to_pop->Set(reg, zone);
1338 } else if (undo_action == CLEAR) {
1339 registers_to_clear->Set(reg, zone);
1340 }
1341 // Perform the chronologically last action (or accumulated increment)
1342 // for the register.
1343 if (store_position != kNoStore) {
1344 assembler->WriteCurrentPositionToRegister(reg, store_position);
1345 } else if (clear) {
1346 assembler->ClearRegisters(reg, reg);
1347 } else if (absolute) {
1348 assembler->SetRegister(reg, value);
1349 } else if (value != 0) {
1350 assembler->AdvanceRegister(reg, value);
1351 }
1352 }
1353}
1354
1355
1356// This is called as we come into a loop choice node and some other tricky
1357// nodes. It normalizes the state of the code generator to ensure we can
1358// generate generic code.
1359void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1360 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1361
1362 DCHECK(!is_trivial());
1363
1364 if (actions_ == NULL && backtrack() == NULL) {
1365 // Here we just have some deferred cp advances to fix and we are back to
1366 // a normal situation. We may also have to forget some information gained
1367 // through a quick check that was already performed.
1368 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1369 // Create a new trivial state and generate the node with that.
1370 Trace new_state;
1371 successor->Emit(compiler, &new_state);
1372 return;
1373 }
1374
1375 // Generate deferred actions here along with code to undo them again.
1376 OutSet affected_registers;
1377
1378 if (backtrack() != NULL) {
1379 // Here we have a concrete backtrack location. These are set up by choice
1380 // nodes and so they indicate that we have a deferred save of the current
1381 // position which we may need to emit here.
1382 assembler->PushCurrentPosition();
1383 }
1384
1385 int max_register = FindAffectedRegisters(&affected_registers,
1386 compiler->zone());
1387 OutSet registers_to_pop;
1388 OutSet registers_to_clear;
1389 PerformDeferredActions(assembler,
1390 max_register,
1391 affected_registers,
1392 &registers_to_pop,
1393 &registers_to_clear,
1394 compiler->zone());
1395 if (cp_offset_ != 0) {
1396 assembler->AdvanceCurrentPosition(cp_offset_);
1397 }
1398
1399 // Create a new trivial state and generate the node with that.
1400 Label undo;
1401 assembler->PushBacktrack(&undo);
1402 if (successor->KeepRecursing(compiler)) {
1403 Trace new_state;
1404 successor->Emit(compiler, &new_state);
1405 } else {
1406 compiler->AddWork(successor);
1407 assembler->GoTo(successor->label());
1408 }
1409
1410 // On backtrack we need to restore state.
1411 assembler->Bind(&undo);
1412 RestoreAffectedRegisters(assembler,
1413 max_register,
1414 registers_to_pop,
1415 registers_to_clear);
1416 if (backtrack() == NULL) {
1417 assembler->Backtrack();
1418 } else {
1419 assembler->PopCurrentPosition();
1420 assembler->GoTo(backtrack());
1421 }
1422}
1423
1424
1425void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1426 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1427
1428 // Omit flushing the trace. We discard the entire stack frame anyway.
1429
1430 if (!label()->is_bound()) {
1431 // We are completely independent of the trace, since we ignore it,
1432 // so this code can be used as the generic version.
1433 assembler->Bind(label());
1434 }
1435
1436 // Throw away everything on the backtrack stack since the start
1437 // of the negative submatch and restore the character position.
1438 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1439 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1440 if (clear_capture_count_ > 0) {
1441 // Clear any captures that might have been performed during the success
1442 // of the body of the negative look-ahead.
1443 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1444 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1445 }
1446 // Now that we have unwound the stack we find at the top of the stack the
1447 // backtrack that the BeginSubmatch node got.
1448 assembler->Backtrack();
1449}
1450
1451
1452void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1453 if (!trace->is_trivial()) {
1454 trace->Flush(compiler, this);
1455 return;
1456 }
1457 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1458 if (!label()->is_bound()) {
1459 assembler->Bind(label());
1460 }
1461 switch (action_) {
1462 case ACCEPT:
1463 assembler->Succeed();
1464 return;
1465 case BACKTRACK:
1466 assembler->GoTo(trace->backtrack());
1467 return;
1468 case NEGATIVE_SUBMATCH_SUCCESS:
1469 // This case is handled in a different virtual method.
1470 UNREACHABLE();
1471 }
1472 UNIMPLEMENTED();
1473}
1474
1475
1476void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1477 if (guards_ == NULL)
1478 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1479 guards_->Add(guard, zone);
1480}
1481
1482
1483ActionNode* ActionNode::SetRegister(int reg,
1484 int val,
1485 RegExpNode* on_success) {
1486 ActionNode* result =
1487 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1488 result->data_.u_store_register.reg = reg;
1489 result->data_.u_store_register.value = val;
1490 return result;
1491}
1492
1493
1494ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1495 ActionNode* result =
1496 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1497 result->data_.u_increment_register.reg = reg;
1498 return result;
1499}
1500
1501
1502ActionNode* ActionNode::StorePosition(int reg,
1503 bool is_capture,
1504 RegExpNode* on_success) {
1505 ActionNode* result =
1506 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1507 result->data_.u_position_register.reg = reg;
1508 result->data_.u_position_register.is_capture = is_capture;
1509 return result;
1510}
1511
1512
1513ActionNode* ActionNode::ClearCaptures(Interval range,
1514 RegExpNode* on_success) {
1515 ActionNode* result =
1516 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1517 result->data_.u_clear_captures.range_from = range.from();
1518 result->data_.u_clear_captures.range_to = range.to();
1519 return result;
1520}
1521
1522
1523ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1524 int position_reg,
1525 RegExpNode* on_success) {
1526 ActionNode* result =
1527 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1528 result->data_.u_submatch.stack_pointer_register = stack_reg;
1529 result->data_.u_submatch.current_position_register = position_reg;
1530 return result;
1531}
1532
1533
1534ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1535 int position_reg,
1536 int clear_register_count,
1537 int clear_register_from,
1538 RegExpNode* on_success) {
1539 ActionNode* result =
1540 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1541 result->data_.u_submatch.stack_pointer_register = stack_reg;
1542 result->data_.u_submatch.current_position_register = position_reg;
1543 result->data_.u_submatch.clear_register_count = clear_register_count;
1544 result->data_.u_submatch.clear_register_from = clear_register_from;
1545 return result;
1546}
1547
1548
1549ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1550 int repetition_register,
1551 int repetition_limit,
1552 RegExpNode* on_success) {
1553 ActionNode* result =
1554 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1555 result->data_.u_empty_match_check.start_register = start_register;
1556 result->data_.u_empty_match_check.repetition_register = repetition_register;
1557 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1558 return result;
1559}
1560
1561
1562#define DEFINE_ACCEPT(Type) \
1563 void Type##Node::Accept(NodeVisitor* visitor) { \
1564 visitor->Visit##Type(this); \
1565 }
1566FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1567#undef DEFINE_ACCEPT
1568
1569
1570void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1571 visitor->VisitLoopChoice(this);
1572}
1573
1574
1575// -------------------------------------------------------------------
1576// Emit code.
1577
1578
1579void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1580 Guard* guard,
1581 Trace* trace) {
1582 switch (guard->op()) {
1583 case Guard::LT:
1584 DCHECK(!trace->mentions_reg(guard->reg()));
1585 macro_assembler->IfRegisterGE(guard->reg(),
1586 guard->value(),
1587 trace->backtrack());
1588 break;
1589 case Guard::GEQ:
1590 DCHECK(!trace->mentions_reg(guard->reg()));
1591 macro_assembler->IfRegisterLT(guard->reg(),
1592 guard->value(),
1593 trace->backtrack());
1594 break;
1595 }
1596}
1597
1598
1599// Returns the number of characters in the equivalence class, omitting those
1600// that cannot occur in the source string because it is Latin1.
1601static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1602 bool one_byte_subject,
1603 unibrow::uchar* letters) {
1604 int length =
1605 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1606 // Unibrow returns 0 or 1 for characters where case independence is
1607 // trivial.
1608 if (length == 0) {
1609 letters[0] = character;
1610 length = 1;
1611 }
1612
1613 if (one_byte_subject) {
1614 int new_length = 0;
1615 for (int i = 0; i < length; i++) {
1616 if (letters[i] <= String::kMaxOneByteCharCode) {
1617 letters[new_length++] = letters[i];
1618 }
1619 }
1620 length = new_length;
1621 }
1622
1623 return length;
1624}
1625
1626
1627static inline bool EmitSimpleCharacter(Isolate* isolate,
1628 RegExpCompiler* compiler,
1629 uc16 c,
1630 Label* on_failure,
1631 int cp_offset,
1632 bool check,
1633 bool preloaded) {
1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1635 bool bound_checked = false;
1636 if (!preloaded) {
1637 assembler->LoadCurrentCharacter(
1638 cp_offset,
1639 on_failure,
1640 check);
1641 bound_checked = true;
1642 }
1643 assembler->CheckNotCharacter(c, on_failure);
1644 return bound_checked;
1645}
1646
1647
1648// Only emits non-letters (things that don't have case). Only used for case
1649// independent matches.
1650static inline bool EmitAtomNonLetter(Isolate* isolate,
1651 RegExpCompiler* compiler,
1652 uc16 c,
1653 Label* on_failure,
1654 int cp_offset,
1655 bool check,
1656 bool preloaded) {
1657 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1658 bool one_byte = compiler->one_byte();
1659 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1660 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1661 if (length < 1) {
1662 // This can't match. Must be an one-byte subject and a non-one-byte
1663 // character. We do not need to do anything since the one-byte pass
1664 // already handled this.
1665 return false; // Bounds not checked.
1666 }
1667 bool checked = false;
1668 // We handle the length > 1 case in a later pass.
1669 if (length == 1) {
1670 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1671 // Can't match - see above.
1672 return false; // Bounds not checked.
1673 }
1674 if (!preloaded) {
1675 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1676 checked = check;
1677 }
1678 macro_assembler->CheckNotCharacter(c, on_failure);
1679 }
1680 return checked;
1681}
1682
1683
1684static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1685 bool one_byte, uc16 c1, uc16 c2,
1686 Label* on_failure) {
1687 uc16 char_mask;
1688 if (one_byte) {
1689 char_mask = String::kMaxOneByteCharCode;
1690 } else {
1691 char_mask = String::kMaxUtf16CodeUnit;
1692 }
1693 uc16 exor = c1 ^ c2;
1694 // Check whether exor has only one bit set.
1695 if (((exor - 1) & exor) == 0) {
1696 // If c1 and c2 differ only by one bit.
1697 // Ecma262UnCanonicalize always gives the highest number last.
1698 DCHECK(c2 > c1);
1699 uc16 mask = char_mask ^ exor;
1700 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1701 return true;
1702 }
1703 DCHECK(c2 > c1);
1704 uc16 diff = c2 - c1;
1705 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1706 // If the characters differ by 2^n but don't differ by one bit then
1707 // subtract the difference from the found character, then do the or
1708 // trick. We avoid the theoretical case where negative numbers are
1709 // involved in order to simplify code generation.
1710 uc16 mask = char_mask ^ diff;
1711 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1712 diff,
1713 mask,
1714 on_failure);
1715 return true;
1716 }
1717 return false;
1718}
1719
1720
1721typedef bool EmitCharacterFunction(Isolate* isolate,
1722 RegExpCompiler* compiler,
1723 uc16 c,
1724 Label* on_failure,
1725 int cp_offset,
1726 bool check,
1727 bool preloaded);
1728
1729// Only emits letters (things that have case). Only used for case independent
1730// matches.
1731static inline bool EmitAtomLetter(Isolate* isolate,
1732 RegExpCompiler* compiler,
1733 uc16 c,
1734 Label* on_failure,
1735 int cp_offset,
1736 bool check,
1737 bool preloaded) {
1738 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1739 bool one_byte = compiler->one_byte();
1740 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1741 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1742 if (length <= 1) return false;
1743 // We may not need to check against the end of the input string
1744 // if this character lies before a character that matched.
1745 if (!preloaded) {
1746 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1747 }
1748 Label ok;
1749 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1750 switch (length) {
1751 case 2: {
1752 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1753 chars[1], on_failure)) {
1754 } else {
1755 macro_assembler->CheckCharacter(chars[0], &ok);
1756 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1757 macro_assembler->Bind(&ok);
1758 }
1759 break;
1760 }
1761 case 4:
1762 macro_assembler->CheckCharacter(chars[3], &ok);
1763 // Fall through!
1764 case 3:
1765 macro_assembler->CheckCharacter(chars[0], &ok);
1766 macro_assembler->CheckCharacter(chars[1], &ok);
1767 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1768 macro_assembler->Bind(&ok);
1769 break;
1770 default:
1771 UNREACHABLE();
1772 break;
1773 }
1774 return true;
1775}
1776
1777
1778static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1779 int border,
1780 Label* fall_through,
1781 Label* above_or_equal,
1782 Label* below) {
1783 if (below != fall_through) {
1784 masm->CheckCharacterLT(border, below);
1785 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1786 } else {
1787 masm->CheckCharacterGT(border - 1, above_or_equal);
1788 }
1789}
1790
1791
1792static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1793 int first,
1794 int last,
1795 Label* fall_through,
1796 Label* in_range,
1797 Label* out_of_range) {
1798 if (in_range == fall_through) {
1799 if (first == last) {
1800 masm->CheckNotCharacter(first, out_of_range);
1801 } else {
1802 masm->CheckCharacterNotInRange(first, last, out_of_range);
1803 }
1804 } else {
1805 if (first == last) {
1806 masm->CheckCharacter(first, in_range);
1807 } else {
1808 masm->CheckCharacterInRange(first, last, in_range);
1809 }
1810 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1811 }
1812}
1813
1814
1815// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1816// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1817static void EmitUseLookupTable(
1818 RegExpMacroAssembler* masm,
1819 ZoneList<int>* ranges,
1820 int start_index,
1821 int end_index,
1822 int min_char,
1823 Label* fall_through,
1824 Label* even_label,
1825 Label* odd_label) {
1826 static const int kSize = RegExpMacroAssembler::kTableSize;
1827 static const int kMask = RegExpMacroAssembler::kTableMask;
1828
1829 int base = (min_char & ~kMask);
1830 USE(base);
1831
1832 // Assert that everything is on one kTableSize page.
1833 for (int i = start_index; i <= end_index; i++) {
1834 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1835 }
1836 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1837
1838 char templ[kSize];
1839 Label* on_bit_set;
1840 Label* on_bit_clear;
1841 int bit;
1842 if (even_label == fall_through) {
1843 on_bit_set = odd_label;
1844 on_bit_clear = even_label;
1845 bit = 1;
1846 } else {
1847 on_bit_set = even_label;
1848 on_bit_clear = odd_label;
1849 bit = 0;
1850 }
1851 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1852 templ[i] = bit;
1853 }
1854 int j = 0;
1855 bit ^= 1;
1856 for (int i = start_index; i < end_index; i++) {
1857 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1858 templ[j] = bit;
1859 }
1860 bit ^= 1;
1861 }
1862 for (int i = j; i < kSize; i++) {
1863 templ[i] = bit;
1864 }
1865 Factory* factory = masm->isolate()->factory();
1866 // TODO(erikcorry): Cache these.
1867 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1868 for (int i = 0; i < kSize; i++) {
1869 ba->set(i, templ[i]);
1870 }
1871 masm->CheckBitInTable(ba, on_bit_set);
1872 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1873}
1874
1875
1876static void CutOutRange(RegExpMacroAssembler* masm,
1877 ZoneList<int>* ranges,
1878 int start_index,
1879 int end_index,
1880 int cut_index,
1881 Label* even_label,
1882 Label* odd_label) {
1883 bool odd = (((cut_index - start_index) & 1) == 1);
1884 Label* in_range_label = odd ? odd_label : even_label;
1885 Label dummy;
1886 EmitDoubleBoundaryTest(masm,
1887 ranges->at(cut_index),
1888 ranges->at(cut_index + 1) - 1,
1889 &dummy,
1890 in_range_label,
1891 &dummy);
1892 DCHECK(!dummy.is_linked());
1893 // Cut out the single range by rewriting the array. This creates a new
1894 // range that is a merger of the two ranges on either side of the one we
1895 // are cutting out. The oddity of the labels is preserved.
1896 for (int j = cut_index; j > start_index; j--) {
1897 ranges->at(j) = ranges->at(j - 1);
1898 }
1899 for (int j = cut_index + 1; j < end_index; j++) {
1900 ranges->at(j) = ranges->at(j + 1);
1901 }
1902}
1903
1904
1905// Unicode case. Split the search space into kSize spaces that are handled
1906// with recursion.
1907static void SplitSearchSpace(ZoneList<int>* ranges,
1908 int start_index,
1909 int end_index,
1910 int* new_start_index,
1911 int* new_end_index,
1912 int* border) {
1913 static const int kSize = RegExpMacroAssembler::kTableSize;
1914 static const int kMask = RegExpMacroAssembler::kTableMask;
1915
1916 int first = ranges->at(start_index);
1917 int last = ranges->at(end_index) - 1;
1918
1919 *new_start_index = start_index;
1920 *border = (ranges->at(start_index) & ~kMask) + kSize;
1921 while (*new_start_index < end_index) {
1922 if (ranges->at(*new_start_index) > *border) break;
1923 (*new_start_index)++;
1924 }
1925 // new_start_index is the index of the first edge that is beyond the
1926 // current kSize space.
1927
1928 // For very large search spaces we do a binary chop search of the non-Latin1
1929 // space instead of just going to the end of the current kSize space. The
1930 // heuristics are complicated a little by the fact that any 128-character
1931 // encoding space can be quickly tested with a table lookup, so we don't
1932 // wish to do binary chop search at a smaller granularity than that. A
1933 // 128-character space can take up a lot of space in the ranges array if,
1934 // for example, we only want to match every second character (eg. the lower
1935 // case characters on some Unicode pages).
1936 int binary_chop_index = (end_index + start_index) / 2;
1937 // The first test ensures that we get to the code that handles the Latin1
1938 // range with a single not-taken branch, speeding up this important
1939 // character range (even non-Latin1 charset-based text has spaces and
1940 // punctuation).
1941 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1942 end_index - start_index > (*new_start_index - start_index) * 2 &&
1943 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1944 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1945 int scan_forward_for_section_border = binary_chop_index;;
1946 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1947
1948 while (scan_forward_for_section_border < end_index) {
1949 if (ranges->at(scan_forward_for_section_border) > new_border) {
1950 *new_start_index = scan_forward_for_section_border;
1951 *border = new_border;
1952 break;
1953 }
1954 scan_forward_for_section_border++;
1955 }
1956 }
1957
1958 DCHECK(*new_start_index > start_index);
1959 *new_end_index = *new_start_index - 1;
1960 if (ranges->at(*new_end_index) == *border) {
1961 (*new_end_index)--;
1962 }
1963 if (*border >= ranges->at(end_index)) {
1964 *border = ranges->at(end_index);
1965 *new_start_index = end_index; // Won't be used.
1966 *new_end_index = end_index - 1;
1967 }
1968}
1969
1970
1971// Gets a series of segment boundaries representing a character class. If the
1972// character is in the range between an even and an odd boundary (counting from
1973// start_index) then go to even_label, otherwise go to odd_label. We already
1974// know that the character is in the range of min_char to max_char inclusive.
1975// Either label can be NULL indicating backtracking. Either label can also be
1976// equal to the fall_through label.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001977static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1978 int start_index, int end_index, uc32 min_char,
1979 uc32 max_char, Label* fall_through,
1980 Label* even_label, Label* odd_label) {
1981 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1982 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1983
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001984 int first = ranges->at(start_index);
1985 int last = ranges->at(end_index) - 1;
1986
1987 DCHECK_LT(min_char, first);
1988
1989 // Just need to test if the character is before or on-or-after
1990 // a particular character.
1991 if (start_index == end_index) {
1992 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1993 return;
1994 }
1995
1996 // Another almost trivial case: There is one interval in the middle that is
1997 // different from the end intervals.
1998 if (start_index + 1 == end_index) {
1999 EmitDoubleBoundaryTest(
2000 masm, first, last, fall_through, even_label, odd_label);
2001 return;
2002 }
2003
2004 // It's not worth using table lookup if there are very few intervals in the
2005 // character class.
2006 if (end_index - start_index <= 6) {
2007 // It is faster to test for individual characters, so we look for those
2008 // first, then try arbitrary ranges in the second round.
2009 static int kNoCutIndex = -1;
2010 int cut = kNoCutIndex;
2011 for (int i = start_index; i < end_index; i++) {
2012 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2013 cut = i;
2014 break;
2015 }
2016 }
2017 if (cut == kNoCutIndex) cut = start_index;
2018 CutOutRange(
2019 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2020 DCHECK_GE(end_index - start_index, 2);
2021 GenerateBranches(masm,
2022 ranges,
2023 start_index + 1,
2024 end_index - 1,
2025 min_char,
2026 max_char,
2027 fall_through,
2028 even_label,
2029 odd_label);
2030 return;
2031 }
2032
2033 // If there are a lot of intervals in the regexp, then we will use tables to
2034 // determine whether the character is inside or outside the character class.
2035 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2036
2037 if ((max_char >> kBits) == (min_char >> kBits)) {
2038 EmitUseLookupTable(masm,
2039 ranges,
2040 start_index,
2041 end_index,
2042 min_char,
2043 fall_through,
2044 even_label,
2045 odd_label);
2046 return;
2047 }
2048
2049 if ((min_char >> kBits) != (first >> kBits)) {
2050 masm->CheckCharacterLT(first, odd_label);
2051 GenerateBranches(masm,
2052 ranges,
2053 start_index + 1,
2054 end_index,
2055 first,
2056 max_char,
2057 fall_through,
2058 odd_label,
2059 even_label);
2060 return;
2061 }
2062
2063 int new_start_index = 0;
2064 int new_end_index = 0;
2065 int border = 0;
2066
2067 SplitSearchSpace(ranges,
2068 start_index,
2069 end_index,
2070 &new_start_index,
2071 &new_end_index,
2072 &border);
2073
2074 Label handle_rest;
2075 Label* above = &handle_rest;
2076 if (border == last + 1) {
2077 // We didn't find any section that started after the limit, so everything
2078 // above the border is one of the terminal labels.
2079 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2080 DCHECK(new_end_index == end_index - 1);
2081 }
2082
2083 DCHECK_LE(start_index, new_end_index);
2084 DCHECK_LE(new_start_index, end_index);
2085 DCHECK_LT(start_index, new_start_index);
2086 DCHECK_LT(new_end_index, end_index);
2087 DCHECK(new_end_index + 1 == new_start_index ||
2088 (new_end_index + 2 == new_start_index &&
2089 border == ranges->at(new_end_index + 1)));
2090 DCHECK_LT(min_char, border - 1);
2091 DCHECK_LT(border, max_char);
2092 DCHECK_LT(ranges->at(new_end_index), border);
2093 DCHECK(border < ranges->at(new_start_index) ||
2094 (border == ranges->at(new_start_index) &&
2095 new_start_index == end_index &&
2096 new_end_index == end_index - 1 &&
2097 border == last + 1));
2098 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2099
2100 masm->CheckCharacterGT(border - 1, above);
2101 Label dummy;
2102 GenerateBranches(masm,
2103 ranges,
2104 start_index,
2105 new_end_index,
2106 min_char,
2107 border - 1,
2108 &dummy,
2109 even_label,
2110 odd_label);
2111 if (handle_rest.is_linked()) {
2112 masm->Bind(&handle_rest);
2113 bool flip = (new_start_index & 1) != (start_index & 1);
2114 GenerateBranches(masm,
2115 ranges,
2116 new_start_index,
2117 end_index,
2118 border,
2119 max_char,
2120 &dummy,
2121 flip ? odd_label : even_label,
2122 flip ? even_label : odd_label);
2123 }
2124}
2125
2126
2127static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2128 RegExpCharacterClass* cc, bool one_byte,
2129 Label* on_failure, int cp_offset, bool check_offset,
2130 bool preloaded, Zone* zone) {
2131 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch097c5b22016-05-18 11:27:45 +01002132 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002133
2134 int max_char;
2135 if (one_byte) {
2136 max_char = String::kMaxOneByteCharCode;
2137 } else {
2138 max_char = String::kMaxUtf16CodeUnit;
2139 }
2140
2141 int range_count = ranges->length();
2142
2143 int last_valid_range = range_count - 1;
2144 while (last_valid_range >= 0) {
2145 CharacterRange& range = ranges->at(last_valid_range);
2146 if (range.from() <= max_char) {
2147 break;
2148 }
2149 last_valid_range--;
2150 }
2151
2152 if (last_valid_range < 0) {
2153 if (!cc->is_negated()) {
2154 macro_assembler->GoTo(on_failure);
2155 }
2156 if (check_offset) {
2157 macro_assembler->CheckPosition(cp_offset, on_failure);
2158 }
2159 return;
2160 }
2161
2162 if (last_valid_range == 0 &&
2163 ranges->at(0).IsEverything(max_char)) {
2164 if (cc->is_negated()) {
2165 macro_assembler->GoTo(on_failure);
2166 } else {
2167 // This is a common case hit by non-anchored expressions.
2168 if (check_offset) {
2169 macro_assembler->CheckPosition(cp_offset, on_failure);
2170 }
2171 }
2172 return;
2173 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002174
2175 if (!preloaded) {
2176 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2177 }
2178
2179 if (cc->is_standard(zone) &&
Ben Murdoch097c5b22016-05-18 11:27:45 +01002180 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2181 on_failure)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002182 return;
2183 }
2184
2185
2186 // A new list with ascending entries. Each entry is a code unit
2187 // where there is a boundary between code units that are part of
2188 // the class and code units that are not. Normally we insert an
2189 // entry at zero which goes to the failure label, but if there
2190 // was already one there we fall through for success on that entry.
2191 // Subsequent entries have alternating meaning (success/failure).
2192 ZoneList<int>* range_boundaries =
2193 new(zone) ZoneList<int>(last_valid_range, zone);
2194
2195 bool zeroth_entry_is_failure = !cc->is_negated();
2196
2197 for (int i = 0; i <= last_valid_range; i++) {
2198 CharacterRange& range = ranges->at(i);
2199 if (range.from() == 0) {
2200 DCHECK_EQ(i, 0);
2201 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2202 } else {
2203 range_boundaries->Add(range.from(), zone);
2204 }
2205 range_boundaries->Add(range.to() + 1, zone);
2206 }
2207 int end_index = range_boundaries->length() - 1;
2208 if (range_boundaries->at(end_index) > max_char) {
2209 end_index--;
2210 }
2211
2212 Label fall_through;
2213 GenerateBranches(macro_assembler,
2214 range_boundaries,
2215 0, // start_index.
2216 end_index,
2217 0, // min_char.
2218 max_char,
2219 &fall_through,
2220 zeroth_entry_is_failure ? &fall_through : on_failure,
2221 zeroth_entry_is_failure ? on_failure : &fall_through);
2222 macro_assembler->Bind(&fall_through);
2223}
2224
2225
2226RegExpNode::~RegExpNode() {
2227}
2228
2229
2230RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2231 Trace* trace) {
2232 // If we are generating a greedy loop then don't stop and don't reuse code.
2233 if (trace->stop_node() != NULL) {
2234 return CONTINUE;
2235 }
2236
2237 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2238 if (trace->is_trivial()) {
2239 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2240 // If a generic version is already scheduled to be generated or we have
2241 // recursed too deeply then just generate a jump to that code.
2242 macro_assembler->GoTo(&label_);
2243 // This will queue it up for generation of a generic version if it hasn't
2244 // already been queued.
2245 compiler->AddWork(this);
2246 return DONE;
2247 }
2248 // Generate generic version of the node and bind the label for later use.
2249 macro_assembler->Bind(&label_);
2250 return CONTINUE;
2251 }
2252
2253 // We are being asked to make a non-generic version. Keep track of how many
2254 // non-generic versions we generate so as not to overdo it.
2255 trace_count_++;
2256 if (KeepRecursing(compiler) && compiler->optimize() &&
2257 trace_count_ < kMaxCopiesCodeGenerated) {
2258 return CONTINUE;
2259 }
2260
2261 // If we get here code has been generated for this node too many times or
2262 // recursion is too deep. Time to switch to a generic version. The code for
2263 // generic versions above can handle deep recursion properly.
2264 bool was_limiting = compiler->limiting_recursion();
2265 compiler->set_limiting_recursion(true);
2266 trace->Flush(compiler, this);
2267 compiler->set_limiting_recursion(was_limiting);
2268 return DONE;
2269}
2270
2271
2272bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2273 return !compiler->limiting_recursion() &&
2274 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2275}
2276
2277
2278int ActionNode::EatsAtLeast(int still_to_find,
2279 int budget,
2280 bool not_at_start) {
2281 if (budget <= 0) return 0;
2282 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2283 return on_success()->EatsAtLeast(still_to_find,
2284 budget - 1,
2285 not_at_start);
2286}
2287
2288
2289void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2290 BoyerMooreLookahead* bm, bool not_at_start) {
2291 if (action_type_ == BEGIN_SUBMATCH) {
2292 bm->SetRest(offset);
2293 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2294 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2295 }
2296 SaveBMInfo(bm, not_at_start, offset);
2297}
2298
2299
2300int AssertionNode::EatsAtLeast(int still_to_find,
2301 int budget,
2302 bool not_at_start) {
2303 if (budget <= 0) return 0;
2304 // If we know we are not at the start and we are asked "how many characters
2305 // will you match if you succeed?" then we can answer anything since false
2306 // implies false. So lets just return the max answer (still_to_find) since
2307 // that won't prevent us from preloading a lot of characters for the other
2308 // branches in the node graph.
2309 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2310 return on_success()->EatsAtLeast(still_to_find,
2311 budget - 1,
2312 not_at_start);
2313}
2314
2315
2316void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2317 BoyerMooreLookahead* bm, bool not_at_start) {
2318 // Match the behaviour of EatsAtLeast on this node.
2319 if (assertion_type() == AT_START && not_at_start) return;
2320 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2321 SaveBMInfo(bm, not_at_start, offset);
2322}
2323
2324
2325int BackReferenceNode::EatsAtLeast(int still_to_find,
2326 int budget,
2327 bool not_at_start) {
2328 if (read_backward()) return 0;
2329 if (budget <= 0) return 0;
2330 return on_success()->EatsAtLeast(still_to_find,
2331 budget - 1,
2332 not_at_start);
2333}
2334
2335
2336int TextNode::EatsAtLeast(int still_to_find,
2337 int budget,
2338 bool not_at_start) {
2339 if (read_backward()) return 0;
2340 int answer = Length();
2341 if (answer >= still_to_find) return answer;
2342 if (budget <= 0) return answer;
2343 // We are not at start after this node so we set the last argument to 'true'.
2344 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2345 budget - 1,
2346 true);
2347}
2348
2349
2350int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2351 bool not_at_start) {
2352 if (budget <= 0) return 0;
2353 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2354 // afterwards.
2355 RegExpNode* node = alternatives_->at(1).node();
2356 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2357}
2358
2359
2360void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2361 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2362 bool not_at_start) {
2363 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2364 // afterwards.
2365 RegExpNode* node = alternatives_->at(1).node();
2366 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2367}
2368
2369
2370int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2371 int budget,
2372 RegExpNode* ignore_this_node,
2373 bool not_at_start) {
2374 if (budget <= 0) return 0;
2375 int min = 100;
2376 int choice_count = alternatives_->length();
2377 budget = (budget - 1) / choice_count;
2378 for (int i = 0; i < choice_count; i++) {
2379 RegExpNode* node = alternatives_->at(i).node();
2380 if (node == ignore_this_node) continue;
2381 int node_eats_at_least =
2382 node->EatsAtLeast(still_to_find, budget, not_at_start);
2383 if (node_eats_at_least < min) min = node_eats_at_least;
2384 if (min == 0) return 0;
2385 }
2386 return min;
2387}
2388
2389
2390int LoopChoiceNode::EatsAtLeast(int still_to_find,
2391 int budget,
2392 bool not_at_start) {
2393 return EatsAtLeastHelper(still_to_find,
2394 budget - 1,
2395 loop_node_,
2396 not_at_start);
2397}
2398
2399
2400int ChoiceNode::EatsAtLeast(int still_to_find,
2401 int budget,
2402 bool not_at_start) {
2403 return EatsAtLeastHelper(still_to_find,
2404 budget,
2405 NULL,
2406 not_at_start);
2407}
2408
2409
2410// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2411static inline uint32_t SmearBitsRight(uint32_t v) {
2412 v |= v >> 1;
2413 v |= v >> 2;
2414 v |= v >> 4;
2415 v |= v >> 8;
2416 v |= v >> 16;
2417 return v;
2418}
2419
2420
2421bool QuickCheckDetails::Rationalize(bool asc) {
2422 bool found_useful_op = false;
2423 uint32_t char_mask;
2424 if (asc) {
2425 char_mask = String::kMaxOneByteCharCode;
2426 } else {
2427 char_mask = String::kMaxUtf16CodeUnit;
2428 }
2429 mask_ = 0;
2430 value_ = 0;
2431 int char_shift = 0;
2432 for (int i = 0; i < characters_; i++) {
2433 Position* pos = &positions_[i];
2434 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2435 found_useful_op = true;
2436 }
2437 mask_ |= (pos->mask & char_mask) << char_shift;
2438 value_ |= (pos->value & char_mask) << char_shift;
2439 char_shift += asc ? 8 : 16;
2440 }
2441 return found_useful_op;
2442}
2443
2444
2445bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2446 Trace* bounds_check_trace,
2447 Trace* trace,
2448 bool preload_has_checked_bounds,
2449 Label* on_possible_success,
2450 QuickCheckDetails* details,
2451 bool fall_through_on_failure) {
2452 if (details->characters() == 0) return false;
2453 GetQuickCheckDetails(
2454 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2455 if (details->cannot_match()) return false;
2456 if (!details->Rationalize(compiler->one_byte())) return false;
2457 DCHECK(details->characters() == 1 ||
2458 compiler->macro_assembler()->CanReadUnaligned());
2459 uint32_t mask = details->mask();
2460 uint32_t value = details->value();
2461
2462 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2463
2464 if (trace->characters_preloaded() != details->characters()) {
2465 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2466 // We are attempting to preload the minimum number of characters
2467 // any choice would eat, so if the bounds check fails, then none of the
2468 // choices can succeed, so we can just immediately backtrack, rather
2469 // than go to the next choice.
2470 assembler->LoadCurrentCharacter(trace->cp_offset(),
2471 bounds_check_trace->backtrack(),
2472 !preload_has_checked_bounds,
2473 details->characters());
2474 }
2475
2476
2477 bool need_mask = true;
2478
2479 if (details->characters() == 1) {
2480 // If number of characters preloaded is 1 then we used a byte or 16 bit
2481 // load so the value is already masked down.
2482 uint32_t char_mask;
2483 if (compiler->one_byte()) {
2484 char_mask = String::kMaxOneByteCharCode;
2485 } else {
2486 char_mask = String::kMaxUtf16CodeUnit;
2487 }
2488 if ((mask & char_mask) == char_mask) need_mask = false;
2489 mask &= char_mask;
2490 } else {
2491 // For 2-character preloads in one-byte mode or 1-character preloads in
2492 // two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch097c5b22016-05-18 11:27:45 +01002493 static const uint32_t kTwoByteMask = 0xffff;
2494 static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002495 if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002496 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002497 } else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002498 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002499 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002500 if (mask == kFourByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002501 }
2502 }
2503
2504 if (fall_through_on_failure) {
2505 if (need_mask) {
2506 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2507 } else {
2508 assembler->CheckCharacter(value, on_possible_success);
2509 }
2510 } else {
2511 if (need_mask) {
2512 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2513 } else {
2514 assembler->CheckNotCharacter(value, trace->backtrack());
2515 }
2516 }
2517 return true;
2518}
2519
2520
2521// Here is the meat of GetQuickCheckDetails (see also the comment on the
2522// super-class in the .h file).
2523//
2524// We iterate along the text object, building up for each character a
2525// mask and value that can be used to test for a quick failure to match.
2526// The masks and values for the positions will be combined into a single
2527// machine word for the current character width in order to be used in
2528// generating a quick check.
2529void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2530 RegExpCompiler* compiler,
2531 int characters_filled_in,
2532 bool not_at_start) {
2533 // Do not collect any quick check details if the text node reads backward,
2534 // since it reads in the opposite direction than we use for quick checks.
2535 if (read_backward()) return;
2536 Isolate* isolate = compiler->macro_assembler()->isolate();
2537 DCHECK(characters_filled_in < details->characters());
2538 int characters = details->characters();
2539 int char_mask;
2540 if (compiler->one_byte()) {
2541 char_mask = String::kMaxOneByteCharCode;
2542 } else {
2543 char_mask = String::kMaxUtf16CodeUnit;
2544 }
2545 for (int k = 0; k < elements()->length(); k++) {
2546 TextElement elm = elements()->at(k);
2547 if (elm.text_type() == TextElement::ATOM) {
2548 Vector<const uc16> quarks = elm.atom()->data();
2549 for (int i = 0; i < characters && i < quarks.length(); i++) {
2550 QuickCheckDetails::Position* pos =
2551 details->positions(characters_filled_in);
2552 uc16 c = quarks[i];
2553 if (compiler->ignore_case()) {
2554 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2555 int length = GetCaseIndependentLetters(isolate, c,
2556 compiler->one_byte(), chars);
2557 if (length == 0) {
2558 // This can happen because all case variants are non-Latin1, but we
2559 // know the input is Latin1.
2560 details->set_cannot_match();
2561 pos->determines_perfectly = false;
2562 return;
2563 }
2564 if (length == 1) {
2565 // This letter has no case equivalents, so it's nice and simple
2566 // and the mask-compare will determine definitely whether we have
2567 // a match at this character position.
2568 pos->mask = char_mask;
2569 pos->value = c;
2570 pos->determines_perfectly = true;
2571 } else {
2572 uint32_t common_bits = char_mask;
2573 uint32_t bits = chars[0];
2574 for (int j = 1; j < length; j++) {
2575 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2576 common_bits ^= differing_bits;
2577 bits &= common_bits;
2578 }
2579 // If length is 2 and common bits has only one zero in it then
2580 // our mask and compare instruction will determine definitely
2581 // whether we have a match at this character position. Otherwise
2582 // it can only be an approximate check.
2583 uint32_t one_zero = (common_bits | ~char_mask);
2584 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2585 pos->determines_perfectly = true;
2586 }
2587 pos->mask = common_bits;
2588 pos->value = bits;
2589 }
2590 } else {
2591 // Don't ignore case. Nice simple case where the mask-compare will
2592 // determine definitely whether we have a match at this character
2593 // position.
2594 if (c > char_mask) {
2595 details->set_cannot_match();
2596 pos->determines_perfectly = false;
2597 return;
2598 }
2599 pos->mask = char_mask;
2600 pos->value = c;
2601 pos->determines_perfectly = true;
2602 }
2603 characters_filled_in++;
2604 DCHECK(characters_filled_in <= details->characters());
2605 if (characters_filled_in == details->characters()) {
2606 return;
2607 }
2608 }
2609 } else {
2610 QuickCheckDetails::Position* pos =
2611 details->positions(characters_filled_in);
2612 RegExpCharacterClass* tree = elm.char_class();
2613 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2614 if (tree->is_negated()) {
2615 // A quick check uses multi-character mask and compare. There is no
2616 // useful way to incorporate a negative char class into this scheme
2617 // so we just conservatively create a mask and value that will always
2618 // succeed.
2619 pos->mask = 0;
2620 pos->value = 0;
2621 } else {
2622 int first_range = 0;
2623 while (ranges->at(first_range).from() > char_mask) {
2624 first_range++;
2625 if (first_range == ranges->length()) {
2626 details->set_cannot_match();
2627 pos->determines_perfectly = false;
2628 return;
2629 }
2630 }
2631 CharacterRange range = ranges->at(first_range);
2632 uc16 from = range.from();
2633 uc16 to = range.to();
2634 if (to > char_mask) {
2635 to = char_mask;
2636 }
2637 uint32_t differing_bits = (from ^ to);
2638 // A mask and compare is only perfect if the differing bits form a
2639 // number like 00011111 with one single block of trailing 1s.
2640 if ((differing_bits & (differing_bits + 1)) == 0 &&
2641 from + differing_bits == to) {
2642 pos->determines_perfectly = true;
2643 }
2644 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2645 uint32_t bits = (from & common_bits);
2646 for (int i = first_range + 1; i < ranges->length(); i++) {
2647 CharacterRange range = ranges->at(i);
2648 uc16 from = range.from();
2649 uc16 to = range.to();
2650 if (from > char_mask) continue;
2651 if (to > char_mask) to = char_mask;
2652 // Here we are combining more ranges into the mask and compare
2653 // value. With each new range the mask becomes more sparse and
2654 // so the chances of a false positive rise. A character class
2655 // with multiple ranges is assumed never to be equivalent to a
2656 // mask and compare operation.
2657 pos->determines_perfectly = false;
2658 uint32_t new_common_bits = (from ^ to);
2659 new_common_bits = ~SmearBitsRight(new_common_bits);
2660 common_bits &= new_common_bits;
2661 bits &= new_common_bits;
2662 uint32_t differing_bits = (from & common_bits) ^ bits;
2663 common_bits ^= differing_bits;
2664 bits &= common_bits;
2665 }
2666 pos->mask = common_bits;
2667 pos->value = bits;
2668 }
2669 characters_filled_in++;
2670 DCHECK(characters_filled_in <= details->characters());
2671 if (characters_filled_in == details->characters()) {
2672 return;
2673 }
2674 }
2675 }
2676 DCHECK(characters_filled_in != details->characters());
2677 if (!details->cannot_match()) {
2678 on_success()-> GetQuickCheckDetails(details,
2679 compiler,
2680 characters_filled_in,
2681 true);
2682 }
2683}
2684
2685
2686void QuickCheckDetails::Clear() {
2687 for (int i = 0; i < characters_; i++) {
2688 positions_[i].mask = 0;
2689 positions_[i].value = 0;
2690 positions_[i].determines_perfectly = false;
2691 }
2692 characters_ = 0;
2693}
2694
2695
2696void QuickCheckDetails::Advance(int by, bool one_byte) {
2697 if (by >= characters_ || by < 0) {
2698 DCHECK_IMPLIES(by < 0, characters_ == 0);
2699 Clear();
2700 return;
2701 }
2702 DCHECK_LE(characters_ - by, 4);
2703 DCHECK_LE(characters_, 4);
2704 for (int i = 0; i < characters_ - by; i++) {
2705 positions_[i] = positions_[by + i];
2706 }
2707 for (int i = characters_ - by; i < characters_; i++) {
2708 positions_[i].mask = 0;
2709 positions_[i].value = 0;
2710 positions_[i].determines_perfectly = false;
2711 }
2712 characters_ -= by;
2713 // We could change mask_ and value_ here but we would never advance unless
2714 // they had already been used in a check and they won't be used again because
2715 // it would gain us nothing. So there's no point.
2716}
2717
2718
2719void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2720 DCHECK(characters_ == other->characters_);
2721 if (other->cannot_match_) {
2722 return;
2723 }
2724 if (cannot_match_) {
2725 *this = *other;
2726 return;
2727 }
2728 for (int i = from_index; i < characters_; i++) {
2729 QuickCheckDetails::Position* pos = positions(i);
2730 QuickCheckDetails::Position* other_pos = other->positions(i);
2731 if (pos->mask != other_pos->mask ||
2732 pos->value != other_pos->value ||
2733 !other_pos->determines_perfectly) {
2734 // Our mask-compare operation will be approximate unless we have the
2735 // exact same operation on both sides of the alternation.
2736 pos->determines_perfectly = false;
2737 }
2738 pos->mask &= other_pos->mask;
2739 pos->value &= pos->mask;
2740 other_pos->value &= pos->mask;
2741 uc16 differing_bits = (pos->value ^ other_pos->value);
2742 pos->mask &= ~differing_bits;
2743 pos->value &= pos->mask;
2744 }
2745}
2746
2747
2748class VisitMarker {
2749 public:
2750 explicit VisitMarker(NodeInfo* info) : info_(info) {
2751 DCHECK(!info->visited);
2752 info->visited = true;
2753 }
2754 ~VisitMarker() {
2755 info_->visited = false;
2756 }
2757 private:
2758 NodeInfo* info_;
2759};
2760
2761
2762RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2763 if (info()->replacement_calculated) return replacement();
2764 if (depth < 0) return this;
2765 DCHECK(!info()->visited);
2766 VisitMarker marker(info());
2767 return FilterSuccessor(depth - 1, ignore_case);
2768}
2769
2770
2771RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2772 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2773 if (next == NULL) return set_replacement(NULL);
2774 on_success_ = next;
2775 return set_replacement(this);
2776}
2777
2778
2779// We need to check for the following characters: 0x39c 0x3bc 0x178.
2780static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2781 // TODO(dcarney): this could be a lot more efficient.
2782 return range.Contains(0x39c) ||
2783 range.Contains(0x3bc) || range.Contains(0x178);
2784}
2785
2786
2787static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2788 for (int i = 0; i < ranges->length(); i++) {
2789 // TODO(dcarney): this could be a lot more efficient.
2790 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2791 }
2792 return false;
2793}
2794
2795
2796RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2797 if (info()->replacement_calculated) return replacement();
2798 if (depth < 0) return this;
2799 DCHECK(!info()->visited);
2800 VisitMarker marker(info());
2801 int element_count = elements()->length();
2802 for (int i = 0; i < element_count; i++) {
2803 TextElement elm = elements()->at(i);
2804 if (elm.text_type() == TextElement::ATOM) {
2805 Vector<const uc16> quarks = elm.atom()->data();
2806 for (int j = 0; j < quarks.length(); j++) {
2807 uint16_t c = quarks[j];
2808 if (c <= String::kMaxOneByteCharCode) continue;
2809 if (!ignore_case) return set_replacement(NULL);
2810 // Here, we need to check for characters whose upper and lower cases
2811 // are outside the Latin-1 range.
2812 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2813 // Character is outside Latin-1 completely
2814 if (converted == 0) return set_replacement(NULL);
2815 // Convert quark to Latin-1 in place.
2816 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2817 copy[j] = converted;
2818 }
2819 } else {
2820 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2821 RegExpCharacterClass* cc = elm.char_class();
2822 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01002823 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002824 // Now they are in order so we only need to look at the first.
2825 int range_count = ranges->length();
2826 if (cc->is_negated()) {
2827 if (range_count != 0 &&
2828 ranges->at(0).from() == 0 &&
2829 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2830 // This will be handled in a later filter.
2831 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2832 return set_replacement(NULL);
2833 }
2834 } else {
2835 if (range_count == 0 ||
2836 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2837 // This will be handled in a later filter.
2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2839 return set_replacement(NULL);
2840 }
2841 }
2842 }
2843 }
2844 return FilterSuccessor(depth - 1, ignore_case);
2845}
2846
2847
2848RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2849 if (info()->replacement_calculated) return replacement();
2850 if (depth < 0) return this;
2851 if (info()->visited) return this;
2852 {
2853 VisitMarker marker(info());
2854
2855 RegExpNode* continue_replacement =
2856 continue_node_->FilterOneByte(depth - 1, ignore_case);
2857 // If we can't continue after the loop then there is no sense in doing the
2858 // loop.
2859 if (continue_replacement == NULL) return set_replacement(NULL);
2860 }
2861
2862 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2863}
2864
2865
2866RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2867 if (info()->replacement_calculated) return replacement();
2868 if (depth < 0) return this;
2869 if (info()->visited) return this;
2870 VisitMarker marker(info());
2871 int choice_count = alternatives_->length();
2872
2873 for (int i = 0; i < choice_count; i++) {
2874 GuardedAlternative alternative = alternatives_->at(i);
2875 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2876 set_replacement(this);
2877 return this;
2878 }
2879 }
2880
2881 int surviving = 0;
2882 RegExpNode* survivor = NULL;
2883 for (int i = 0; i < choice_count; i++) {
2884 GuardedAlternative alternative = alternatives_->at(i);
2885 RegExpNode* replacement =
2886 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2887 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2888 if (replacement != NULL) {
2889 alternatives_->at(i).set_node(replacement);
2890 surviving++;
2891 survivor = replacement;
2892 }
2893 }
2894 if (surviving < 2) return set_replacement(survivor);
2895
2896 set_replacement(this);
2897 if (surviving == choice_count) {
2898 return this;
2899 }
2900 // Only some of the nodes survived the filtering. We need to rebuild the
2901 // alternatives list.
2902 ZoneList<GuardedAlternative>* new_alternatives =
2903 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2904 for (int i = 0; i < choice_count; i++) {
2905 RegExpNode* replacement =
2906 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2907 if (replacement != NULL) {
2908 alternatives_->at(i).set_node(replacement);
2909 new_alternatives->Add(alternatives_->at(i), zone());
2910 }
2911 }
2912 alternatives_ = new_alternatives;
2913 return this;
2914}
2915
2916
2917RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2918 bool ignore_case) {
2919 if (info()->replacement_calculated) return replacement();
2920 if (depth < 0) return this;
2921 if (info()->visited) return this;
2922 VisitMarker marker(info());
2923 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2924 // afterwards.
2925 RegExpNode* node = alternatives_->at(1).node();
2926 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2927 if (replacement == NULL) return set_replacement(NULL);
2928 alternatives_->at(1).set_node(replacement);
2929
2930 RegExpNode* neg_node = alternatives_->at(0).node();
2931 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2932 // If the negative lookahead is always going to fail then
2933 // we don't need to check it.
2934 if (neg_replacement == NULL) return set_replacement(replacement);
2935 alternatives_->at(0).set_node(neg_replacement);
2936 return set_replacement(this);
2937}
2938
2939
2940void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2941 RegExpCompiler* compiler,
2942 int characters_filled_in,
2943 bool not_at_start) {
2944 if (body_can_be_zero_length_ || info()->visited) return;
2945 VisitMarker marker(info());
2946 return ChoiceNode::GetQuickCheckDetails(details,
2947 compiler,
2948 characters_filled_in,
2949 not_at_start);
2950}
2951
2952
2953void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2954 BoyerMooreLookahead* bm, bool not_at_start) {
2955 if (body_can_be_zero_length_ || budget <= 0) {
2956 bm->SetRest(offset);
2957 SaveBMInfo(bm, not_at_start, offset);
2958 return;
2959 }
2960 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2961 SaveBMInfo(bm, not_at_start, offset);
2962}
2963
2964
2965void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2966 RegExpCompiler* compiler,
2967 int characters_filled_in,
2968 bool not_at_start) {
2969 not_at_start = (not_at_start || not_at_start_);
2970 int choice_count = alternatives_->length();
2971 DCHECK(choice_count > 0);
2972 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2973 compiler,
2974 characters_filled_in,
2975 not_at_start);
2976 for (int i = 1; i < choice_count; i++) {
2977 QuickCheckDetails new_details(details->characters());
2978 RegExpNode* node = alternatives_->at(i).node();
2979 node->GetQuickCheckDetails(&new_details, compiler,
2980 characters_filled_in,
2981 not_at_start);
2982 // Here we merge the quick match details of the two branches.
2983 details->Merge(&new_details, characters_filled_in);
2984 }
2985}
2986
2987
2988// Check for [0-9A-Z_a-z].
2989static void EmitWordCheck(RegExpMacroAssembler* assembler,
2990 Label* word,
2991 Label* non_word,
2992 bool fall_through_on_word) {
2993 if (assembler->CheckSpecialCharacterClass(
2994 fall_through_on_word ? 'w' : 'W',
2995 fall_through_on_word ? non_word : word)) {
2996 // Optimized implementation available.
2997 return;
2998 }
2999 assembler->CheckCharacterGT('z', non_word);
3000 assembler->CheckCharacterLT('0', non_word);
3001 assembler->CheckCharacterGT('a' - 1, word);
3002 assembler->CheckCharacterLT('9' + 1, word);
3003 assembler->CheckCharacterLT('A', non_word);
3004 assembler->CheckCharacterLT('Z' + 1, word);
3005 if (fall_through_on_word) {
3006 assembler->CheckNotCharacter('_', non_word);
3007 } else {
3008 assembler->CheckCharacter('_', word);
3009 }
3010}
3011
3012
3013// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3014// that matches newline or the start of input).
3015static void EmitHat(RegExpCompiler* compiler,
3016 RegExpNode* on_success,
3017 Trace* trace) {
3018 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3019 // We will be loading the previous character into the current character
3020 // register.
3021 Trace new_trace(*trace);
3022 new_trace.InvalidateCurrentCharacter();
3023
3024 Label ok;
3025 if (new_trace.cp_offset() == 0) {
3026 // The start of input counts as a newline in this context, so skip to
3027 // ok if we are at the start.
3028 assembler->CheckAtStart(&ok);
3029 }
3030 // We already checked that we are not at the start of input so it must be
3031 // OK to load the previous character.
3032 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3033 new_trace.backtrack(),
3034 false);
3035 if (!assembler->CheckSpecialCharacterClass('n',
3036 new_trace.backtrack())) {
3037 // Newline means \n, \r, 0x2028 or 0x2029.
3038 if (!compiler->one_byte()) {
3039 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3040 }
3041 assembler->CheckCharacter('\n', &ok);
3042 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3043 }
3044 assembler->Bind(&ok);
3045 on_success->Emit(compiler, &new_trace);
3046}
3047
3048
3049// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3050void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3051 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3052 Isolate* isolate = assembler->isolate();
3053 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3054 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3055 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3056 if (lookahead == NULL) {
3057 int eats_at_least =
3058 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3059 kRecursionBudget,
3060 not_at_start));
3061 if (eats_at_least >= 1) {
3062 BoyerMooreLookahead* bm =
3063 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3064 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3065 if (bm->at(0)->is_non_word())
3066 next_is_word_character = Trace::FALSE_VALUE;
3067 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3068 }
3069 } else {
3070 if (lookahead->at(0)->is_non_word())
3071 next_is_word_character = Trace::FALSE_VALUE;
3072 if (lookahead->at(0)->is_word())
3073 next_is_word_character = Trace::TRUE_VALUE;
3074 }
3075 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3076 if (next_is_word_character == Trace::UNKNOWN) {
3077 Label before_non_word;
3078 Label before_word;
3079 if (trace->characters_preloaded() != 1) {
3080 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3081 }
3082 // Fall through on non-word.
3083 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3084 // Next character is not a word character.
3085 assembler->Bind(&before_non_word);
3086 Label ok;
3087 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3088 assembler->GoTo(&ok);
3089
3090 assembler->Bind(&before_word);
3091 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3092 assembler->Bind(&ok);
3093 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3094 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3095 } else {
3096 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3097 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3098 }
3099}
3100
3101
3102void AssertionNode::BacktrackIfPrevious(
3103 RegExpCompiler* compiler,
3104 Trace* trace,
3105 AssertionNode::IfPrevious backtrack_if_previous) {
3106 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3107 Trace new_trace(*trace);
3108 new_trace.InvalidateCurrentCharacter();
3109
3110 Label fall_through, dummy;
3111
3112 Label* non_word = backtrack_if_previous == kIsNonWord ?
3113 new_trace.backtrack() :
3114 &fall_through;
3115 Label* word = backtrack_if_previous == kIsNonWord ?
3116 &fall_through :
3117 new_trace.backtrack();
3118
3119 if (new_trace.cp_offset() == 0) {
3120 // The start of input counts as a non-word character, so the question is
3121 // decided if we are at the start.
3122 assembler->CheckAtStart(non_word);
3123 }
3124 // We already checked that we are not at the start of input so it must be
3125 // OK to load the previous character.
3126 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3127 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3128
3129 assembler->Bind(&fall_through);
3130 on_success()->Emit(compiler, &new_trace);
3131}
3132
3133
3134void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3135 RegExpCompiler* compiler,
3136 int filled_in,
3137 bool not_at_start) {
3138 if (assertion_type_ == AT_START && not_at_start) {
3139 details->set_cannot_match();
3140 return;
3141 }
3142 return on_success()->GetQuickCheckDetails(details,
3143 compiler,
3144 filled_in,
3145 not_at_start);
3146}
3147
3148
3149void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3150 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3151 switch (assertion_type_) {
3152 case AT_END: {
3153 Label ok;
3154 assembler->CheckPosition(trace->cp_offset(), &ok);
3155 assembler->GoTo(trace->backtrack());
3156 assembler->Bind(&ok);
3157 break;
3158 }
3159 case AT_START: {
3160 if (trace->at_start() == Trace::FALSE_VALUE) {
3161 assembler->GoTo(trace->backtrack());
3162 return;
3163 }
3164 if (trace->at_start() == Trace::UNKNOWN) {
3165 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3166 Trace at_start_trace = *trace;
3167 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3168 on_success()->Emit(compiler, &at_start_trace);
3169 return;
3170 }
3171 }
3172 break;
3173 case AFTER_NEWLINE:
3174 EmitHat(compiler, on_success(), trace);
3175 return;
3176 case AT_BOUNDARY:
3177 case AT_NON_BOUNDARY: {
3178 EmitBoundaryCheck(compiler, trace);
3179 return;
3180 }
3181 }
3182 on_success()->Emit(compiler, trace);
3183}
3184
3185
3186static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3187 if (quick_check == NULL) return false;
3188 if (offset >= quick_check->characters()) return false;
3189 return quick_check->positions(offset)->determines_perfectly;
3190}
3191
3192
3193static void UpdateBoundsCheck(int index, int* checked_up_to) {
3194 if (index > *checked_up_to) {
3195 *checked_up_to = index;
3196 }
3197}
3198
3199
3200// We call this repeatedly to generate code for each pass over the text node.
3201// The passes are in increasing order of difficulty because we hope one
3202// of the first passes will fail in which case we are saved the work of the
3203// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3204// we will check the '%' in the first pass, the case independent 'a' in the
3205// second pass and the character class in the last pass.
3206//
3207// The passes are done from right to left, so for example to test for /bar/
3208// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3209// and then a 'b' with offset 0. This means we can avoid the end-of-input
3210// bounds check most of the time. In the example we only need to check for
3211// end-of-input when loading the putative 'r'.
3212//
3213// A slight complication involves the fact that the first character may already
3214// be fetched into a register by the previous node. In this case we want to
3215// do the test for that character first. We do this in separate passes. The
3216// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3217// pass has been performed then subsequent passes will have true in
3218// first_element_checked to indicate that that character does not need to be
3219// checked again.
3220//
3221// In addition to all this we are passed a Trace, which can
3222// contain an AlternativeGeneration object. In this AlternativeGeneration
3223// object we can see details of any quick check that was already passed in
3224// order to get to the code we are now generating. The quick check can involve
3225// loading characters, which means we do not need to recheck the bounds
3226// up to the limit the quick check already checked. In addition the quick
3227// check can have involved a mask and compare operation which may simplify
3228// or obviate the need for further checks at some character positions.
3229void TextNode::TextEmitPass(RegExpCompiler* compiler,
3230 TextEmitPassType pass,
3231 bool preloaded,
3232 Trace* trace,
3233 bool first_element_checked,
3234 int* checked_up_to) {
3235 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3236 Isolate* isolate = assembler->isolate();
3237 bool one_byte = compiler->one_byte();
3238 Label* backtrack = trace->backtrack();
3239 QuickCheckDetails* quick_check = trace->quick_check_performed();
3240 int element_count = elements()->length();
3241 int backward_offset = read_backward() ? -Length() : 0;
3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3243 TextElement elm = elements()->at(i);
3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3245 if (elm.text_type() == TextElement::ATOM) {
3246 Vector<const uc16> quarks = elm.atom()->data();
3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3248 if (first_element_checked && i == 0 && j == 0) continue;
3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3250 EmitCharacterFunction* emit_function = NULL;
3251 switch (pass) {
3252 case NON_LATIN1_MATCH:
3253 DCHECK(one_byte);
3254 if (quarks[j] > String::kMaxOneByteCharCode) {
3255 assembler->GoTo(backtrack);
3256 return;
3257 }
3258 break;
3259 case NON_LETTER_CHARACTER_MATCH:
3260 emit_function = &EmitAtomNonLetter;
3261 break;
3262 case SIMPLE_CHARACTER_MATCH:
3263 emit_function = &EmitSimpleCharacter;
3264 break;
3265 case CASE_CHARACTER_MATCH:
3266 emit_function = &EmitAtomLetter;
3267 break;
3268 default:
3269 break;
3270 }
3271 if (emit_function != NULL) {
3272 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3273 bool bound_checked =
3274 emit_function(isolate, compiler, quarks[j], backtrack,
3275 cp_offset + j, bounds_check, preloaded);
3276 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3277 }
3278 }
3279 } else {
3280 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3281 if (pass == CHARACTER_CLASS_MATCH) {
3282 if (first_element_checked && i == 0) continue;
3283 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3284 RegExpCharacterClass* cc = elm.char_class();
3285 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3286 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3287 bounds_check, preloaded, zone());
3288 UpdateBoundsCheck(cp_offset, checked_up_to);
3289 }
3290 }
3291 }
3292}
3293
3294
3295int TextNode::Length() {
3296 TextElement elm = elements()->last();
3297 DCHECK(elm.cp_offset() >= 0);
3298 return elm.cp_offset() + elm.length();
3299}
3300
3301
3302bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3303 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3304 if (ignore_case) {
3305 return pass == SIMPLE_CHARACTER_MATCH;
3306 } else {
3307 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3308 }
3309}
3310
3311
Ben Murdoch097c5b22016-05-18 11:27:45 +01003312TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3313 ZoneList<CharacterRange>* ranges,
3314 bool read_backward,
3315 RegExpNode* on_success) {
3316 DCHECK_NOT_NULL(ranges);
3317 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3318 elms->Add(
3319 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3320 zone);
3321 return new (zone) TextNode(elms, read_backward, on_success);
3322}
3323
3324
3325TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3326 CharacterRange trail,
3327 bool read_backward,
3328 RegExpNode* on_success) {
3329 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3330 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3331 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3332 elms->Add(TextElement::CharClass(
3333 new (zone) RegExpCharacterClass(lead_ranges, false)),
3334 zone);
3335 elms->Add(TextElement::CharClass(
3336 new (zone) RegExpCharacterClass(trail_ranges, false)),
3337 zone);
3338 return new (zone) TextNode(elms, read_backward, on_success);
3339}
3340
3341
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003342// This generates the code to match a text node. A text node can contain
3343// straight character sequences (possibly to be matched in a case-independent
3344// way) and character classes. For efficiency we do not do this in a single
3345// pass from left to right. Instead we pass over the text node several times,
3346// emitting code for some character positions every time. See the comment on
3347// TextEmitPass for details.
3348void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3349 LimitResult limit_result = LimitVersions(compiler, trace);
3350 if (limit_result == DONE) return;
3351 DCHECK(limit_result == CONTINUE);
3352
3353 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3354 compiler->SetRegExpTooBig();
3355 return;
3356 }
3357
3358 if (compiler->one_byte()) {
3359 int dummy = 0;
3360 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3361 }
3362
3363 bool first_elt_done = false;
3364 int bound_checked_to = trace->cp_offset() - 1;
3365 bound_checked_to += trace->bound_checked_up_to();
3366
3367 // If a character is preloaded into the current character register then
3368 // check that now.
3369 if (trace->characters_preloaded() == 1) {
3370 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3371 if (!SkipPass(pass, compiler->ignore_case())) {
3372 TextEmitPass(compiler,
3373 static_cast<TextEmitPassType>(pass),
3374 true,
3375 trace,
3376 false,
3377 &bound_checked_to);
3378 }
3379 }
3380 first_elt_done = true;
3381 }
3382
3383 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3384 if (!SkipPass(pass, compiler->ignore_case())) {
3385 TextEmitPass(compiler,
3386 static_cast<TextEmitPassType>(pass),
3387 false,
3388 trace,
3389 first_elt_done,
3390 &bound_checked_to);
3391 }
3392 }
3393
3394 Trace successor_trace(*trace);
3395 // If we advance backward, we may end up at the start.
3396 successor_trace.AdvanceCurrentPositionInTrace(
3397 read_backward() ? -Length() : Length(), compiler);
3398 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3399 : Trace::FALSE_VALUE);
3400 RecursionCheck rc(compiler);
3401 on_success()->Emit(compiler, &successor_trace);
3402}
3403
3404
3405void Trace::InvalidateCurrentCharacter() {
3406 characters_preloaded_ = 0;
3407}
3408
3409
3410void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3411 // We don't have an instruction for shifting the current character register
3412 // down or for using a shifted value for anything so lets just forget that
3413 // we preloaded any characters into it.
3414 characters_preloaded_ = 0;
3415 // Adjust the offsets of the quick check performed information. This
3416 // information is used to find out what we already determined about the
3417 // characters by means of mask and compare.
3418 quick_check_performed_.Advance(by, compiler->one_byte());
3419 cp_offset_ += by;
3420 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3421 compiler->SetRegExpTooBig();
3422 cp_offset_ = 0;
3423 }
3424 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3425}
3426
3427
3428void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3429 int element_count = elements()->length();
3430 for (int i = 0; i < element_count; i++) {
3431 TextElement elm = elements()->at(i);
3432 if (elm.text_type() == TextElement::CHAR_CLASS) {
3433 RegExpCharacterClass* cc = elm.char_class();
3434 // None of the standard character classes is different in the case
3435 // independent case and it slows us down if we don't know that.
3436 if (cc->is_standard(zone())) continue;
3437 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003438 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003439 }
3440 }
3441}
3442
3443
3444int TextNode::GreedyLoopTextLength() { return Length(); }
3445
3446
3447RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3448 RegExpCompiler* compiler) {
3449 if (read_backward()) return NULL;
3450 if (elements()->length() != 1) return NULL;
3451 TextElement elm = elements()->at(0);
3452 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3453 RegExpCharacterClass* node = elm.char_class();
3454 ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003455 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003456 if (node->is_negated()) {
3457 return ranges->length() == 0 ? on_success() : NULL;
3458 }
3459 if (ranges->length() != 1) return NULL;
3460 uint32_t max_char;
3461 if (compiler->one_byte()) {
3462 max_char = String::kMaxOneByteCharCode;
3463 } else {
3464 max_char = String::kMaxUtf16CodeUnit;
3465 }
3466 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3467}
3468
3469
3470// Finds the fixed match length of a sequence of nodes that goes from
3471// this alternative and back to this choice node. If there are variable
3472// length nodes or other complications in the way then return a sentinel
3473// value indicating that a greedy loop cannot be constructed.
3474int ChoiceNode::GreedyLoopTextLengthForAlternative(
3475 GuardedAlternative* alternative) {
3476 int length = 0;
3477 RegExpNode* node = alternative->node();
3478 // Later we will generate code for all these text nodes using recursion
3479 // so we have to limit the max number.
3480 int recursion_depth = 0;
3481 while (node != this) {
3482 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3483 return kNodeIsTooComplexForGreedyLoops;
3484 }
3485 int node_length = node->GreedyLoopTextLength();
3486 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3487 return kNodeIsTooComplexForGreedyLoops;
3488 }
3489 length += node_length;
3490 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3491 node = seq_node->on_success();
3492 }
3493 return read_backward() ? -length : length;
3494}
3495
3496
3497void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3498 DCHECK_NULL(loop_node_);
3499 AddAlternative(alt);
3500 loop_node_ = alt.node();
3501}
3502
3503
3504void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3505 DCHECK_NULL(continue_node_);
3506 AddAlternative(alt);
3507 continue_node_ = alt.node();
3508}
3509
3510
3511void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3512 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3513 if (trace->stop_node() == this) {
3514 // Back edge of greedy optimized loop node graph.
3515 int text_length =
3516 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3517 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3518 // Update the counter-based backtracking info on the stack. This is an
3519 // optimization for greedy loops (see below).
3520 DCHECK(trace->cp_offset() == text_length);
3521 macro_assembler->AdvanceCurrentPosition(text_length);
3522 macro_assembler->GoTo(trace->loop_label());
3523 return;
3524 }
3525 DCHECK_NULL(trace->stop_node());
3526 if (!trace->is_trivial()) {
3527 trace->Flush(compiler, this);
3528 return;
3529 }
3530 ChoiceNode::Emit(compiler, trace);
3531}
3532
3533
3534int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3535 int eats_at_least) {
3536 int preload_characters = Min(4, eats_at_least);
3537 if (compiler->macro_assembler()->CanReadUnaligned()) {
3538 bool one_byte = compiler->one_byte();
3539 if (one_byte) {
3540 if (preload_characters > 4) preload_characters = 4;
3541 // We can't preload 3 characters because there is no machine instruction
3542 // to do that. We can't just load 4 because we could be reading
3543 // beyond the end of the string, which could cause a memory fault.
3544 if (preload_characters == 3) preload_characters = 2;
3545 } else {
3546 if (preload_characters > 2) preload_characters = 2;
3547 }
3548 } else {
3549 if (preload_characters > 1) preload_characters = 1;
3550 }
3551 return preload_characters;
3552}
3553
3554
3555// This class is used when generating the alternatives in a choice node. It
3556// records the way the alternative is being code generated.
3557class AlternativeGeneration: public Malloced {
3558 public:
3559 AlternativeGeneration()
3560 : possible_success(),
3561 expects_preload(false),
3562 after(),
3563 quick_check_details() { }
3564 Label possible_success;
3565 bool expects_preload;
3566 Label after;
3567 QuickCheckDetails quick_check_details;
3568};
3569
3570
3571// Creates a list of AlternativeGenerations. If the list has a reasonable
3572// size then it is on the stack, otherwise the excess is on the heap.
3573class AlternativeGenerationList {
3574 public:
3575 AlternativeGenerationList(int count, Zone* zone)
3576 : alt_gens_(count, zone) {
3577 for (int i = 0; i < count && i < kAFew; i++) {
3578 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3579 }
3580 for (int i = kAFew; i < count; i++) {
3581 alt_gens_.Add(new AlternativeGeneration(), zone);
3582 }
3583 }
3584 ~AlternativeGenerationList() {
3585 for (int i = kAFew; i < alt_gens_.length(); i++) {
3586 delete alt_gens_[i];
3587 alt_gens_[i] = NULL;
3588 }
3589 }
3590
3591 AlternativeGeneration* at(int i) {
3592 return alt_gens_[i];
3593 }
3594
3595 private:
3596 static const int kAFew = 10;
3597 ZoneList<AlternativeGeneration*> alt_gens_;
3598 AlternativeGeneration a_few_alt_gens_[kAFew];
3599};
3600
3601
Ben Murdoch097c5b22016-05-18 11:27:45 +01003602static const uc32 kRangeEndMarker = 0x110000;
3603
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003604// The '2' variant is has inclusive from and exclusive to.
3605// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3606// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch097c5b22016-05-18 11:27:45 +01003607static const int kSpaceRanges[] = {
3608 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3609 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3610 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003611static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3612
3613static const int kWordRanges[] = {
Ben Murdoch097c5b22016-05-18 11:27:45 +01003614 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003615static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003616static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003617static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003618static const int kSurrogateRanges[] = {
3619 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003620static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003621static const int kLineTerminatorRanges[] = {
3622 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003623static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3624
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003625void BoyerMoorePositionInfo::Set(int character) {
3626 SetInterval(Interval(character, character));
3627}
3628
3629
3630void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3631 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3632 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3633 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3634 surrogate_ =
3635 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3636 if (interval.to() - interval.from() >= kMapSize - 1) {
3637 if (map_count_ != kMapSize) {
3638 map_count_ = kMapSize;
3639 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3640 }
3641 return;
3642 }
3643 for (int i = interval.from(); i <= interval.to(); i++) {
3644 int mod_character = (i & kMask);
3645 if (!map_->at(mod_character)) {
3646 map_count_++;
3647 map_->at(mod_character) = true;
3648 }
3649 if (map_count_ == kMapSize) return;
3650 }
3651}
3652
3653
3654void BoyerMoorePositionInfo::SetAll() {
3655 s_ = w_ = d_ = kLatticeUnknown;
3656 if (map_count_ != kMapSize) {
3657 map_count_ = kMapSize;
3658 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3659 }
3660}
3661
3662
3663BoyerMooreLookahead::BoyerMooreLookahead(
3664 int length, RegExpCompiler* compiler, Zone* zone)
3665 : length_(length),
3666 compiler_(compiler) {
3667 if (compiler->one_byte()) {
3668 max_char_ = String::kMaxOneByteCharCode;
3669 } else {
3670 max_char_ = String::kMaxUtf16CodeUnit;
3671 }
3672 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3673 for (int i = 0; i < length; i++) {
3674 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3675 }
3676}
3677
3678
3679// Find the longest range of lookahead that has the fewest number of different
3680// characters that can occur at a given position. Since we are optimizing two
3681// different parameters at once this is a tradeoff.
3682bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3683 int biggest_points = 0;
3684 // If more than 32 characters out of 128 can occur it is unlikely that we can
3685 // be lucky enough to step forwards much of the time.
3686 const int kMaxMax = 32;
3687 for (int max_number_of_chars = 4;
3688 max_number_of_chars < kMaxMax;
3689 max_number_of_chars *= 2) {
3690 biggest_points =
3691 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3692 }
3693 if (biggest_points == 0) return false;
3694 return true;
3695}
3696
3697
3698// Find the highest-points range between 0 and length_ where the character
3699// information is not too vague. 'Too vague' means that there are more than
3700// max_number_of_chars that can occur at this position. Calculates the number
3701// of points as the product of width-of-the-range and
3702// probability-of-finding-one-of-the-characters, where the probability is
3703// calculated using the frequency distribution of the sample subject string.
3704int BoyerMooreLookahead::FindBestInterval(
3705 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3706 int biggest_points = old_biggest_points;
3707 static const int kSize = RegExpMacroAssembler::kTableSize;
3708 for (int i = 0; i < length_; ) {
3709 while (i < length_ && Count(i) > max_number_of_chars) i++;
3710 if (i == length_) break;
3711 int remembered_from = i;
3712 bool union_map[kSize];
3713 for (int j = 0; j < kSize; j++) union_map[j] = false;
3714 while (i < length_ && Count(i) <= max_number_of_chars) {
3715 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3716 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3717 i++;
3718 }
3719 int frequency = 0;
3720 for (int j = 0; j < kSize; j++) {
3721 if (union_map[j]) {
3722 // Add 1 to the frequency to give a small per-character boost for
3723 // the cases where our sampling is not good enough and many
3724 // characters have a frequency of zero. This means the frequency
3725 // can theoretically be up to 2*kSize though we treat it mostly as
3726 // a fraction of kSize.
3727 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3728 }
3729 }
3730 // We use the probability of skipping times the distance we are skipping to
3731 // judge the effectiveness of this. Actually we have a cut-off: By
3732 // dividing by 2 we switch off the skipping if the probability of skipping
3733 // is less than 50%. This is because the multibyte mask-and-compare
3734 // skipping in quickcheck is more likely to do well on this case.
3735 bool in_quickcheck_range =
3736 ((i - remembered_from < 4) ||
3737 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3738 // Called 'probability' but it is only a rough estimate and can actually
3739 // be outside the 0-kSize range.
3740 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3741 int points = (i - remembered_from) * probability;
3742 if (points > biggest_points) {
3743 *from = remembered_from;
3744 *to = i - 1;
3745 biggest_points = points;
3746 }
3747 }
3748 return biggest_points;
3749}
3750
3751
3752// Take all the characters that will not prevent a successful match if they
3753// occur in the subject string in the range between min_lookahead and
3754// max_lookahead (inclusive) measured from the current position. If the
3755// character at max_lookahead offset is not one of these characters, then we
3756// can safely skip forwards by the number of characters in the range.
3757int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3758 int max_lookahead,
3759 Handle<ByteArray> boolean_skip_table) {
3760 const int kSize = RegExpMacroAssembler::kTableSize;
3761
3762 const int kSkipArrayEntry = 0;
3763 const int kDontSkipArrayEntry = 1;
3764
3765 for (int i = 0; i < kSize; i++) {
3766 boolean_skip_table->set(i, kSkipArrayEntry);
3767 }
3768 int skip = max_lookahead + 1 - min_lookahead;
3769
3770 for (int i = max_lookahead; i >= min_lookahead; i--) {
3771 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3772 for (int j = 0; j < kSize; j++) {
3773 if (map->at(j)) {
3774 boolean_skip_table->set(j, kDontSkipArrayEntry);
3775 }
3776 }
3777 }
3778
3779 return skip;
3780}
3781
3782
3783// See comment above on the implementation of GetSkipTable.
3784void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3785 const int kSize = RegExpMacroAssembler::kTableSize;
3786
3787 int min_lookahead = 0;
3788 int max_lookahead = 0;
3789
3790 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3791
3792 bool found_single_character = false;
3793 int single_character = 0;
3794 for (int i = max_lookahead; i >= min_lookahead; i--) {
3795 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3796 if (map->map_count() > 1 ||
3797 (found_single_character && map->map_count() != 0)) {
3798 found_single_character = false;
3799 break;
3800 }
3801 for (int j = 0; j < kSize; j++) {
3802 if (map->at(j)) {
3803 found_single_character = true;
3804 single_character = j;
3805 break;
3806 }
3807 }
3808 }
3809
3810 int lookahead_width = max_lookahead + 1 - min_lookahead;
3811
3812 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3813 // The mask-compare can probably handle this better.
3814 return;
3815 }
3816
3817 if (found_single_character) {
3818 Label cont, again;
3819 masm->Bind(&again);
3820 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3821 if (max_char_ > kSize) {
3822 masm->CheckCharacterAfterAnd(single_character,
3823 RegExpMacroAssembler::kTableMask,
3824 &cont);
3825 } else {
3826 masm->CheckCharacter(single_character, &cont);
3827 }
3828 masm->AdvanceCurrentPosition(lookahead_width);
3829 masm->GoTo(&again);
3830 masm->Bind(&cont);
3831 return;
3832 }
3833
3834 Factory* factory = masm->isolate()->factory();
3835 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3836 int skip_distance = GetSkipTable(
3837 min_lookahead, max_lookahead, boolean_skip_table);
3838 DCHECK(skip_distance != 0);
3839
3840 Label cont, again;
3841 masm->Bind(&again);
3842 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3843 masm->CheckBitInTable(boolean_skip_table, &cont);
3844 masm->AdvanceCurrentPosition(skip_distance);
3845 masm->GoTo(&again);
3846 masm->Bind(&cont);
3847}
3848
3849
3850/* Code generation for choice nodes.
3851 *
3852 * We generate quick checks that do a mask and compare to eliminate a
3853 * choice. If the quick check succeeds then it jumps to the continuation to
3854 * do slow checks and check subsequent nodes. If it fails (the common case)
3855 * it falls through to the next choice.
3856 *
3857 * Here is the desired flow graph. Nodes directly below each other imply
3858 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3859 * 3 doesn't have a quick check so we have to call the slow check.
3860 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3861 * regexp continuation is generated directly after the Sn node, up to the
3862 * next GoTo if we decide to reuse some already generated code. Some
3863 * nodes expect preload_characters to be preloaded into the current
3864 * character register. R nodes do this preloading. Vertices are marked
3865 * F for failures and S for success (possible success in the case of quick
3866 * nodes). L, V, < and > are used as arrow heads.
3867 *
3868 * ----------> R
3869 * |
3870 * V
3871 * Q1 -----> S1
3872 * | S /
3873 * F| /
3874 * | F/
3875 * | /
3876 * | R
3877 * | /
3878 * V L
3879 * Q2 -----> S2
3880 * | S /
3881 * F| /
3882 * | F/
3883 * | /
3884 * | R
3885 * | /
3886 * V L
3887 * S3
3888 * |
3889 * F|
3890 * |
3891 * R
3892 * |
3893 * backtrack V
3894 * <----------Q4
3895 * \ F |
3896 * \ |S
3897 * \ F V
3898 * \-----S4
3899 *
3900 * For greedy loops we push the current position, then generate the code that
3901 * eats the input specially in EmitGreedyLoop. The other choice (the
3902 * continuation) is generated by the normal code in EmitChoices, and steps back
3903 * in the input to the starting position when it fails to match. The loop code
3904 * looks like this (U is the unwind code that steps back in the greedy loop).
3905 *
3906 * _____
3907 * / \
3908 * V |
3909 * ----------> S1 |
3910 * /| |
3911 * / |S |
3912 * F/ \_____/
3913 * /
3914 * |<-----
3915 * | \
3916 * V |S
3917 * Q2 ---> U----->backtrack
3918 * | F /
3919 * S| /
3920 * V F /
3921 * S2--/
3922 */
3923
3924GreedyLoopState::GreedyLoopState(bool not_at_start) {
3925 counter_backtrack_trace_.set_backtrack(&label_);
3926 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3927}
3928
3929
3930void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3931#ifdef DEBUG
3932 int choice_count = alternatives_->length();
3933 for (int i = 0; i < choice_count - 1; i++) {
3934 GuardedAlternative alternative = alternatives_->at(i);
3935 ZoneList<Guard*>* guards = alternative.guards();
3936 int guard_count = (guards == NULL) ? 0 : guards->length();
3937 for (int j = 0; j < guard_count; j++) {
3938 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3939 }
3940 }
3941#endif
3942}
3943
3944
3945void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3946 Trace* current_trace,
3947 PreloadState* state) {
3948 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3949 // Save some time by looking at most one machine word ahead.
3950 state->eats_at_least_ =
3951 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3952 current_trace->at_start() == Trace::FALSE_VALUE);
3953 }
3954 state->preload_characters_ =
3955 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3956
3957 state->preload_is_current_ =
3958 (current_trace->characters_preloaded() == state->preload_characters_);
3959 state->preload_has_checked_bounds_ = state->preload_is_current_;
3960}
3961
3962
3963void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3964 int choice_count = alternatives_->length();
3965
Ben Murdoch097c5b22016-05-18 11:27:45 +01003966 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3967 alternatives_->at(0).node()->Emit(compiler, trace);
3968 return;
3969 }
3970
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003971 AssertGuardsMentionRegisters(trace);
3972
3973 LimitResult limit_result = LimitVersions(compiler, trace);
3974 if (limit_result == DONE) return;
3975 DCHECK(limit_result == CONTINUE);
3976
3977 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3978 // other choice nodes we only flush if we are out of code size budget.
3979 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3980 trace->Flush(compiler, this);
3981 return;
3982 }
3983
3984 RecursionCheck rc(compiler);
3985
3986 PreloadState preload;
3987 preload.init();
3988 GreedyLoopState greedy_loop_state(not_at_start());
3989
3990 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3991 AlternativeGenerationList alt_gens(choice_count, zone());
3992
3993 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3994 trace = EmitGreedyLoop(compiler,
3995 trace,
3996 &alt_gens,
3997 &preload,
3998 &greedy_loop_state,
3999 text_length);
4000 } else {
4001 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4002 // match the traces produced pre-cleanup.
4003 Label second_choice;
4004 compiler->macro_assembler()->Bind(&second_choice);
4005
4006 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4007
4008 EmitChoices(compiler,
4009 &alt_gens,
4010 0,
4011 trace,
4012 &preload);
4013 }
4014
4015 // At this point we need to generate slow checks for the alternatives where
4016 // the quick check was inlined. We can recognize these because the associated
4017 // label was bound.
4018 int new_flush_budget = trace->flush_budget() / choice_count;
4019 for (int i = 0; i < choice_count; i++) {
4020 AlternativeGeneration* alt_gen = alt_gens.at(i);
4021 Trace new_trace(*trace);
4022 // If there are actions to be flushed we have to limit how many times
4023 // they are flushed. Take the budget of the parent trace and distribute
4024 // it fairly amongst the children.
4025 if (new_trace.actions() != NULL) {
4026 new_trace.set_flush_budget(new_flush_budget);
4027 }
4028 bool next_expects_preload =
4029 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4030 EmitOutOfLineContinuation(compiler,
4031 &new_trace,
4032 alternatives_->at(i),
4033 alt_gen,
4034 preload.preload_characters_,
4035 next_expects_preload);
4036 }
4037}
4038
4039
4040Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4041 Trace* trace,
4042 AlternativeGenerationList* alt_gens,
4043 PreloadState* preload,
4044 GreedyLoopState* greedy_loop_state,
4045 int text_length) {
4046 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4047 // Here we have special handling for greedy loops containing only text nodes
4048 // and other simple nodes. These are handled by pushing the current
4049 // position on the stack and then incrementing the current position each
4050 // time around the switch. On backtrack we decrement the current position
4051 // and check it against the pushed value. This avoids pushing backtrack
4052 // information for each iteration of the loop, which could take up a lot of
4053 // space.
4054 DCHECK(trace->stop_node() == NULL);
4055 macro_assembler->PushCurrentPosition();
4056 Label greedy_match_failed;
4057 Trace greedy_match_trace;
4058 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4059 greedy_match_trace.set_backtrack(&greedy_match_failed);
4060 Label loop_label;
4061 macro_assembler->Bind(&loop_label);
4062 greedy_match_trace.set_stop_node(this);
4063 greedy_match_trace.set_loop_label(&loop_label);
4064 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4065 macro_assembler->Bind(&greedy_match_failed);
4066
4067 Label second_choice; // For use in greedy matches.
4068 macro_assembler->Bind(&second_choice);
4069
4070 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4071
4072 EmitChoices(compiler,
4073 alt_gens,
4074 1,
4075 new_trace,
4076 preload);
4077
4078 macro_assembler->Bind(greedy_loop_state->label());
4079 // If we have unwound to the bottom then backtrack.
4080 macro_assembler->CheckGreedyLoop(trace->backtrack());
4081 // Otherwise try the second priority at an earlier position.
4082 macro_assembler->AdvanceCurrentPosition(-text_length);
4083 macro_assembler->GoTo(&second_choice);
4084 return new_trace;
4085}
4086
4087int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4088 Trace* trace) {
4089 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4090 if (alternatives_->length() != 2) return eats_at_least;
4091
4092 GuardedAlternative alt1 = alternatives_->at(1);
4093 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4094 return eats_at_least;
4095 }
4096 RegExpNode* eats_anything_node = alt1.node();
4097 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4098 return eats_at_least;
4099 }
4100
4101 // Really we should be creating a new trace when we execute this function,
4102 // but there is no need, because the code it generates cannot backtrack, and
4103 // we always arrive here with a trivial trace (since it's the entry to a
4104 // loop. That also implies that there are no preloaded characters, which is
4105 // good, because it means we won't be violating any assumptions by
4106 // overwriting those characters with new load instructions.
4107 DCHECK(trace->is_trivial());
4108
4109 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4110 Isolate* isolate = macro_assembler->isolate();
4111 // At this point we know that we are at a non-greedy loop that will eat
4112 // any character one at a time. Any non-anchored regexp has such a
4113 // loop prepended to it in order to find where it starts. We look for
4114 // a pattern of the form ...abc... where we can look 6 characters ahead
4115 // and step forwards 3 if the character is not one of abc. Abc need
4116 // not be atoms, they can be any reasonably limited character class or
4117 // small alternation.
4118 BoyerMooreLookahead* bm = bm_info(false);
4119 if (bm == NULL) {
4120 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4121 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4122 kRecursionBudget,
4123 false));
4124 if (eats_at_least >= 1) {
4125 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4126 compiler,
4127 zone());
4128 GuardedAlternative alt0 = alternatives_->at(0);
4129 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4130 }
4131 }
4132 if (bm != NULL) {
4133 bm->EmitSkipInstructions(macro_assembler);
4134 }
4135 return eats_at_least;
4136}
4137
4138
4139void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4140 AlternativeGenerationList* alt_gens,
4141 int first_choice,
4142 Trace* trace,
4143 PreloadState* preload) {
4144 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4145 SetUpPreLoad(compiler, trace, preload);
4146
4147 // For now we just call all choices one after the other. The idea ultimately
4148 // is to use the Dispatch table to try only the relevant ones.
4149 int choice_count = alternatives_->length();
4150
4151 int new_flush_budget = trace->flush_budget() / choice_count;
4152
4153 for (int i = first_choice; i < choice_count; i++) {
4154 bool is_last = i == choice_count - 1;
4155 bool fall_through_on_failure = !is_last;
4156 GuardedAlternative alternative = alternatives_->at(i);
4157 AlternativeGeneration* alt_gen = alt_gens->at(i);
4158 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4159 ZoneList<Guard*>* guards = alternative.guards();
4160 int guard_count = (guards == NULL) ? 0 : guards->length();
4161 Trace new_trace(*trace);
4162 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4163 preload->preload_characters_ :
4164 0);
4165 if (preload->preload_has_checked_bounds_) {
4166 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4167 }
4168 new_trace.quick_check_performed()->Clear();
4169 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4170 if (!is_last) {
4171 new_trace.set_backtrack(&alt_gen->after);
4172 }
4173 alt_gen->expects_preload = preload->preload_is_current_;
4174 bool generate_full_check_inline = false;
4175 if (compiler->optimize() &&
4176 try_to_emit_quick_check_for_alternative(i == 0) &&
4177 alternative.node()->EmitQuickCheck(
4178 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4179 &alt_gen->possible_success, &alt_gen->quick_check_details,
4180 fall_through_on_failure)) {
4181 // Quick check was generated for this choice.
4182 preload->preload_is_current_ = true;
4183 preload->preload_has_checked_bounds_ = true;
4184 // If we generated the quick check to fall through on possible success,
4185 // we now need to generate the full check inline.
4186 if (!fall_through_on_failure) {
4187 macro_assembler->Bind(&alt_gen->possible_success);
4188 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4189 new_trace.set_characters_preloaded(preload->preload_characters_);
4190 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4191 generate_full_check_inline = true;
4192 }
4193 } else if (alt_gen->quick_check_details.cannot_match()) {
4194 if (!fall_through_on_failure) {
4195 macro_assembler->GoTo(trace->backtrack());
4196 }
4197 continue;
4198 } else {
4199 // No quick check was generated. Put the full code here.
4200 // If this is not the first choice then there could be slow checks from
4201 // previous cases that go here when they fail. There's no reason to
4202 // insist that they preload characters since the slow check we are about
4203 // to generate probably can't use it.
4204 if (i != first_choice) {
4205 alt_gen->expects_preload = false;
4206 new_trace.InvalidateCurrentCharacter();
4207 }
4208 generate_full_check_inline = true;
4209 }
4210 if (generate_full_check_inline) {
4211 if (new_trace.actions() != NULL) {
4212 new_trace.set_flush_budget(new_flush_budget);
4213 }
4214 for (int j = 0; j < guard_count; j++) {
4215 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4216 }
4217 alternative.node()->Emit(compiler, &new_trace);
4218 preload->preload_is_current_ = false;
4219 }
4220 macro_assembler->Bind(&alt_gen->after);
4221 }
4222}
4223
4224
4225void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4226 Trace* trace,
4227 GuardedAlternative alternative,
4228 AlternativeGeneration* alt_gen,
4229 int preload_characters,
4230 bool next_expects_preload) {
4231 if (!alt_gen->possible_success.is_linked()) return;
4232
4233 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4234 macro_assembler->Bind(&alt_gen->possible_success);
4235 Trace out_of_line_trace(*trace);
4236 out_of_line_trace.set_characters_preloaded(preload_characters);
4237 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4238 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4239 ZoneList<Guard*>* guards = alternative.guards();
4240 int guard_count = (guards == NULL) ? 0 : guards->length();
4241 if (next_expects_preload) {
4242 Label reload_current_char;
4243 out_of_line_trace.set_backtrack(&reload_current_char);
4244 for (int j = 0; j < guard_count; j++) {
4245 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4246 }
4247 alternative.node()->Emit(compiler, &out_of_line_trace);
4248 macro_assembler->Bind(&reload_current_char);
4249 // Reload the current character, since the next quick check expects that.
4250 // We don't need to check bounds here because we only get into this
4251 // code through a quick check which already did the checked load.
4252 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4253 NULL,
4254 false,
4255 preload_characters);
4256 macro_assembler->GoTo(&(alt_gen->after));
4257 } else {
4258 out_of_line_trace.set_backtrack(&(alt_gen->after));
4259 for (int j = 0; j < guard_count; j++) {
4260 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4261 }
4262 alternative.node()->Emit(compiler, &out_of_line_trace);
4263 }
4264}
4265
4266
4267void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4268 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4269 LimitResult limit_result = LimitVersions(compiler, trace);
4270 if (limit_result == DONE) return;
4271 DCHECK(limit_result == CONTINUE);
4272
4273 RecursionCheck rc(compiler);
4274
4275 switch (action_type_) {
4276 case STORE_POSITION: {
4277 Trace::DeferredCapture
4278 new_capture(data_.u_position_register.reg,
4279 data_.u_position_register.is_capture,
4280 trace);
4281 Trace new_trace = *trace;
4282 new_trace.add_action(&new_capture);
4283 on_success()->Emit(compiler, &new_trace);
4284 break;
4285 }
4286 case INCREMENT_REGISTER: {
4287 Trace::DeferredIncrementRegister
4288 new_increment(data_.u_increment_register.reg);
4289 Trace new_trace = *trace;
4290 new_trace.add_action(&new_increment);
4291 on_success()->Emit(compiler, &new_trace);
4292 break;
4293 }
4294 case SET_REGISTER: {
4295 Trace::DeferredSetRegister
4296 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4297 Trace new_trace = *trace;
4298 new_trace.add_action(&new_set);
4299 on_success()->Emit(compiler, &new_trace);
4300 break;
4301 }
4302 case CLEAR_CAPTURES: {
4303 Trace::DeferredClearCaptures
4304 new_capture(Interval(data_.u_clear_captures.range_from,
4305 data_.u_clear_captures.range_to));
4306 Trace new_trace = *trace;
4307 new_trace.add_action(&new_capture);
4308 on_success()->Emit(compiler, &new_trace);
4309 break;
4310 }
4311 case BEGIN_SUBMATCH:
4312 if (!trace->is_trivial()) {
4313 trace->Flush(compiler, this);
4314 } else {
4315 assembler->WriteCurrentPositionToRegister(
4316 data_.u_submatch.current_position_register, 0);
4317 assembler->WriteStackPointerToRegister(
4318 data_.u_submatch.stack_pointer_register);
4319 on_success()->Emit(compiler, trace);
4320 }
4321 break;
4322 case EMPTY_MATCH_CHECK: {
4323 int start_pos_reg = data_.u_empty_match_check.start_register;
4324 int stored_pos = 0;
4325 int rep_reg = data_.u_empty_match_check.repetition_register;
4326 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4327 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4328 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4329 // If we know we haven't advanced and there is no minimum we
4330 // can just backtrack immediately.
4331 assembler->GoTo(trace->backtrack());
4332 } else if (know_dist && stored_pos < trace->cp_offset()) {
4333 // If we know we've advanced we can generate the continuation
4334 // immediately.
4335 on_success()->Emit(compiler, trace);
4336 } else if (!trace->is_trivial()) {
4337 trace->Flush(compiler, this);
4338 } else {
4339 Label skip_empty_check;
4340 // If we have a minimum number of repetitions we check the current
4341 // number first and skip the empty check if it's not enough.
4342 if (has_minimum) {
4343 int limit = data_.u_empty_match_check.repetition_limit;
4344 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4345 }
4346 // If the match is empty we bail out, otherwise we fall through
4347 // to the on-success continuation.
4348 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4349 trace->backtrack());
4350 assembler->Bind(&skip_empty_check);
4351 on_success()->Emit(compiler, trace);
4352 }
4353 break;
4354 }
4355 case POSITIVE_SUBMATCH_SUCCESS: {
4356 if (!trace->is_trivial()) {
4357 trace->Flush(compiler, this);
4358 return;
4359 }
4360 assembler->ReadCurrentPositionFromRegister(
4361 data_.u_submatch.current_position_register);
4362 assembler->ReadStackPointerFromRegister(
4363 data_.u_submatch.stack_pointer_register);
4364 int clear_register_count = data_.u_submatch.clear_register_count;
4365 if (clear_register_count == 0) {
4366 on_success()->Emit(compiler, trace);
4367 return;
4368 }
4369 int clear_registers_from = data_.u_submatch.clear_register_from;
4370 Label clear_registers_backtrack;
4371 Trace new_trace = *trace;
4372 new_trace.set_backtrack(&clear_registers_backtrack);
4373 on_success()->Emit(compiler, &new_trace);
4374
4375 assembler->Bind(&clear_registers_backtrack);
4376 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4377 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4378
4379 DCHECK(trace->backtrack() == NULL);
4380 assembler->Backtrack();
4381 return;
4382 }
4383 default:
4384 UNREACHABLE();
4385 }
4386}
4387
4388
4389void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4390 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4391 if (!trace->is_trivial()) {
4392 trace->Flush(compiler, this);
4393 return;
4394 }
4395
4396 LimitResult limit_result = LimitVersions(compiler, trace);
4397 if (limit_result == DONE) return;
4398 DCHECK(limit_result == CONTINUE);
4399
4400 RecursionCheck rc(compiler);
4401
4402 DCHECK_EQ(start_reg_ + 1, end_reg_);
4403 if (compiler->ignore_case()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004404 assembler->CheckNotBackReferenceIgnoreCase(
4405 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004406 } else {
4407 assembler->CheckNotBackReference(start_reg_, read_backward(),
4408 trace->backtrack());
4409 }
4410 // We are going to advance backward, so we may end up at the start.
4411 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch097c5b22016-05-18 11:27:45 +01004412
4413 // Check that the back reference does not end inside a surrogate pair.
4414 if (compiler->unicode() && !compiler->one_byte()) {
4415 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4416 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004417 on_success()->Emit(compiler, trace);
4418}
4419
4420
4421// -------------------------------------------------------------------
4422// Dot/dotty output
4423
4424
4425#ifdef DEBUG
4426
4427
4428class DotPrinter: public NodeVisitor {
4429 public:
4430 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4431 : os_(os),
4432 ignore_case_(ignore_case) {}
4433 void PrintNode(const char* label, RegExpNode* node);
4434 void Visit(RegExpNode* node);
4435 void PrintAttributes(RegExpNode* from);
4436 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4437#define DECLARE_VISIT(Type) \
4438 virtual void Visit##Type(Type##Node* that);
4439FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4440#undef DECLARE_VISIT
4441 private:
4442 std::ostream& os_;
4443 bool ignore_case_;
4444};
4445
4446
4447void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4448 os_ << "digraph G {\n graph [label=\"";
4449 for (int i = 0; label[i]; i++) {
4450 switch (label[i]) {
4451 case '\\':
4452 os_ << "\\\\";
4453 break;
4454 case '"':
4455 os_ << "\"";
4456 break;
4457 default:
4458 os_ << label[i];
4459 break;
4460 }
4461 }
4462 os_ << "\"];\n";
4463 Visit(node);
4464 os_ << "}" << std::endl;
4465}
4466
4467
4468void DotPrinter::Visit(RegExpNode* node) {
4469 if (node->info()->visited) return;
4470 node->info()->visited = true;
4471 node->Accept(this);
4472}
4473
4474
4475void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4476 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4477 Visit(on_failure);
4478}
4479
4480
4481class TableEntryBodyPrinter {
4482 public:
4483 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4484 : os_(os),
4485 choice_(choice) {}
4486 void Call(uc16 from, DispatchTable::Entry entry) {
4487 OutSet* out_set = entry.out_set();
4488 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4489 if (out_set->Get(i)) {
4490 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4491 << choice()->alternatives()->at(i).node() << ";\n";
4492 }
4493 }
4494 }
4495 private:
4496 ChoiceNode* choice() { return choice_; }
4497 std::ostream& os_;
4498 ChoiceNode* choice_;
4499};
4500
4501
4502class TableEntryHeaderPrinter {
4503 public:
4504 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4505 : first_(true),
4506 os_(os) {}
4507 void Call(uc16 from, DispatchTable::Entry entry) {
4508 if (first_) {
4509 first_ = false;
4510 } else {
4511 os_ << "|";
4512 }
4513 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4514 OutSet* out_set = entry.out_set();
4515 int priority = 0;
4516 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4517 if (out_set->Get(i)) {
4518 if (priority > 0) os_ << "|";
4519 os_ << "<s" << from << "o" << i << "> " << priority;
4520 priority++;
4521 }
4522 }
4523 os_ << "}}";
4524 }
4525
4526 private:
4527 bool first_;
4528 std::ostream& os_;
4529};
4530
4531
4532class AttributePrinter {
4533 public:
4534 explicit AttributePrinter(std::ostream& os) // NOLINT
4535 : os_(os),
4536 first_(true) {}
4537 void PrintSeparator() {
4538 if (first_) {
4539 first_ = false;
4540 } else {
4541 os_ << "|";
4542 }
4543 }
4544 void PrintBit(const char* name, bool value) {
4545 if (!value) return;
4546 PrintSeparator();
4547 os_ << "{" << name << "}";
4548 }
4549 void PrintPositive(const char* name, int value) {
4550 if (value < 0) return;
4551 PrintSeparator();
4552 os_ << "{" << name << "|" << value << "}";
4553 }
4554
4555 private:
4556 std::ostream& os_;
4557 bool first_;
4558};
4559
4560
4561void DotPrinter::PrintAttributes(RegExpNode* that) {
4562 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4563 << "margin=0.1, fontsize=10, label=\"{";
4564 AttributePrinter printer(os_);
4565 NodeInfo* info = that->info();
4566 printer.PrintBit("NI", info->follows_newline_interest);
4567 printer.PrintBit("WI", info->follows_word_interest);
4568 printer.PrintBit("SI", info->follows_start_interest);
4569 Label* label = that->label();
4570 if (label->is_bound())
4571 printer.PrintPositive("@", label->pos());
4572 os_ << "}\"];\n"
4573 << " a" << that << " -> n" << that
4574 << " [style=dashed, color=grey, arrowhead=none];\n";
4575}
4576
4577
4578static const bool kPrintDispatchTable = false;
4579void DotPrinter::VisitChoice(ChoiceNode* that) {
4580 if (kPrintDispatchTable) {
4581 os_ << " n" << that << " [shape=Mrecord, label=\"";
4582 TableEntryHeaderPrinter header_printer(os_);
4583 that->GetTable(ignore_case_)->ForEach(&header_printer);
4584 os_ << "\"]\n";
4585 PrintAttributes(that);
4586 TableEntryBodyPrinter body_printer(os_, that);
4587 that->GetTable(ignore_case_)->ForEach(&body_printer);
4588 } else {
4589 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4590 for (int i = 0; i < that->alternatives()->length(); i++) {
4591 GuardedAlternative alt = that->alternatives()->at(i);
4592 os_ << " n" << that << " -> n" << alt.node();
4593 }
4594 }
4595 for (int i = 0; i < that->alternatives()->length(); i++) {
4596 GuardedAlternative alt = that->alternatives()->at(i);
4597 alt.node()->Accept(this);
4598 }
4599}
4600
4601
4602void DotPrinter::VisitText(TextNode* that) {
4603 Zone* zone = that->zone();
4604 os_ << " n" << that << " [label=\"";
4605 for (int i = 0; i < that->elements()->length(); i++) {
4606 if (i > 0) os_ << " ";
4607 TextElement elm = that->elements()->at(i);
4608 switch (elm.text_type()) {
4609 case TextElement::ATOM: {
4610 Vector<const uc16> data = elm.atom()->data();
4611 for (int i = 0; i < data.length(); i++) {
4612 os_ << static_cast<char>(data[i]);
4613 }
4614 break;
4615 }
4616 case TextElement::CHAR_CLASS: {
4617 RegExpCharacterClass* node = elm.char_class();
4618 os_ << "[";
4619 if (node->is_negated()) os_ << "^";
4620 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4621 CharacterRange range = node->ranges(zone)->at(j);
4622 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4623 }
4624 os_ << "]";
4625 break;
4626 }
4627 default:
4628 UNREACHABLE();
4629 }
4630 }
4631 os_ << "\", shape=box, peripheries=2];\n";
4632 PrintAttributes(that);
4633 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4634 Visit(that->on_success());
4635}
4636
4637
4638void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4639 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4640 << that->end_register() << "\", shape=doubleoctagon];\n";
4641 PrintAttributes(that);
4642 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4643 Visit(that->on_success());
4644}
4645
4646
4647void DotPrinter::VisitEnd(EndNode* that) {
4648 os_ << " n" << that << " [style=bold, shape=point];\n";
4649 PrintAttributes(that);
4650}
4651
4652
4653void DotPrinter::VisitAssertion(AssertionNode* that) {
4654 os_ << " n" << that << " [";
4655 switch (that->assertion_type()) {
4656 case AssertionNode::AT_END:
4657 os_ << "label=\"$\", shape=septagon";
4658 break;
4659 case AssertionNode::AT_START:
4660 os_ << "label=\"^\", shape=septagon";
4661 break;
4662 case AssertionNode::AT_BOUNDARY:
4663 os_ << "label=\"\\b\", shape=septagon";
4664 break;
4665 case AssertionNode::AT_NON_BOUNDARY:
4666 os_ << "label=\"\\B\", shape=septagon";
4667 break;
4668 case AssertionNode::AFTER_NEWLINE:
4669 os_ << "label=\"(?<=\\n)\", shape=septagon";
4670 break;
4671 }
4672 os_ << "];\n";
4673 PrintAttributes(that);
4674 RegExpNode* successor = that->on_success();
4675 os_ << " n" << that << " -> n" << successor << ";\n";
4676 Visit(successor);
4677}
4678
4679
4680void DotPrinter::VisitAction(ActionNode* that) {
4681 os_ << " n" << that << " [";
4682 switch (that->action_type_) {
4683 case ActionNode::SET_REGISTER:
4684 os_ << "label=\"$" << that->data_.u_store_register.reg
4685 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4686 break;
4687 case ActionNode::INCREMENT_REGISTER:
4688 os_ << "label=\"$" << that->data_.u_increment_register.reg
4689 << "++\", shape=octagon";
4690 break;
4691 case ActionNode::STORE_POSITION:
4692 os_ << "label=\"$" << that->data_.u_position_register.reg
4693 << ":=$pos\", shape=octagon";
4694 break;
4695 case ActionNode::BEGIN_SUBMATCH:
4696 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4697 << ":=$pos,begin\", shape=septagon";
4698 break;
4699 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4700 os_ << "label=\"escape\", shape=septagon";
4701 break;
4702 case ActionNode::EMPTY_MATCH_CHECK:
4703 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4704 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4705 << "<" << that->data_.u_empty_match_check.repetition_limit
4706 << "?\", shape=septagon";
4707 break;
4708 case ActionNode::CLEAR_CAPTURES: {
4709 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4710 << " to $" << that->data_.u_clear_captures.range_to
4711 << "\", shape=septagon";
4712 break;
4713 }
4714 }
4715 os_ << "];\n";
4716 PrintAttributes(that);
4717 RegExpNode* successor = that->on_success();
4718 os_ << " n" << that << " -> n" << successor << ";\n";
4719 Visit(successor);
4720}
4721
4722
4723class DispatchTableDumper {
4724 public:
4725 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4726 void Call(uc16 key, DispatchTable::Entry entry);
4727 private:
4728 std::ostream& os_;
4729};
4730
4731
4732void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4733 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4734 OutSet* set = entry.out_set();
4735 bool first = true;
4736 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4737 if (set->Get(i)) {
4738 if (first) {
4739 first = false;
4740 } else {
4741 os_ << ", ";
4742 }
4743 os_ << i;
4744 }
4745 }
4746 os_ << "}\n";
4747}
4748
4749
4750void DispatchTable::Dump() {
4751 OFStream os(stderr);
4752 DispatchTableDumper dumper(os);
4753 tree()->ForEach(&dumper);
4754}
4755
4756
4757void RegExpEngine::DotPrint(const char* label,
4758 RegExpNode* node,
4759 bool ignore_case) {
4760 OFStream os(stdout);
4761 DotPrinter printer(os, ignore_case);
4762 printer.PrintNode(label, node);
4763}
4764
4765
4766#endif // DEBUG
4767
4768
4769// -------------------------------------------------------------------
4770// Tree to graph conversion
4771
4772RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4773 RegExpNode* on_success) {
4774 ZoneList<TextElement>* elms =
4775 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4776 elms->Add(TextElement::Atom(this), compiler->zone());
4777 return new (compiler->zone())
4778 TextNode(elms, compiler->read_backward(), on_success);
4779}
4780
4781
4782RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4783 RegExpNode* on_success) {
4784 return new (compiler->zone())
4785 TextNode(elements(), compiler->read_backward(), on_success);
4786}
4787
4788
4789static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4790 const int* special_class,
4791 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004792 length--; // Remove final marker.
4793 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004794 DCHECK(ranges->length() != 0);
4795 DCHECK(length != 0);
4796 DCHECK(special_class[0] != 0);
4797 if (ranges->length() != (length >> 1) + 1) {
4798 return false;
4799 }
4800 CharacterRange range = ranges->at(0);
4801 if (range.from() != 0) {
4802 return false;
4803 }
4804 for (int i = 0; i < length; i += 2) {
4805 if (special_class[i] != (range.to() + 1)) {
4806 return false;
4807 }
4808 range = ranges->at((i >> 1) + 1);
4809 if (special_class[i+1] != range.from()) {
4810 return false;
4811 }
4812 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01004813 if (range.to() != String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004814 return false;
4815 }
4816 return true;
4817}
4818
4819
4820static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4821 const int* special_class,
4822 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004823 length--; // Remove final marker.
4824 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004825 if (ranges->length() * 2 != length) {
4826 return false;
4827 }
4828 for (int i = 0; i < length; i += 2) {
4829 CharacterRange range = ranges->at(i >> 1);
4830 if (range.from() != special_class[i] ||
4831 range.to() != special_class[i + 1] - 1) {
4832 return false;
4833 }
4834 }
4835 return true;
4836}
4837
4838
4839bool RegExpCharacterClass::is_standard(Zone* zone) {
4840 // TODO(lrn): Remove need for this function, by not throwing away information
4841 // along the way.
4842 if (is_negated_) {
4843 return false;
4844 }
4845 if (set_.is_standard()) {
4846 return true;
4847 }
4848 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4849 set_.set_standard_set_type('s');
4850 return true;
4851 }
4852 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4853 set_.set_standard_set_type('S');
4854 return true;
4855 }
4856 if (CompareInverseRanges(set_.ranges(zone),
4857 kLineTerminatorRanges,
4858 kLineTerminatorRangeCount)) {
4859 set_.set_standard_set_type('.');
4860 return true;
4861 }
4862 if (CompareRanges(set_.ranges(zone),
4863 kLineTerminatorRanges,
4864 kLineTerminatorRangeCount)) {
4865 set_.set_standard_set_type('n');
4866 return true;
4867 }
4868 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4869 set_.set_standard_set_type('w');
4870 return true;
4871 }
4872 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4873 set_.set_standard_set_type('W');
4874 return true;
4875 }
4876 return false;
4877}
4878
4879
Ben Murdoch097c5b22016-05-18 11:27:45 +01004880UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4881 ZoneList<CharacterRange>* base)
4882 : zone_(zone),
4883 table_(zone),
4884 bmp_(nullptr),
4885 lead_surrogates_(nullptr),
4886 trail_surrogates_(nullptr),
4887 non_bmp_(nullptr) {
4888 // The unicode range splitter categorizes given character ranges into:
4889 // - Code points from the BMP representable by one code unit.
4890 // - Code points outside the BMP that need to be split into surrogate pairs.
4891 // - Lone lead surrogates.
4892 // - Lone trail surrogates.
4893 // Lone surrogates are valid code points, even though no actual characters.
4894 // They require special matching to make sure we do not split surrogate pairs.
4895 // We use the dispatch table to accomplish this. The base range is split up
4896 // by the table by the overlay ranges, and the Call callback is used to
4897 // filter and collect ranges for each category.
4898 for (int i = 0; i < base->length(); i++) {
4899 table_.AddRange(base->at(i), kBase, zone_);
4900 }
4901 // Add overlay ranges.
4902 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4903 kBmpCodePoints, zone_);
4904 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4905 kLeadSurrogates, zone_);
4906 table_.AddRange(
4907 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4908 kTrailSurrogates, zone_);
4909 table_.AddRange(
4910 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4911 kBmpCodePoints, zone_);
4912 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4913 kNonBmpCodePoints, zone_);
4914 table_.ForEach(this);
4915}
4916
4917
4918void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4919 OutSet* outset = entry.out_set();
4920 if (!outset->Get(kBase)) return;
4921 ZoneList<CharacterRange>** target = NULL;
4922 if (outset->Get(kBmpCodePoints)) {
4923 target = &bmp_;
4924 } else if (outset->Get(kLeadSurrogates)) {
4925 target = &lead_surrogates_;
4926 } else if (outset->Get(kTrailSurrogates)) {
4927 target = &trail_surrogates_;
4928 } else {
4929 DCHECK(outset->Get(kNonBmpCodePoints));
4930 target = &non_bmp_;
4931 }
4932 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4933 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4934}
4935
4936
4937void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4938 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4939 ZoneList<CharacterRange>* bmp = splitter->bmp();
4940 if (bmp == nullptr) return;
4941 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4942 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4943}
4944
4945
4946void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4947 RegExpNode* on_success,
4948 UnicodeRangeSplitter* splitter) {
4949 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4950 if (non_bmp == nullptr) return;
4951 DCHECK(compiler->unicode());
4952 DCHECK(!compiler->one_byte());
4953 Zone* zone = compiler->zone();
4954 CharacterRange::Canonicalize(non_bmp);
4955 for (int i = 0; i < non_bmp->length(); i++) {
4956 // Match surrogate pair.
4957 // E.g. [\u10005-\u11005] becomes
4958 // \ud800[\udc05-\udfff]|
4959 // [\ud801-\ud803][\udc00-\udfff]|
4960 // \ud804[\udc00-\udc05]
4961 uc32 from = non_bmp->at(i).from();
4962 uc32 to = non_bmp->at(i).to();
4963 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4964 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4965 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4966 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4967 if (from_l == to_l) {
4968 // The lead surrogate is the same.
4969 result->AddAlternative(
4970 GuardedAlternative(TextNode::CreateForSurrogatePair(
4971 zone, CharacterRange::Singleton(from_l),
4972 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4973 on_success)));
4974 } else {
4975 if (from_t != kTrailSurrogateStart) {
4976 // Add [from_l][from_t-\udfff]
4977 result->AddAlternative(
4978 GuardedAlternative(TextNode::CreateForSurrogatePair(
4979 zone, CharacterRange::Singleton(from_l),
4980 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4981 compiler->read_backward(), on_success)));
4982 from_l++;
4983 }
4984 if (to_t != kTrailSurrogateEnd) {
4985 // Add [to_l][\udc00-to_t]
4986 result->AddAlternative(
4987 GuardedAlternative(TextNode::CreateForSurrogatePair(
4988 zone, CharacterRange::Singleton(to_l),
4989 CharacterRange::Range(kTrailSurrogateStart, to_t),
4990 compiler->read_backward(), on_success)));
4991 to_l--;
4992 }
4993 if (from_l <= to_l) {
4994 // Add [from_l-to_l][\udc00-\udfff]
4995 result->AddAlternative(
4996 GuardedAlternative(TextNode::CreateForSurrogatePair(
4997 zone, CharacterRange::Range(from_l, to_l),
4998 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4999 compiler->read_backward(), on_success)));
5000 }
5001 }
5002 }
5003}
5004
5005
5006RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5007 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5008 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5009 bool read_backward) {
5010 Zone* zone = compiler->zone();
5011 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5012 zone, match, read_backward, on_success);
5013 int stack_register = compiler->UnicodeLookaroundStackRegister();
5014 int position_register = compiler->UnicodeLookaroundPositionRegister();
5015 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5016 position_register);
5017 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5018 zone, lookbehind, !read_backward, lookaround.on_match_success());
5019 return lookaround.ForMatch(negative_match);
5020}
5021
5022
5023RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5024 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5025 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5026 bool read_backward) {
5027 Zone* zone = compiler->zone();
5028 int stack_register = compiler->UnicodeLookaroundStackRegister();
5029 int position_register = compiler->UnicodeLookaroundPositionRegister();
5030 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5031 position_register);
5032 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5033 zone, lookahead, read_backward, lookaround.on_match_success());
5034 return TextNode::CreateForCharacterRanges(
5035 zone, match, read_backward, lookaround.ForMatch(negative_match));
5036}
5037
5038
5039void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5040 RegExpNode* on_success,
5041 UnicodeRangeSplitter* splitter) {
5042 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5043 if (lead_surrogates == nullptr) return;
5044 Zone* zone = compiler->zone();
5045 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5046 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5047 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5048
5049 RegExpNode* match;
5050 if (compiler->read_backward()) {
5051 // Reading backward. Assert that reading forward, there is no trail
5052 // surrogate, and then backward match the lead surrogate.
5053 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5054 compiler, trail_surrogates, lead_surrogates, on_success, true);
5055 } else {
5056 // Reading forward. Forward match the lead surrogate and assert that
5057 // no trail surrogate follows.
5058 match = MatchAndNegativeLookaroundInReadDirection(
5059 compiler, lead_surrogates, trail_surrogates, on_success, false);
5060 }
5061 result->AddAlternative(GuardedAlternative(match));
5062}
5063
5064
5065void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5066 RegExpNode* on_success,
5067 UnicodeRangeSplitter* splitter) {
5068 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5069 if (trail_surrogates == nullptr) return;
5070 Zone* zone = compiler->zone();
5071 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5072 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5073 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5074
5075 RegExpNode* match;
5076 if (compiler->read_backward()) {
5077 // Reading backward. Backward match the trail surrogate and assert that no
5078 // lead surrogate precedes it.
5079 match = MatchAndNegativeLookaroundInReadDirection(
5080 compiler, trail_surrogates, lead_surrogates, on_success, true);
5081 } else {
5082 // Reading forward. Assert that reading backward, there is no lead
5083 // surrogate, and then forward match the trail surrogate.
5084 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5085 compiler, lead_surrogates, trail_surrogates, on_success, false);
5086 }
5087 result->AddAlternative(GuardedAlternative(match));
5088}
5089
5090RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5091 RegExpNode* on_success) {
5092 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5093 DCHECK(!compiler->read_backward());
5094 Zone* zone = compiler->zone();
5095 // Advance any character. If the character happens to be a lead surrogate and
5096 // we advanced into the middle of a surrogate pair, it will work out, as
5097 // nothing will match from there. We will have to advance again, consuming
5098 // the associated trail surrogate.
5099 ZoneList<CharacterRange>* range = CharacterRange::List(
5100 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5101 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5102}
5103
5104
5105void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5106 ZoneList<CharacterRange>* ranges) {
5107#ifdef V8_I18N_SUPPORT
5108 // Use ICU to compute the case fold closure over the ranges.
5109 DCHECK(compiler->unicode());
5110 DCHECK(compiler->ignore_case());
5111 USet* set = uset_openEmpty();
5112 for (int i = 0; i < ranges->length(); i++) {
5113 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5114 }
5115 ranges->Clear();
5116 uset_closeOver(set, USET_CASE_INSENSITIVE);
5117 // Full case mapping map single characters to multiple characters.
5118 // Those are represented as strings in the set. Remove them so that
5119 // we end up with only simple and common case mappings.
5120 uset_removeAllStrings(set);
5121 int item_count = uset_getItemCount(set);
5122 int item_result = 0;
5123 UErrorCode ec = U_ZERO_ERROR;
5124 Zone* zone = compiler->zone();
5125 for (int i = 0; i < item_count; i++) {
5126 uc32 start = 0;
5127 uc32 end = 0;
5128 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5129 ranges->Add(CharacterRange::Range(start, end), zone);
5130 }
5131 // No errors and everything we collected have been ranges.
5132 DCHECK_EQ(U_ZERO_ERROR, ec);
5133 DCHECK_EQ(0, item_result);
5134 uset_close(set);
5135#else
5136 // Fallback if ICU is not included.
5137 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5138 ranges, compiler->one_byte());
5139#endif // V8_I18N_SUPPORT
5140 CharacterRange::Canonicalize(ranges);
5141}
5142
5143
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005144RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5145 RegExpNode* on_success) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005146 set_.Canonicalize();
5147 Zone* zone = compiler->zone();
5148 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5149 if (compiler->unicode() && compiler->ignore_case()) {
5150 AddUnicodeCaseEquivalents(compiler, ranges);
5151 }
5152 if (compiler->unicode() && !compiler->one_byte()) {
5153 if (is_negated()) {
5154 ZoneList<CharacterRange>* negated =
5155 new (zone) ZoneList<CharacterRange>(2, zone);
5156 CharacterRange::Negate(ranges, negated, zone);
5157 ranges = negated;
5158 }
5159 if (ranges->length() == 0) {
5160 // No matches possible.
5161 return new (zone) EndNode(EndNode::BACKTRACK, zone);
5162 }
5163 if (standard_type() == '*') {
5164 return UnanchoredAdvance(compiler, on_success);
5165 } else {
5166 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5167 UnicodeRangeSplitter splitter(zone, ranges);
5168 AddBmpCharacters(compiler, result, on_success, &splitter);
5169 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5170 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5171 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5172 return result;
5173 }
5174 } else {
5175 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5176 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005177}
5178
5179
5180int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5181 RegExpAtom* atom1 = (*a)->AsAtom();
5182 RegExpAtom* atom2 = (*b)->AsAtom();
5183 uc16 character1 = atom1->data().at(0);
5184 uc16 character2 = atom2->data().at(0);
5185 if (character1 < character2) return -1;
5186 if (character1 > character2) return 1;
5187 return 0;
5188}
5189
5190
5191static unibrow::uchar Canonical(
5192 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5193 unibrow::uchar c) {
5194 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5195 int length = canonicalize->get(c, '\0', chars);
5196 DCHECK_LE(length, 1);
5197 unibrow::uchar canonical = c;
5198 if (length == 1) canonical = chars[0];
5199 return canonical;
5200}
5201
5202
5203int CompareFirstCharCaseIndependent(
5204 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5205 RegExpTree* const* a, RegExpTree* const* b) {
5206 RegExpAtom* atom1 = (*a)->AsAtom();
5207 RegExpAtom* atom2 = (*b)->AsAtom();
5208 unibrow::uchar character1 = atom1->data().at(0);
5209 unibrow::uchar character2 = atom2->data().at(0);
5210 if (character1 == character2) return 0;
5211 if (character1 >= 'a' || character2 >= 'a') {
5212 character1 = Canonical(canonicalize, character1);
5213 character2 = Canonical(canonicalize, character2);
5214 }
5215 return static_cast<int>(character1) - static_cast<int>(character2);
5216}
5217
5218
5219// We can stable sort runs of atoms, since the order does not matter if they
5220// start with different characters.
5221// Returns true if any consecutive atoms were found.
5222bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5223 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5224 int length = alternatives->length();
5225 bool found_consecutive_atoms = false;
5226 for (int i = 0; i < length; i++) {
5227 while (i < length) {
5228 RegExpTree* alternative = alternatives->at(i);
5229 if (alternative->IsAtom()) break;
5230 i++;
5231 }
5232 // i is length or it is the index of an atom.
5233 if (i == length) break;
5234 int first_atom = i;
5235 i++;
5236 while (i < length) {
5237 RegExpTree* alternative = alternatives->at(i);
5238 if (!alternative->IsAtom()) break;
5239 i++;
5240 }
5241 // Sort atoms to get ones with common prefixes together.
5242 // This step is more tricky if we are in a case-independent regexp,
5243 // because it would change /is|I/ to /I|is/, and order matters when
5244 // the regexp parts don't match only disjoint starting points. To fix
5245 // this we have a version of CompareFirstChar that uses case-
5246 // independent character classes for comparison.
5247 DCHECK_LT(first_atom, alternatives->length());
5248 DCHECK_LE(i, alternatives->length());
5249 DCHECK_LE(first_atom, i);
5250 if (compiler->ignore_case()) {
5251 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5252 compiler->isolate()->regexp_macro_assembler_canonicalize();
5253 auto compare_closure =
5254 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5255 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5256 };
5257 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5258 } else {
5259 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5260 }
5261 if (i - first_atom > 1) found_consecutive_atoms = true;
5262 }
5263 return found_consecutive_atoms;
5264}
5265
5266
5267// Optimizes ab|ac|az to a(?:b|c|d).
5268void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5269 Zone* zone = compiler->zone();
5270 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5271 int length = alternatives->length();
5272
5273 int write_posn = 0;
5274 int i = 0;
5275 while (i < length) {
5276 RegExpTree* alternative = alternatives->at(i);
5277 if (!alternative->IsAtom()) {
5278 alternatives->at(write_posn++) = alternatives->at(i);
5279 i++;
5280 continue;
5281 }
5282 RegExpAtom* atom = alternative->AsAtom();
5283 unibrow::uchar common_prefix = atom->data().at(0);
5284 int first_with_prefix = i;
5285 int prefix_length = atom->length();
5286 i++;
5287 while (i < length) {
5288 alternative = alternatives->at(i);
5289 if (!alternative->IsAtom()) break;
5290 atom = alternative->AsAtom();
5291 unibrow::uchar new_prefix = atom->data().at(0);
5292 if (new_prefix != common_prefix) {
5293 if (!compiler->ignore_case()) break;
5294 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5295 compiler->isolate()->regexp_macro_assembler_canonicalize();
5296 new_prefix = Canonical(canonicalize, new_prefix);
5297 common_prefix = Canonical(canonicalize, common_prefix);
5298 if (new_prefix != common_prefix) break;
5299 }
5300 prefix_length = Min(prefix_length, atom->length());
5301 i++;
5302 }
5303 if (i > first_with_prefix + 2) {
5304 // Found worthwhile run of alternatives with common prefix of at least one
5305 // character. The sorting function above did not sort on more than one
5306 // character for reasons of correctness, but there may still be a longer
5307 // common prefix if the terms were similar or presorted in the input.
5308 // Find out how long the common prefix is.
5309 int run_length = i - first_with_prefix;
5310 atom = alternatives->at(first_with_prefix)->AsAtom();
5311 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5312 RegExpAtom* old_atom =
5313 alternatives->at(j + first_with_prefix)->AsAtom();
5314 for (int k = 1; k < prefix_length; k++) {
5315 if (atom->data().at(k) != old_atom->data().at(k)) {
5316 prefix_length = k;
5317 break;
5318 }
5319 }
5320 }
5321 RegExpAtom* prefix =
5322 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5323 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5324 pair->Add(prefix, zone);
5325 ZoneList<RegExpTree*>* suffixes =
5326 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5327 for (int j = 0; j < run_length; j++) {
5328 RegExpAtom* old_atom =
5329 alternatives->at(j + first_with_prefix)->AsAtom();
5330 int len = old_atom->length();
5331 if (len == prefix_length) {
5332 suffixes->Add(new (zone) RegExpEmpty(), zone);
5333 } else {
5334 RegExpTree* suffix = new (zone) RegExpAtom(
5335 old_atom->data().SubVector(prefix_length, old_atom->length()));
5336 suffixes->Add(suffix, zone);
5337 }
5338 }
5339 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5340 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5341 } else {
5342 // Just copy any non-worthwhile alternatives.
5343 for (int j = first_with_prefix; j < i; j++) {
5344 alternatives->at(write_posn++) = alternatives->at(j);
5345 }
5346 }
5347 }
5348 alternatives->Rewind(write_posn); // Trim end of array.
5349}
5350
5351
5352// Optimizes b|c|z to [bcz].
5353void RegExpDisjunction::FixSingleCharacterDisjunctions(
5354 RegExpCompiler* compiler) {
5355 Zone* zone = compiler->zone();
5356 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5357 int length = alternatives->length();
5358
5359 int write_posn = 0;
5360 int i = 0;
5361 while (i < length) {
5362 RegExpTree* alternative = alternatives->at(i);
5363 if (!alternative->IsAtom()) {
5364 alternatives->at(write_posn++) = alternatives->at(i);
5365 i++;
5366 continue;
5367 }
5368 RegExpAtom* atom = alternative->AsAtom();
5369 if (atom->length() != 1) {
5370 alternatives->at(write_posn++) = alternatives->at(i);
5371 i++;
5372 continue;
5373 }
5374 int first_in_run = i;
5375 i++;
5376 while (i < length) {
5377 alternative = alternatives->at(i);
5378 if (!alternative->IsAtom()) break;
5379 atom = alternative->AsAtom();
5380 if (atom->length() != 1) break;
5381 i++;
5382 }
5383 if (i > first_in_run + 1) {
5384 // Found non-trivial run of single-character alternatives.
5385 int run_length = i - first_in_run;
5386 ZoneList<CharacterRange>* ranges =
5387 new (zone) ZoneList<CharacterRange>(2, zone);
5388 for (int j = 0; j < run_length; j++) {
5389 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5390 DCHECK_EQ(old_atom->length(), 1);
5391 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5392 }
5393 alternatives->at(write_posn++) =
5394 new (zone) RegExpCharacterClass(ranges, false);
5395 } else {
5396 // Just copy any trivial alternatives.
5397 for (int j = first_in_run; j < i; j++) {
5398 alternatives->at(write_posn++) = alternatives->at(j);
5399 }
5400 }
5401 }
5402 alternatives->Rewind(write_posn); // Trim end of array.
5403}
5404
5405
5406RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5407 RegExpNode* on_success) {
5408 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5409
5410 if (alternatives->length() > 2) {
5411 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5412 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5413 FixSingleCharacterDisjunctions(compiler);
5414 if (alternatives->length() == 1) {
5415 return alternatives->at(0)->ToNode(compiler, on_success);
5416 }
5417 }
5418
5419 int length = alternatives->length();
5420
5421 ChoiceNode* result =
5422 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5423 for (int i = 0; i < length; i++) {
5424 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5425 on_success));
5426 result->AddAlternative(alternative);
5427 }
5428 return result;
5429}
5430
5431
5432RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5433 RegExpNode* on_success) {
5434 return ToNode(min(),
5435 max(),
5436 is_greedy(),
5437 body(),
5438 compiler,
5439 on_success);
5440}
5441
5442
5443// Scoped object to keep track of how much we unroll quantifier loops in the
5444// regexp graph generator.
5445class RegExpExpansionLimiter {
5446 public:
5447 static const int kMaxExpansionFactor = 6;
5448 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5449 : compiler_(compiler),
5450 saved_expansion_factor_(compiler->current_expansion_factor()),
5451 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5452 DCHECK(factor > 0);
5453 if (ok_to_expand_) {
5454 if (factor > kMaxExpansionFactor) {
5455 // Avoid integer overflow of the current expansion factor.
5456 ok_to_expand_ = false;
5457 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5458 } else {
5459 int new_factor = saved_expansion_factor_ * factor;
5460 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5461 compiler->set_current_expansion_factor(new_factor);
5462 }
5463 }
5464 }
5465
5466 ~RegExpExpansionLimiter() {
5467 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5468 }
5469
5470 bool ok_to_expand() { return ok_to_expand_; }
5471
5472 private:
5473 RegExpCompiler* compiler_;
5474 int saved_expansion_factor_;
5475 bool ok_to_expand_;
5476
5477 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5478};
5479
5480
5481RegExpNode* RegExpQuantifier::ToNode(int min,
5482 int max,
5483 bool is_greedy,
5484 RegExpTree* body,
5485 RegExpCompiler* compiler,
5486 RegExpNode* on_success,
5487 bool not_at_start) {
5488 // x{f, t} becomes this:
5489 //
5490 // (r++)<-.
5491 // | `
5492 // | (x)
5493 // v ^
5494 // (r=0)-->(?)---/ [if r < t]
5495 // |
5496 // [if r >= f] \----> ...
5497 //
5498
5499 // 15.10.2.5 RepeatMatcher algorithm.
5500 // The parser has already eliminated the case where max is 0. In the case
5501 // where max_match is zero the parser has removed the quantifier if min was
5502 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5503
5504 // If we know that we cannot match zero length then things are a little
5505 // simpler since we don't need to make the special zero length match check
5506 // from step 2.1. If the min and max are small we can unroll a little in
5507 // this case.
5508 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5509 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5510 if (max == 0) return on_success; // This can happen due to recursion.
5511 bool body_can_be_empty = (body->min_match() == 0);
5512 int body_start_reg = RegExpCompiler::kNoRegister;
5513 Interval capture_registers = body->CaptureRegisters();
5514 bool needs_capture_clearing = !capture_registers.is_empty();
5515 Zone* zone = compiler->zone();
5516
5517 if (body_can_be_empty) {
5518 body_start_reg = compiler->AllocateRegister();
5519 } else if (compiler->optimize() && !needs_capture_clearing) {
5520 // Only unroll if there are no captures and the body can't be
5521 // empty.
5522 {
5523 RegExpExpansionLimiter limiter(
5524 compiler, min + ((max != min) ? 1 : 0));
5525 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5526 int new_max = (max == kInfinity) ? max : max - min;
5527 // Recurse once to get the loop or optional matches after the fixed
5528 // ones.
5529 RegExpNode* answer = ToNode(
5530 0, new_max, is_greedy, body, compiler, on_success, true);
5531 // Unroll the forced matches from 0 to min. This can cause chains of
5532 // TextNodes (which the parser does not generate). These should be
5533 // combined if it turns out they hinder good code generation.
5534 for (int i = 0; i < min; i++) {
5535 answer = body->ToNode(compiler, answer);
5536 }
5537 return answer;
5538 }
5539 }
5540 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5541 DCHECK(max > 0); // Due to the 'if' above.
5542 RegExpExpansionLimiter limiter(compiler, max);
5543 if (limiter.ok_to_expand()) {
5544 // Unroll the optional matches up to max.
5545 RegExpNode* answer = on_success;
5546 for (int i = 0; i < max; i++) {
5547 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5548 if (is_greedy) {
5549 alternation->AddAlternative(
5550 GuardedAlternative(body->ToNode(compiler, answer)));
5551 alternation->AddAlternative(GuardedAlternative(on_success));
5552 } else {
5553 alternation->AddAlternative(GuardedAlternative(on_success));
5554 alternation->AddAlternative(
5555 GuardedAlternative(body->ToNode(compiler, answer)));
5556 }
5557 answer = alternation;
5558 if (not_at_start && !compiler->read_backward()) {
5559 alternation->set_not_at_start();
5560 }
5561 }
5562 return answer;
5563 }
5564 }
5565 }
5566 bool has_min = min > 0;
5567 bool has_max = max < RegExpTree::kInfinity;
5568 bool needs_counter = has_min || has_max;
5569 int reg_ctr = needs_counter
5570 ? compiler->AllocateRegister()
5571 : RegExpCompiler::kNoRegister;
5572 LoopChoiceNode* center = new (zone)
5573 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5574 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5575 RegExpNode* loop_return = needs_counter
5576 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5577 : static_cast<RegExpNode*>(center);
5578 if (body_can_be_empty) {
5579 // If the body can be empty we need to check if it was and then
5580 // backtrack.
5581 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5582 reg_ctr,
5583 min,
5584 loop_return);
5585 }
5586 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5587 if (body_can_be_empty) {
5588 // If the body can be empty we need to store the start position
5589 // so we can bail out if it was empty.
5590 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5591 }
5592 if (needs_capture_clearing) {
5593 // Before entering the body of this loop we need to clear captures.
5594 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5595 }
5596 GuardedAlternative body_alt(body_node);
5597 if (has_max) {
5598 Guard* body_guard =
5599 new(zone) Guard(reg_ctr, Guard::LT, max);
5600 body_alt.AddGuard(body_guard, zone);
5601 }
5602 GuardedAlternative rest_alt(on_success);
5603 if (has_min) {
5604 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5605 rest_alt.AddGuard(rest_guard, zone);
5606 }
5607 if (is_greedy) {
5608 center->AddLoopAlternative(body_alt);
5609 center->AddContinueAlternative(rest_alt);
5610 } else {
5611 center->AddContinueAlternative(rest_alt);
5612 center->AddLoopAlternative(body_alt);
5613 }
5614 if (needs_counter) {
5615 return ActionNode::SetRegister(reg_ctr, 0, center);
5616 } else {
5617 return center;
5618 }
5619}
5620
5621
5622RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5623 RegExpNode* on_success) {
5624 NodeInfo info;
5625 Zone* zone = compiler->zone();
5626
5627 switch (assertion_type()) {
5628 case START_OF_LINE:
5629 return AssertionNode::AfterNewline(on_success);
5630 case START_OF_INPUT:
5631 return AssertionNode::AtStart(on_success);
5632 case BOUNDARY:
5633 return AssertionNode::AtBoundary(on_success);
5634 case NON_BOUNDARY:
5635 return AssertionNode::AtNonBoundary(on_success);
5636 case END_OF_INPUT:
5637 return AssertionNode::AtEnd(on_success);
5638 case END_OF_LINE: {
5639 // Compile $ in multiline regexps as an alternation with a positive
5640 // lookahead in one side and an end-of-input on the other side.
5641 // We need two registers for the lookahead.
5642 int stack_pointer_register = compiler->AllocateRegister();
5643 int position_register = compiler->AllocateRegister();
5644 // The ChoiceNode to distinguish between a newline and end-of-input.
5645 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5646 // Create a newline atom.
5647 ZoneList<CharacterRange>* newline_ranges =
5648 new(zone) ZoneList<CharacterRange>(3, zone);
5649 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5650 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5651 TextNode* newline_matcher = new (zone) TextNode(
5652 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5653 stack_pointer_register, position_register,
5654 0, // No captures inside.
5655 -1, // Ignored if no captures.
5656 on_success));
5657 // Create an end-of-input matcher.
5658 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5659 stack_pointer_register,
5660 position_register,
5661 newline_matcher);
5662 // Add the two alternatives to the ChoiceNode.
5663 GuardedAlternative eol_alternative(end_of_line);
5664 result->AddAlternative(eol_alternative);
5665 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5666 result->AddAlternative(end_alternative);
5667 return result;
5668 }
5669 default:
5670 UNREACHABLE();
5671 }
5672 return on_success;
5673}
5674
5675
5676RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5677 RegExpNode* on_success) {
5678 return new (compiler->zone())
5679 BackReferenceNode(RegExpCapture::StartRegister(index()),
5680 RegExpCapture::EndRegister(index()),
5681 compiler->read_backward(), on_success);
5682}
5683
5684
5685RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5686 RegExpNode* on_success) {
5687 return on_success;
5688}
5689
5690
Ben Murdoch097c5b22016-05-18 11:27:45 +01005691RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5692 int stack_pointer_register,
5693 int position_register,
5694 int capture_register_count,
5695 int capture_register_start)
5696 : is_positive_(is_positive),
5697 on_success_(on_success),
5698 stack_pointer_register_(stack_pointer_register),
5699 position_register_(position_register) {
5700 if (is_positive_) {
5701 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5702 stack_pointer_register, position_register, capture_register_count,
5703 capture_register_start, on_success_);
5704 } else {
5705 Zone* zone = on_success_->zone();
5706 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5707 stack_pointer_register, position_register, capture_register_count,
5708 capture_register_start, zone);
5709 }
5710}
5711
5712
5713RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5714 if (is_positive_) {
5715 return ActionNode::BeginSubmatch(stack_pointer_register_,
5716 position_register_, match);
5717 } else {
5718 Zone* zone = on_success_->zone();
5719 // We use a ChoiceNode to represent the negative lookaround. The first
5720 // alternative is the negative match. On success, the end node backtracks.
5721 // On failure, the second alternative is tried and leads to success.
5722 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5723 // first exit when calculating quick checks.
5724 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5725 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5726 return ActionNode::BeginSubmatch(stack_pointer_register_,
5727 position_register_, choice_node);
5728 }
5729}
5730
5731
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005732RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5733 RegExpNode* on_success) {
5734 int stack_pointer_register = compiler->AllocateRegister();
5735 int position_register = compiler->AllocateRegister();
5736
5737 const int registers_per_capture = 2;
5738 const int register_of_first_capture = 2;
5739 int register_count = capture_count_ * registers_per_capture;
5740 int register_start =
5741 register_of_first_capture + capture_from_ * registers_per_capture;
5742
5743 RegExpNode* result;
5744 bool was_reading_backward = compiler->read_backward();
5745 compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005746 Builder builder(is_positive(), on_success, stack_pointer_register,
5747 position_register, register_count, register_start);
5748 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5749 result = builder.ForMatch(match);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005750 compiler->set_read_backward(was_reading_backward);
5751 return result;
5752}
5753
5754
5755RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5756 RegExpNode* on_success) {
5757 return ToNode(body(), index(), compiler, on_success);
5758}
5759
5760
5761RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5762 int index,
5763 RegExpCompiler* compiler,
5764 RegExpNode* on_success) {
5765 DCHECK_NOT_NULL(body);
5766 int start_reg = RegExpCapture::StartRegister(index);
5767 int end_reg = RegExpCapture::EndRegister(index);
5768 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5769 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5770 RegExpNode* body_node = body->ToNode(compiler, store_end);
5771 return ActionNode::StorePosition(start_reg, true, body_node);
5772}
5773
5774
5775RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5776 RegExpNode* on_success) {
5777 ZoneList<RegExpTree*>* children = nodes();
5778 RegExpNode* current = on_success;
5779 if (compiler->read_backward()) {
5780 for (int i = 0; i < children->length(); i++) {
5781 current = children->at(i)->ToNode(compiler, current);
5782 }
5783 } else {
5784 for (int i = children->length() - 1; i >= 0; i--) {
5785 current = children->at(i)->ToNode(compiler, current);
5786 }
5787 }
5788 return current;
5789}
5790
5791
5792static void AddClass(const int* elmv,
5793 int elmc,
5794 ZoneList<CharacterRange>* ranges,
5795 Zone* zone) {
5796 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005797 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005798 for (int i = 0; i < elmc; i += 2) {
5799 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005800 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005801 }
5802}
5803
5804
5805static void AddClassNegated(const int *elmv,
5806 int elmc,
5807 ZoneList<CharacterRange>* ranges,
5808 Zone* zone) {
5809 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005810 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005811 DCHECK(elmv[0] != 0x0000);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005812 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005813 uc16 last = 0x0000;
5814 for (int i = 0; i < elmc; i += 2) {
5815 DCHECK(last <= elmv[i] - 1);
5816 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005817 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005818 last = elmv[i + 1];
5819 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005820 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005821}
5822
5823
5824void CharacterRange::AddClassEscape(uc16 type,
5825 ZoneList<CharacterRange>* ranges,
5826 Zone* zone) {
5827 switch (type) {
5828 case 's':
5829 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5830 break;
5831 case 'S':
5832 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5833 break;
5834 case 'w':
5835 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5836 break;
5837 case 'W':
5838 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5839 break;
5840 case 'd':
5841 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5842 break;
5843 case 'D':
5844 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5845 break;
5846 case '.':
5847 AddClassNegated(kLineTerminatorRanges,
5848 kLineTerminatorRangeCount,
5849 ranges,
5850 zone);
5851 break;
5852 // This is not a character range as defined by the spec but a
5853 // convenient shorthand for a character class that matches any
5854 // character.
5855 case '*':
5856 ranges->Add(CharacterRange::Everything(), zone);
5857 break;
5858 // This is the set of characters matched by the $ and ^ symbols
5859 // in multiline mode.
5860 case 'n':
5861 AddClass(kLineTerminatorRanges,
5862 kLineTerminatorRangeCount,
5863 ranges,
5864 zone);
5865 break;
5866 default:
5867 UNREACHABLE();
5868 }
5869}
5870
5871
5872Vector<const int> CharacterRange::GetWordBounds() {
5873 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5874}
5875
5876
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005877void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5878 ZoneList<CharacterRange>* ranges,
5879 bool is_one_byte) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005880 int range_count = ranges->length();
5881 for (int i = 0; i < range_count; i++) {
5882 CharacterRange range = ranges->at(i);
5883 uc32 bottom = range.from();
5884 if (bottom > String::kMaxUtf16CodeUnit) return;
5885 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5886 // Nothing to be done for surrogates.
5887 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5888 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5889 if (bottom > String::kMaxOneByteCharCode) return;
5890 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005891 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005892 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5893 if (top == bottom) {
5894 // If this is a singleton we just expand the one character.
5895 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005896 for (int i = 0; i < length; i++) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005897 uc32 chr = chars[i];
5898 if (chr != bottom) {
5899 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005900 }
5901 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005902 } else {
5903 // If this is a range we expand the characters block by block, expanding
5904 // contiguous subranges (blocks) one at a time. The approach is as
5905 // follows. For a given start character we look up the remainder of the
5906 // block that contains it (represented by the end point), for instance we
5907 // find 'z' if the character is 'c'. A block is characterized by the
5908 // property that all characters uncanonicalize in the same way, except
5909 // that each entry in the result is incremented by the distance from the
5910 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5911 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5912 // we've found the end point we look up its uncanonicalization and
5913 // produce a range for each element. For instance for [c-f] we look up
5914 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5915 // it is not already contained in the input, so [c-f] will be skipped but
5916 // [C-F] will be added. If this range is not completely contained in a
5917 // block we do this for all the blocks covered by the range (handling
5918 // characters that is not in a block as a "singleton block").
5919 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5920 int pos = bottom;
5921 while (pos <= top) {
5922 int length =
5923 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5924 uc32 block_end;
5925 if (length == 0) {
5926 block_end = pos;
5927 } else {
5928 DCHECK_EQ(1, length);
5929 block_end = equivalents[0];
5930 }
5931 int end = (block_end > top) ? top : block_end;
5932 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5933 equivalents);
5934 for (int i = 0; i < length; i++) {
5935 uc32 c = equivalents[i];
5936 uc32 range_from = c - (block_end - pos);
5937 uc32 range_to = c - (block_end - end);
5938 if (!(bottom <= range_from && range_to <= top)) {
5939 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5940 }
5941 }
5942 pos = end + 1;
5943 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005944 }
5945 }
5946}
5947
5948
5949bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5950 DCHECK_NOT_NULL(ranges);
5951 int n = ranges->length();
5952 if (n <= 1) return true;
5953 int max = ranges->at(0).to();
5954 for (int i = 1; i < n; i++) {
5955 CharacterRange next_range = ranges->at(i);
5956 if (next_range.from() <= max + 1) return false;
5957 max = next_range.to();
5958 }
5959 return true;
5960}
5961
5962
5963ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5964 if (ranges_ == NULL) {
5965 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5966 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5967 }
5968 return ranges_;
5969}
5970
5971
5972// Move a number of elements in a zonelist to another position
5973// in the same list. Handles overlapping source and target areas.
5974static void MoveRanges(ZoneList<CharacterRange>* list,
5975 int from,
5976 int to,
5977 int count) {
5978 // Ranges are potentially overlapping.
5979 if (from < to) {
5980 for (int i = count - 1; i >= 0; i--) {
5981 list->at(to + i) = list->at(from + i);
5982 }
5983 } else {
5984 for (int i = 0; i < count; i++) {
5985 list->at(to + i) = list->at(from + i);
5986 }
5987 }
5988}
5989
5990
5991static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5992 int count,
5993 CharacterRange insert) {
5994 // Inserts a range into list[0..count[, which must be sorted
5995 // by from value and non-overlapping and non-adjacent, using at most
5996 // list[0..count] for the result. Returns the number of resulting
5997 // canonicalized ranges. Inserting a range may collapse existing ranges into
5998 // fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch097c5b22016-05-18 11:27:45 +01005999 uc32 from = insert.from();
6000 uc32 to = insert.to();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006001 int start_pos = 0;
6002 int end_pos = count;
6003 for (int i = count - 1; i >= 0; i--) {
6004 CharacterRange current = list->at(i);
6005 if (current.from() > to + 1) {
6006 end_pos = i;
6007 } else if (current.to() + 1 < from) {
6008 start_pos = i + 1;
6009 break;
6010 }
6011 }
6012
6013 // Inserted range overlaps, or is adjacent to, ranges at positions
6014 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6015 // not affected by the insertion.
6016 // If start_pos == end_pos, the range must be inserted before start_pos.
6017 // if start_pos < end_pos, the entire range from start_pos to end_pos
6018 // must be merged with the insert range.
6019
6020 if (start_pos == end_pos) {
6021 // Insert between existing ranges at position start_pos.
6022 if (start_pos < count) {
6023 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6024 }
6025 list->at(start_pos) = insert;
6026 return count + 1;
6027 }
6028 if (start_pos + 1 == end_pos) {
6029 // Replace single existing range at position start_pos.
6030 CharacterRange to_replace = list->at(start_pos);
6031 int new_from = Min(to_replace.from(), from);
6032 int new_to = Max(to_replace.to(), to);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006033 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006034 return count;
6035 }
6036 // Replace a number of existing ranges from start_pos to end_pos - 1.
6037 // Move the remaining ranges down.
6038
6039 int new_from = Min(list->at(start_pos).from(), from);
6040 int new_to = Max(list->at(end_pos - 1).to(), to);
6041 if (end_pos < count) {
6042 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6043 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006044 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006045 return count - (end_pos - start_pos) + 1;
6046}
6047
6048
6049void CharacterSet::Canonicalize() {
6050 // Special/default classes are always considered canonical. The result
6051 // of calling ranges() will be sorted.
6052 if (ranges_ == NULL) return;
6053 CharacterRange::Canonicalize(ranges_);
6054}
6055
6056
6057void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6058 if (character_ranges->length() <= 1) return;
6059 // Check whether ranges are already canonical (increasing, non-overlapping,
6060 // non-adjacent).
6061 int n = character_ranges->length();
6062 int max = character_ranges->at(0).to();
6063 int i = 1;
6064 while (i < n) {
6065 CharacterRange current = character_ranges->at(i);
6066 if (current.from() <= max + 1) {
6067 break;
6068 }
6069 max = current.to();
6070 i++;
6071 }
6072 // Canonical until the i'th range. If that's all of them, we are done.
6073 if (i == n) return;
6074
6075 // The ranges at index i and forward are not canonicalized. Make them so by
6076 // doing the equivalent of insertion sort (inserting each into the previous
6077 // list, in order).
6078 // Notice that inserting a range can reduce the number of ranges in the
6079 // result due to combining of adjacent and overlapping ranges.
6080 int read = i; // Range to insert.
6081 int num_canonical = i; // Length of canonicalized part of list.
6082 do {
6083 num_canonical = InsertRangeInCanonicalList(character_ranges,
6084 num_canonical,
6085 character_ranges->at(read));
6086 read++;
6087 } while (read < n);
6088 character_ranges->Rewind(num_canonical);
6089
6090 DCHECK(CharacterRange::IsCanonical(character_ranges));
6091}
6092
6093
6094void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6095 ZoneList<CharacterRange>* negated_ranges,
6096 Zone* zone) {
6097 DCHECK(CharacterRange::IsCanonical(ranges));
6098 DCHECK_EQ(0, negated_ranges->length());
6099 int range_count = ranges->length();
Ben Murdoch097c5b22016-05-18 11:27:45 +01006100 uc32 from = 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006101 int i = 0;
6102 if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006103 from = ranges->at(0).to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006104 i = 1;
6105 }
6106 while (i < range_count) {
6107 CharacterRange range = ranges->at(i);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006108 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6109 from = range.to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006110 i++;
6111 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006112 if (from < String::kMaxCodePoint) {
6113 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006114 zone);
6115 }
6116}
6117
6118
6119// -------------------------------------------------------------------
6120// Splay tree
6121
6122
6123OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6124 if (Get(value))
6125 return this;
6126 if (successors(zone) != NULL) {
6127 for (int i = 0; i < successors(zone)->length(); i++) {
6128 OutSet* successor = successors(zone)->at(i);
6129 if (successor->Get(value))
6130 return successor;
6131 }
6132 } else {
6133 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6134 }
6135 OutSet* result = new(zone) OutSet(first_, remaining_);
6136 result->Set(value, zone);
6137 successors(zone)->Add(result, zone);
6138 return result;
6139}
6140
6141
6142void OutSet::Set(unsigned value, Zone *zone) {
6143 if (value < kFirstLimit) {
6144 first_ |= (1 << value);
6145 } else {
6146 if (remaining_ == NULL)
6147 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6148 if (remaining_->is_empty() || !remaining_->Contains(value))
6149 remaining_->Add(value, zone);
6150 }
6151}
6152
6153
6154bool OutSet::Get(unsigned value) const {
6155 if (value < kFirstLimit) {
6156 return (first_ & (1 << value)) != 0;
6157 } else if (remaining_ == NULL) {
6158 return false;
6159 } else {
6160 return remaining_->Contains(value);
6161 }
6162}
6163
6164
Ben Murdoch097c5b22016-05-18 11:27:45 +01006165const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006166
6167
6168void DispatchTable::AddRange(CharacterRange full_range, int value,
6169 Zone* zone) {
6170 CharacterRange current = full_range;
6171 if (tree()->is_empty()) {
6172 // If this is the first range we just insert into the table.
6173 ZoneSplayTree<Config>::Locator loc;
6174 bool inserted = tree()->Insert(current.from(), &loc);
6175 DCHECK(inserted);
6176 USE(inserted);
6177 loc.set_value(Entry(current.from(), current.to(),
6178 empty()->Extend(value, zone)));
6179 return;
6180 }
6181 // First see if there is a range to the left of this one that
6182 // overlaps.
6183 ZoneSplayTree<Config>::Locator loc;
6184 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6185 Entry* entry = &loc.value();
6186 // If we've found a range that overlaps with this one, and it
6187 // starts strictly to the left of this one, we have to fix it
6188 // because the following code only handles ranges that start on
6189 // or after the start point of the range we're adding.
6190 if (entry->from() < current.from() && entry->to() >= current.from()) {
6191 // Snap the overlapping range in half around the start point of
6192 // the range we're adding.
Ben Murdoch097c5b22016-05-18 11:27:45 +01006193 CharacterRange left =
6194 CharacterRange::Range(entry->from(), current.from() - 1);
6195 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006196 // The left part of the overlapping range doesn't overlap.
6197 // Truncate the whole entry to be just the left part.
6198 entry->set_to(left.to());
6199 // The right part is the one that overlaps. We add this part
6200 // to the map and let the next step deal with merging it with
6201 // the range we're adding.
6202 ZoneSplayTree<Config>::Locator loc;
6203 bool inserted = tree()->Insert(right.from(), &loc);
6204 DCHECK(inserted);
6205 USE(inserted);
6206 loc.set_value(Entry(right.from(),
6207 right.to(),
6208 entry->out_set()));
6209 }
6210 }
6211 while (current.is_valid()) {
6212 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6213 (loc.value().from() <= current.to()) &&
6214 (loc.value().to() >= current.from())) {
6215 Entry* entry = &loc.value();
6216 // We have overlap. If there is space between the start point of
6217 // the range we're adding and where the overlapping range starts
6218 // then we have to add a range covering just that space.
6219 if (current.from() < entry->from()) {
6220 ZoneSplayTree<Config>::Locator ins;
6221 bool inserted = tree()->Insert(current.from(), &ins);
6222 DCHECK(inserted);
6223 USE(inserted);
6224 ins.set_value(Entry(current.from(),
6225 entry->from() - 1,
6226 empty()->Extend(value, zone)));
6227 current.set_from(entry->from());
6228 }
6229 DCHECK_EQ(current.from(), entry->from());
6230 // If the overlapping range extends beyond the one we want to add
6231 // we have to snap the right part off and add it separately.
6232 if (entry->to() > current.to()) {
6233 ZoneSplayTree<Config>::Locator ins;
6234 bool inserted = tree()->Insert(current.to() + 1, &ins);
6235 DCHECK(inserted);
6236 USE(inserted);
6237 ins.set_value(Entry(current.to() + 1,
6238 entry->to(),
6239 entry->out_set()));
6240 entry->set_to(current.to());
6241 }
6242 DCHECK(entry->to() <= current.to());
6243 // The overlapping range is now completely contained by the range
6244 // we're adding so we can just update it and move the start point
6245 // of the range we're adding just past it.
6246 entry->AddValue(value, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006247 DCHECK(entry->to() + 1 > current.from());
6248 current.set_from(entry->to() + 1);
6249 } else {
6250 // There is no overlap so we can just add the range
6251 ZoneSplayTree<Config>::Locator ins;
6252 bool inserted = tree()->Insert(current.from(), &ins);
6253 DCHECK(inserted);
6254 USE(inserted);
6255 ins.set_value(Entry(current.from(),
6256 current.to(),
6257 empty()->Extend(value, zone)));
6258 break;
6259 }
6260 }
6261}
6262
6263
Ben Murdoch097c5b22016-05-18 11:27:45 +01006264OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006265 ZoneSplayTree<Config>::Locator loc;
6266 if (!tree()->FindGreatestLessThan(value, &loc))
6267 return empty();
6268 Entry* entry = &loc.value();
6269 if (value <= entry->to())
6270 return entry->out_set();
6271 else
6272 return empty();
6273}
6274
6275
6276// -------------------------------------------------------------------
6277// Analysis
6278
6279
6280void Analysis::EnsureAnalyzed(RegExpNode* that) {
6281 StackLimitCheck check(isolate());
6282 if (check.HasOverflowed()) {
6283 fail("Stack overflow");
6284 return;
6285 }
6286 if (that->info()->been_analyzed || that->info()->being_analyzed)
6287 return;
6288 that->info()->being_analyzed = true;
6289 that->Accept(this);
6290 that->info()->being_analyzed = false;
6291 that->info()->been_analyzed = true;
6292}
6293
6294
6295void Analysis::VisitEnd(EndNode* that) {
6296 // nothing to do
6297}
6298
6299
6300void TextNode::CalculateOffsets() {
6301 int element_count = elements()->length();
6302 // Set up the offsets of the elements relative to the start. This is a fixed
6303 // quantity since a TextNode can only contain fixed-width things.
6304 int cp_offset = 0;
6305 for (int i = 0; i < element_count; i++) {
6306 TextElement& elm = elements()->at(i);
6307 elm.set_cp_offset(cp_offset);
6308 cp_offset += elm.length();
6309 }
6310}
6311
6312
6313void Analysis::VisitText(TextNode* that) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006314 if (ignore_case()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006315 that->MakeCaseIndependent(isolate(), is_one_byte_);
6316 }
6317 EnsureAnalyzed(that->on_success());
6318 if (!has_failed()) {
6319 that->CalculateOffsets();
6320 }
6321}
6322
6323
6324void Analysis::VisitAction(ActionNode* that) {
6325 RegExpNode* target = that->on_success();
6326 EnsureAnalyzed(target);
6327 if (!has_failed()) {
6328 // If the next node is interested in what it follows then this node
6329 // has to be interested too so it can pass the information on.
6330 that->info()->AddFromFollowing(target->info());
6331 }
6332}
6333
6334
6335void Analysis::VisitChoice(ChoiceNode* that) {
6336 NodeInfo* info = that->info();
6337 for (int i = 0; i < that->alternatives()->length(); i++) {
6338 RegExpNode* node = that->alternatives()->at(i).node();
6339 EnsureAnalyzed(node);
6340 if (has_failed()) return;
6341 // Anything the following nodes need to know has to be known by
6342 // this node also, so it can pass it on.
6343 info->AddFromFollowing(node->info());
6344 }
6345}
6346
6347
6348void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6349 NodeInfo* info = that->info();
6350 for (int i = 0; i < that->alternatives()->length(); i++) {
6351 RegExpNode* node = that->alternatives()->at(i).node();
6352 if (node != that->loop_node()) {
6353 EnsureAnalyzed(node);
6354 if (has_failed()) return;
6355 info->AddFromFollowing(node->info());
6356 }
6357 }
6358 // Check the loop last since it may need the value of this node
6359 // to get a correct result.
6360 EnsureAnalyzed(that->loop_node());
6361 if (!has_failed()) {
6362 info->AddFromFollowing(that->loop_node()->info());
6363 }
6364}
6365
6366
6367void Analysis::VisitBackReference(BackReferenceNode* that) {
6368 EnsureAnalyzed(that->on_success());
6369}
6370
6371
6372void Analysis::VisitAssertion(AssertionNode* that) {
6373 EnsureAnalyzed(that->on_success());
6374}
6375
6376
6377void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6378 BoyerMooreLookahead* bm,
6379 bool not_at_start) {
6380 // Working out the set of characters that a backreference can match is too
6381 // hard, so we just say that any character can match.
6382 bm->SetRest(offset);
6383 SaveBMInfo(bm, not_at_start, offset);
6384}
6385
6386
6387STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6388 RegExpMacroAssembler::kTableSize);
6389
6390
6391void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6392 BoyerMooreLookahead* bm, bool not_at_start) {
6393 ZoneList<GuardedAlternative>* alts = alternatives();
6394 budget = (budget - 1) / alts->length();
6395 for (int i = 0; i < alts->length(); i++) {
6396 GuardedAlternative& alt = alts->at(i);
6397 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6398 bm->SetRest(offset); // Give up trying to fill in info.
6399 SaveBMInfo(bm, not_at_start, offset);
6400 return;
6401 }
6402 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6403 }
6404 SaveBMInfo(bm, not_at_start, offset);
6405}
6406
6407
6408void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6409 BoyerMooreLookahead* bm, bool not_at_start) {
6410 if (initial_offset >= bm->length()) return;
6411 int offset = initial_offset;
6412 int max_char = bm->max_char();
6413 for (int i = 0; i < elements()->length(); i++) {
6414 if (offset >= bm->length()) {
6415 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6416 return;
6417 }
6418 TextElement text = elements()->at(i);
6419 if (text.text_type() == TextElement::ATOM) {
6420 RegExpAtom* atom = text.atom();
6421 for (int j = 0; j < atom->length(); j++, offset++) {
6422 if (offset >= bm->length()) {
6423 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6424 return;
6425 }
6426 uc16 character = atom->data()[j];
6427 if (bm->compiler()->ignore_case()) {
6428 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6429 int length = GetCaseIndependentLetters(
6430 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6431 chars);
6432 for (int j = 0; j < length; j++) {
6433 bm->Set(offset, chars[j]);
6434 }
6435 } else {
6436 if (character <= max_char) bm->Set(offset, character);
6437 }
6438 }
6439 } else {
6440 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6441 RegExpCharacterClass* char_class = text.char_class();
6442 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6443 if (char_class->is_negated()) {
6444 bm->SetAll(offset);
6445 } else {
6446 for (int k = 0; k < ranges->length(); k++) {
6447 CharacterRange& range = ranges->at(k);
6448 if (range.from() > max_char) continue;
6449 int to = Min(max_char, static_cast<int>(range.to()));
6450 bm->SetInterval(offset, Interval(range.from(), to));
6451 }
6452 }
6453 offset++;
6454 }
6455 }
6456 if (offset >= bm->length()) {
6457 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6458 return;
6459 }
6460 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6461 true); // Not at start after a text node.
6462 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6463}
6464
6465
6466// -------------------------------------------------------------------
6467// Dispatch table construction
6468
6469
6470void DispatchTableConstructor::VisitEnd(EndNode* that) {
6471 AddRange(CharacterRange::Everything());
6472}
6473
6474
6475void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6476 node->set_being_calculated(true);
6477 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6478 for (int i = 0; i < alternatives->length(); i++) {
6479 set_choice_index(i);
6480 alternatives->at(i).node()->Accept(this);
6481 }
6482 node->set_being_calculated(false);
6483}
6484
6485
6486class AddDispatchRange {
6487 public:
6488 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6489 : constructor_(constructor) { }
6490 void Call(uc32 from, DispatchTable::Entry entry);
6491 private:
6492 DispatchTableConstructor* constructor_;
6493};
6494
6495
6496void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006497 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006498}
6499
6500
6501void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6502 if (node->being_calculated())
6503 return;
6504 DispatchTable* table = node->GetTable(ignore_case_);
6505 AddDispatchRange adder(this);
6506 table->ForEach(&adder);
6507}
6508
6509
6510void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6511 // TODO(160): Find the node that we refer back to and propagate its start
6512 // set back to here. For now we just accept anything.
6513 AddRange(CharacterRange::Everything());
6514}
6515
6516
6517void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6518 RegExpNode* target = that->on_success();
6519 target->Accept(this);
6520}
6521
6522
6523static int CompareRangeByFrom(const CharacterRange* a,
6524 const CharacterRange* b) {
6525 return Compare<uc16>(a->from(), b->from());
6526}
6527
6528
6529void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6530 ranges->Sort(CompareRangeByFrom);
6531 uc16 last = 0;
6532 for (int i = 0; i < ranges->length(); i++) {
6533 CharacterRange range = ranges->at(i);
6534 if (last < range.from())
Ben Murdoch097c5b22016-05-18 11:27:45 +01006535 AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006536 if (range.to() >= last) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006537 if (range.to() == String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006538 return;
6539 } else {
6540 last = range.to() + 1;
6541 }
6542 }
6543 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006544 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006545}
6546
6547
6548void DispatchTableConstructor::VisitText(TextNode* that) {
6549 TextElement elm = that->elements()->at(0);
6550 switch (elm.text_type()) {
6551 case TextElement::ATOM: {
6552 uc16 c = elm.atom()->data()[0];
Ben Murdoch097c5b22016-05-18 11:27:45 +01006553 AddRange(CharacterRange::Range(c, c));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006554 break;
6555 }
6556 case TextElement::CHAR_CLASS: {
6557 RegExpCharacterClass* tree = elm.char_class();
6558 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6559 if (tree->is_negated()) {
6560 AddInverse(ranges);
6561 } else {
6562 for (int i = 0; i < ranges->length(); i++)
6563 AddRange(ranges->at(i));
6564 }
6565 break;
6566 }
6567 default: {
6568 UNIMPLEMENTED();
6569 }
6570 }
6571}
6572
6573
6574void DispatchTableConstructor::VisitAction(ActionNode* that) {
6575 RegExpNode* target = that->on_success();
6576 target->Accept(this);
6577}
6578
6579
Ben Murdoch097c5b22016-05-18 11:27:45 +01006580RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6581 RegExpNode* on_success) {
6582 // If the regexp matching starts within a surrogate pair, step back
6583 // to the lead surrogate and start matching from there.
6584 DCHECK(!compiler->read_backward());
6585 Zone* zone = compiler->zone();
6586 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6587 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6588 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6589 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6590
6591 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6592
6593 int stack_register = compiler->UnicodeLookaroundStackRegister();
6594 int position_register = compiler->UnicodeLookaroundPositionRegister();
6595 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6596 zone, lead_surrogates, true, on_success);
6597 RegExpLookaround::Builder builder(true, step_back, stack_register,
6598 position_register);
6599 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6600 zone, trail_surrogates, false, builder.on_match_success());
6601
6602 optional_step_back->AddAlternative(
6603 GuardedAlternative(builder.ForMatch(match_trail)));
6604 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6605
6606 return optional_step_back;
6607}
6608
6609
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006610RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch097c5b22016-05-18 11:27:45 +01006611 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6612 JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006613 Handle<String> sample_subject, bool is_one_byte) {
6614 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6615 return IrregexpRegExpTooBig(isolate);
6616 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006617 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6618 bool is_sticky = flags & JSRegExp::kSticky;
6619 bool is_global = flags & JSRegExp::kGlobal;
6620 bool is_unicode = flags & JSRegExp::kUnicode;
6621 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006622 is_one_byte);
6623
6624 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6625
6626 // Sample some characters from the middle of the string.
6627 static const int kSampleSize = 128;
6628
6629 sample_subject = String::Flatten(sample_subject);
6630 int chars_sampled = 0;
6631 int half_way = (sample_subject->length() - kSampleSize) / 2;
6632 for (int i = Max(0, half_way);
6633 i < sample_subject->length() && chars_sampled < kSampleSize;
6634 i++, chars_sampled++) {
6635 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6636 }
6637
6638 // Wrap the body of the regexp in capture #0.
6639 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6640 0,
6641 &compiler,
6642 compiler.accept());
6643 RegExpNode* node = captured_body;
6644 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6645 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6646 int max_length = data->tree->max_match();
6647 if (!is_start_anchored && !is_sticky) {
6648 // Add a .*? at the beginning, outside the body capture, unless
6649 // this expression is anchored at the beginning or sticky.
6650 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6651 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6652 &compiler, captured_body, data->contains_anchor);
6653
6654 if (data->contains_anchor) {
6655 // Unroll loop once, to take care of the case that might start
6656 // at the start of input.
6657 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6658 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6659 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6660 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6661 node = first_step_node;
6662 } else {
6663 node = loop_node;
6664 }
6665 }
6666 if (is_one_byte) {
6667 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6668 // Do it again to propagate the new nodes to places where they were not
6669 // put because they had not been calculated yet.
6670 if (node != NULL) {
6671 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6672 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006673 } else if (compiler.unicode() && (is_global || is_sticky)) {
6674 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006675 }
6676
6677 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6678 data->node = node;
Ben Murdoch097c5b22016-05-18 11:27:45 +01006679 Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006680 analysis.EnsureAnalyzed(node);
6681 if (analysis.has_failed()) {
6682 const char* error_message = analysis.error_message();
6683 return CompilationResult(isolate, error_message);
6684 }
6685
6686 // Create the correct assembler for the architecture.
6687#ifndef V8_INTERPRETED_REGEXP
6688 // Native regexp implementation.
6689
6690 NativeRegExpMacroAssembler::Mode mode =
6691 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6692 : NativeRegExpMacroAssembler::UC16;
6693
6694#if V8_TARGET_ARCH_IA32
6695 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6696 (data->capture_count + 1) * 2);
6697#elif V8_TARGET_ARCH_X64
6698 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6699 (data->capture_count + 1) * 2);
6700#elif V8_TARGET_ARCH_ARM
6701 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6702 (data->capture_count + 1) * 2);
6703#elif V8_TARGET_ARCH_ARM64
6704 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6705 (data->capture_count + 1) * 2);
6706#elif V8_TARGET_ARCH_PPC
6707 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6708 (data->capture_count + 1) * 2);
6709#elif V8_TARGET_ARCH_MIPS
6710 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6711 (data->capture_count + 1) * 2);
6712#elif V8_TARGET_ARCH_MIPS64
6713 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6714 (data->capture_count + 1) * 2);
6715#elif V8_TARGET_ARCH_X87
6716 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6717 (data->capture_count + 1) * 2);
6718#else
6719#error "Unsupported architecture"
6720#endif
6721
6722#else // V8_INTERPRETED_REGEXP
6723 // Interpreted regexp implementation.
6724 EmbeddedVector<byte, 1024> codes;
6725 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6726#endif // V8_INTERPRETED_REGEXP
6727
6728 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6729
6730 // Inserted here, instead of in Assembler, because it depends on information
6731 // in the AST that isn't replicated in the Node structure.
6732 static const int kMaxBacksearchLimit = 1024;
6733 if (is_end_anchored &&
6734 !is_start_anchored &&
6735 max_length < kMaxBacksearchLimit) {
6736 macro_assembler.SetCurrentPositionFromEnd(max_length);
6737 }
6738
6739 if (is_global) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006740 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6741 if (data->tree->min_match() > 0) {
6742 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6743 } else if (is_unicode) {
6744 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6745 }
6746 macro_assembler.set_global_mode(mode);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006747 }
6748
6749 return compiler.Assemble(&macro_assembler,
6750 node,
6751 data->capture_count,
6752 pattern);
6753}
6754
6755
6756bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6757 Heap* heap = pattern->GetHeap();
6758 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6759 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6760 heap->isolate()->memory_allocator()->SizeExecutable() >
6761 RegExpImpl::kRegExpExecutableMemoryLimit) {
6762 too_much = true;
6763 }
6764 return too_much;
6765}
6766
6767
6768Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6769 Object* key_pattern,
6770 FixedArray** last_match_cache,
6771 ResultsCacheType type) {
6772 FixedArray* cache;
6773 if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6774 if (type == STRING_SPLIT_SUBSTRINGS) {
6775 DCHECK(key_pattern->IsString());
6776 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6777 cache = heap->string_split_cache();
6778 } else {
6779 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6780 DCHECK(key_pattern->IsFixedArray());
6781 cache = heap->regexp_multiple_cache();
6782 }
6783
6784 uint32_t hash = key_string->Hash();
6785 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6786 ~(kArrayEntriesPerCacheEntry - 1));
6787 if (cache->get(index + kStringOffset) != key_string ||
6788 cache->get(index + kPatternOffset) != key_pattern) {
6789 index =
6790 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6791 if (cache->get(index + kStringOffset) != key_string ||
6792 cache->get(index + kPatternOffset) != key_pattern) {
6793 return Smi::FromInt(0);
6794 }
6795 }
6796
6797 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6798 return cache->get(index + kArrayOffset);
6799}
6800
6801
6802void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6803 Handle<Object> key_pattern,
6804 Handle<FixedArray> value_array,
6805 Handle<FixedArray> last_match_cache,
6806 ResultsCacheType type) {
6807 Factory* factory = isolate->factory();
6808 Handle<FixedArray> cache;
6809 if (!key_string->IsInternalizedString()) return;
6810 if (type == STRING_SPLIT_SUBSTRINGS) {
6811 DCHECK(key_pattern->IsString());
6812 if (!key_pattern->IsInternalizedString()) return;
6813 cache = factory->string_split_cache();
6814 } else {
6815 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6816 DCHECK(key_pattern->IsFixedArray());
6817 cache = factory->regexp_multiple_cache();
6818 }
6819
6820 uint32_t hash = key_string->Hash();
6821 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6822 ~(kArrayEntriesPerCacheEntry - 1));
6823 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6824 cache->set(index + kStringOffset, *key_string);
6825 cache->set(index + kPatternOffset, *key_pattern);
6826 cache->set(index + kArrayOffset, *value_array);
6827 cache->set(index + kLastMatchOffset, *last_match_cache);
6828 } else {
6829 uint32_t index2 =
6830 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6831 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6832 cache->set(index2 + kStringOffset, *key_string);
6833 cache->set(index2 + kPatternOffset, *key_pattern);
6834 cache->set(index2 + kArrayOffset, *value_array);
6835 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6836 } else {
6837 cache->set(index2 + kStringOffset, Smi::FromInt(0));
6838 cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6839 cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6840 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6841 cache->set(index + kStringOffset, *key_string);
6842 cache->set(index + kPatternOffset, *key_pattern);
6843 cache->set(index + kArrayOffset, *value_array);
6844 cache->set(index + kLastMatchOffset, *last_match_cache);
6845 }
6846 }
6847 // If the array is a reasonably short list of substrings, convert it into a
6848 // list of internalized strings.
6849 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6850 for (int i = 0; i < value_array->length(); i++) {
6851 Handle<String> str(String::cast(value_array->get(i)), isolate);
6852 Handle<String> internalized_str = factory->InternalizeString(str);
6853 value_array->set(i, *internalized_str);
6854 }
6855 }
6856 // Convert backing store to a copy-on-write array.
6857 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6858}
6859
6860
6861void RegExpResultsCache::Clear(FixedArray* cache) {
6862 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6863 cache->set(i, Smi::FromInt(0));
6864 }
6865}
6866
6867} // namespace internal
6868} // namespace v8