blob: ddb4a16cafb7f781ffb4ed85f2586a7843f10def [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/jsregexp.h"
6
7#include "src/ast/ast.h"
8#include "src/base/platform/platform.h"
9#include "src/compilation-cache.h"
10#include "src/compiler.h"
11#include "src/execution.h"
12#include "src/factory.h"
13#include "src/isolate-inl.h"
14#include "src/messages.h"
15#include "src/ostreams.h"
16#include "src/regexp/interpreter-irregexp.h"
17#include "src/regexp/jsregexp-inl.h"
18#include "src/regexp/regexp-macro-assembler.h"
19#include "src/regexp/regexp-macro-assembler-irregexp.h"
20#include "src/regexp/regexp-macro-assembler-tracer.h"
21#include "src/regexp/regexp-parser.h"
22#include "src/regexp/regexp-stack.h"
23#include "src/runtime/runtime.h"
24#include "src/splay-tree-inl.h"
25#include "src/string-search.h"
26#include "src/unicode-decoder.h"
27
Ben Murdoch097c5b22016-05-18 11:27:45 +010028#ifdef V8_I18N_SUPPORT
29#include "unicode/uset.h"
30#include "unicode/utypes.h"
31#endif // V8_I18N_SUPPORT
32
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033#ifndef V8_INTERPRETED_REGEXP
34#if V8_TARGET_ARCH_IA32
35#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
36#elif V8_TARGET_ARCH_X64
37#include "src/regexp/x64/regexp-macro-assembler-x64.h"
38#elif V8_TARGET_ARCH_ARM64
39#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
40#elif V8_TARGET_ARCH_ARM
41#include "src/regexp/arm/regexp-macro-assembler-arm.h"
42#elif V8_TARGET_ARCH_PPC
43#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdochda12d292016-06-02 14:46:10 +010044#elif V8_TARGET_ARCH_S390
45#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000046#elif V8_TARGET_ARCH_MIPS
47#include "src/regexp/mips/regexp-macro-assembler-mips.h"
48#elif V8_TARGET_ARCH_MIPS64
49#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
50#elif V8_TARGET_ARCH_X87
51#include "src/regexp/x87/regexp-macro-assembler-x87.h"
52#else
53#error Unsupported target architecture.
54#endif
55#endif
56
57
58namespace v8 {
59namespace internal {
60
61MUST_USE_RESULT
62static inline MaybeHandle<Object> ThrowRegExpException(
63 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
64 Isolate* isolate = re->GetIsolate();
65 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
66 pattern, error_text),
67 Object);
68}
69
70
71inline void ThrowRegExpException(Handle<JSRegExp> re,
72 Handle<String> error_text) {
73 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
74}
75
76
77ContainedInLattice AddRange(ContainedInLattice containment,
78 const int* ranges,
79 int ranges_length,
80 Interval new_range) {
81 DCHECK((ranges_length & 1) == 1);
Ben Murdoch097c5b22016-05-18 11:27:45 +010082 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000083 if (containment == kLatticeUnknown) return containment;
84 bool inside = false;
85 int last = 0;
86 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
87 // Consider the range from last to ranges[i].
88 // We haven't got to the new range yet.
89 if (ranges[i] <= new_range.from()) continue;
90 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
91 // inclusive, but the values in ranges are not.
92 if (last <= new_range.from() && new_range.to() < ranges[i]) {
93 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
94 }
95 return kLatticeUnknown;
96 }
97 return containment;
98}
99
100
101// More makes code generation slower, less makes V8 benchmark score lower.
102const int kMaxLookaheadForBoyerMoore = 8;
103// In a 3-character pattern you can maximally step forwards 3 characters
104// at a time, which is not always enough to pay for the extra logic.
105const int kPatternTooShortForBoyerMoore = 2;
106
107
108// Identifies the sort of regexps where the regexp engine is faster
109// than the code used for atom matches.
110static bool HasFewDifferentCharacters(Handle<String> pattern) {
111 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112 if (length <= kPatternTooShortForBoyerMoore) return false;
113 const int kMod = 128;
114 bool character_found[kMod];
115 int different = 0;
116 memset(&character_found[0], 0, sizeof(character_found));
117 for (int i = 0; i < length; i++) {
118 int ch = (pattern->Get(i) & (kMod - 1));
119 if (!character_found[ch]) {
120 character_found[ch] = true;
121 different++;
122 // We declare a regexp low-alphabet if it has at least 3 times as many
123 // characters as it has different characters.
124 if (different * 3 > length) return false;
125 }
126 }
127 return true;
128}
129
130
131// Generic RegExp methods. Dispatches to implementation specific methods.
132
133
134MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
135 Handle<String> pattern,
136 JSRegExp::Flags flags) {
137 Isolate* isolate = re->GetIsolate();
Ben Murdochda12d292016-06-02 14:46:10 +0100138 Zone zone(isolate->allocator());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000139 CompilationCache* compilation_cache = isolate->compilation_cache();
140 MaybeHandle<FixedArray> maybe_cached =
141 compilation_cache->LookupRegExp(pattern, flags);
142 Handle<FixedArray> cached;
143 bool in_cache = maybe_cached.ToHandle(&cached);
144 LOG(isolate, RegExpCompileEvent(re, in_cache));
145
146 Handle<Object> result;
147 if (in_cache) {
148 re->set_data(*cached);
149 return re;
150 }
151 pattern = String::Flatten(pattern);
152 PostponeInterruptsScope postpone(isolate);
153 RegExpCompileData parse_result;
154 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100155 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
156 &parse_result)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000157 // Throw an exception if we fail to parse the pattern.
158 return ThrowRegExpException(re, pattern, parse_result.error);
159 }
160
161 bool has_been_compiled = false;
162
163 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
164 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
165 // Parse-tree is a single atom that is equal to the pattern.
166 AtomCompile(re, pattern, flags, pattern);
167 has_been_compiled = true;
168 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
169 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
170 RegExpAtom* atom = parse_result.tree->AsAtom();
171 Vector<const uc16> atom_pattern = atom->data();
172 Handle<String> atom_string;
173 ASSIGN_RETURN_ON_EXCEPTION(
174 isolate, atom_string,
175 isolate->factory()->NewStringFromTwoByte(atom_pattern),
176 Object);
177 if (!HasFewDifferentCharacters(atom_string)) {
178 AtomCompile(re, pattern, flags, atom_string);
179 has_been_compiled = true;
180 }
181 }
182 if (!has_been_compiled) {
183 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
184 }
185 DCHECK(re->data()->IsFixedArray());
186 // Compilation succeeded so the data is set on the regexp
187 // and we can store it in the cache.
188 Handle<FixedArray> data(FixedArray::cast(re->data()));
189 compilation_cache->PutRegExp(pattern, flags, data);
190
191 return re;
192}
193
194
195MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
196 Handle<String> subject,
197 int index,
198 Handle<JSArray> last_match_info) {
199 switch (regexp->TypeTag()) {
200 case JSRegExp::ATOM:
201 return AtomExec(regexp, subject, index, last_match_info);
202 case JSRegExp::IRREGEXP: {
203 return IrregexpExec(regexp, subject, index, last_match_info);
204 }
205 default:
206 UNREACHABLE();
207 return MaybeHandle<Object>();
208 }
209}
210
211
212// RegExp Atom implementation: Simple string search using indexOf.
213
214
215void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
216 Handle<String> pattern,
217 JSRegExp::Flags flags,
218 Handle<String> match_pattern) {
219 re->GetIsolate()->factory()->SetRegExpAtomData(re,
220 JSRegExp::ATOM,
221 pattern,
222 flags,
223 match_pattern);
224}
225
226
227static void SetAtomLastCapture(FixedArray* array,
228 String* subject,
229 int from,
230 int to) {
231 SealHandleScope shs(array->GetIsolate());
232 RegExpImpl::SetLastCaptureCount(array, 2);
233 RegExpImpl::SetLastSubject(array, subject);
234 RegExpImpl::SetLastInput(array, subject);
235 RegExpImpl::SetCapture(array, 0, from);
236 RegExpImpl::SetCapture(array, 1, to);
237}
238
239
240int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
241 Handle<String> subject,
242 int index,
243 int32_t* output,
244 int output_size) {
245 Isolate* isolate = regexp->GetIsolate();
246
247 DCHECK(0 <= index);
248 DCHECK(index <= subject->length());
249
250 subject = String::Flatten(subject);
251 DisallowHeapAllocation no_gc; // ensure vectors stay valid
252
253 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
254 int needle_len = needle->length();
255 DCHECK(needle->IsFlat());
256 DCHECK_LT(0, needle_len);
257
258 if (index + needle_len > subject->length()) {
259 return RegExpImpl::RE_FAILURE;
260 }
261
262 for (int i = 0; i < output_size; i += 2) {
263 String::FlatContent needle_content = needle->GetFlatContent();
264 String::FlatContent subject_content = subject->GetFlatContent();
265 DCHECK(needle_content.IsFlat());
266 DCHECK(subject_content.IsFlat());
267 // dispatch on type of strings
268 index =
269 (needle_content.IsOneByte()
270 ? (subject_content.IsOneByte()
271 ? SearchString(isolate, subject_content.ToOneByteVector(),
272 needle_content.ToOneByteVector(), index)
273 : SearchString(isolate, subject_content.ToUC16Vector(),
274 needle_content.ToOneByteVector(), index))
275 : (subject_content.IsOneByte()
276 ? SearchString(isolate, subject_content.ToOneByteVector(),
277 needle_content.ToUC16Vector(), index)
278 : SearchString(isolate, subject_content.ToUC16Vector(),
279 needle_content.ToUC16Vector(), index)));
280 if (index == -1) {
281 return i / 2; // Return number of matches.
282 } else {
283 output[i] = index;
284 output[i+1] = index + needle_len;
285 index += needle_len;
286 }
287 }
288 return output_size / 2;
289}
290
291
292Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
293 Handle<String> subject,
294 int index,
295 Handle<JSArray> last_match_info) {
296 Isolate* isolate = re->GetIsolate();
297
298 static const int kNumRegisters = 2;
299 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
300 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
301
302 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
303
304 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
305
306 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
307 SealHandleScope shs(isolate);
308 FixedArray* array = FixedArray::cast(last_match_info->elements());
309 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
310 return last_match_info;
311}
312
313
314// Irregexp implementation.
315
316// Ensures that the regexp object contains a compiled version of the
317// source for either one-byte or two-byte subject strings.
318// If the compiled version doesn't already exist, it is compiled
319// from the source pattern.
320// If compilation fails, an exception is thrown and this function
321// returns false.
322bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
323 Handle<String> sample_subject,
324 bool is_one_byte) {
325 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
326#ifdef V8_INTERPRETED_REGEXP
327 if (compiled_code->IsByteArray()) return true;
328#else // V8_INTERPRETED_REGEXP (RegExp native code)
329 if (compiled_code->IsCode()) return true;
330#endif
331 // We could potentially have marked this as flushable, but have kept
332 // a saved version if we did not flush it yet.
333 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
334 if (saved_code->IsCode()) {
335 // Reinstate the code in the original place.
336 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
337 DCHECK(compiled_code->IsSmi());
338 return true;
339 }
340 return CompileIrregexp(re, sample_subject, is_one_byte);
341}
342
343
344bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
345 Handle<String> sample_subject,
346 bool is_one_byte) {
347 // Compile the RegExp.
348 Isolate* isolate = re->GetIsolate();
Ben Murdochda12d292016-06-02 14:46:10 +0100349 Zone zone(isolate->allocator());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000350 PostponeInterruptsScope postpone(isolate);
351 // If we had a compilation error the last time this is saved at the
352 // saved code index.
353 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
354 // When arriving here entry can only be a smi, either representing an
355 // uncompiled regexp, a previous compilation error, or code that has
356 // been flushed.
357 DCHECK(entry->IsSmi());
358 int entry_value = Smi::cast(entry)->value();
359 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
360 entry_value == JSRegExp::kCompilationErrorValue ||
361 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
362
363 if (entry_value == JSRegExp::kCompilationErrorValue) {
364 // A previous compilation failed and threw an error which we store in
365 // the saved code index (we store the error message, not the actual
366 // error). Recreate the error object and throw it.
367 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
368 DCHECK(error_string->IsString());
369 Handle<String> error_message(String::cast(error_string));
370 ThrowRegExpException(re, error_message);
371 return false;
372 }
373
374 JSRegExp::Flags flags = re->GetFlags();
375
376 Handle<String> pattern(re->Pattern());
377 pattern = String::Flatten(pattern);
378 RegExpCompileData compile_data;
379 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100380 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
381 &compile_data)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000382 // Throw an exception if we fail to parse the pattern.
383 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
384 USE(ThrowRegExpException(re, pattern, compile_data.error));
385 return false;
386 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100387 RegExpEngine::CompilationResult result =
388 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
389 sample_subject, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000390 if (result.error_message != NULL) {
391 // Unable to compile regexp.
392 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
393 CStrVector(result.error_message)).ToHandleChecked();
394 ThrowRegExpException(re, error_message);
395 return false;
396 }
397
398 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
399 data->set(JSRegExp::code_index(is_one_byte), result.code);
400 int register_max = IrregexpMaxRegisterCount(*data);
401 if (result.num_registers > register_max) {
402 SetIrregexpMaxRegisterCount(*data, result.num_registers);
403 }
404
405 return true;
406}
407
408
409int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
410 return Smi::cast(
411 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
412}
413
414
415void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
416 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
417}
418
419
420int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
421 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
422}
423
424
425int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
426 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
427}
428
429
430ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
431 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
432}
433
434
435Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
436 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
437}
438
439
440void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
441 Handle<String> pattern,
442 JSRegExp::Flags flags,
443 int capture_count) {
444 // Initialize compiled code entries to null.
445 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
446 JSRegExp::IRREGEXP,
447 pattern,
448 flags,
449 capture_count);
450}
451
452
453int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
454 Handle<String> subject) {
455 subject = String::Flatten(subject);
456
457 // Check representation of the underlying storage.
458 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
459 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
460
461#ifdef V8_INTERPRETED_REGEXP
462 // Byte-code regexp needs space allocated for all its registers.
463 // The result captures are copied to the start of the registers array
464 // if the match succeeds. This way those registers are not clobbered
465 // when we set the last match info from last successful match.
466 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
467 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
468#else // V8_INTERPRETED_REGEXP
469 // Native regexp only needs room to output captures. Registers are handled
470 // internally.
471 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
472#endif // V8_INTERPRETED_REGEXP
473}
474
475
476int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
477 Handle<String> subject,
478 int index,
479 int32_t* output,
480 int output_size) {
481 Isolate* isolate = regexp->GetIsolate();
482
483 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
484
485 DCHECK(index >= 0);
486 DCHECK(index <= subject->length());
487 DCHECK(subject->IsFlat());
488
489 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
490
491#ifndef V8_INTERPRETED_REGEXP
492 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
493 do {
494 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
495 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
496 // The stack is used to allocate registers for the compiled regexp code.
497 // This means that in case of failure, the output registers array is left
498 // untouched and contains the capture results from the previous successful
499 // match. We can use that to set the last match info lazily.
500 NativeRegExpMacroAssembler::Result res =
501 NativeRegExpMacroAssembler::Match(code,
502 subject,
503 output,
504 output_size,
505 index,
506 isolate);
507 if (res != NativeRegExpMacroAssembler::RETRY) {
508 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
509 isolate->has_pending_exception());
510 STATIC_ASSERT(
511 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
512 STATIC_ASSERT(
513 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
514 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
515 == RE_EXCEPTION);
516 return static_cast<IrregexpResult>(res);
517 }
518 // If result is RETRY, the string has changed representation, and we
519 // must restart from scratch.
520 // In this case, it means we must make sure we are prepared to handle
521 // the, potentially, different subject (the string can switch between
522 // being internal and external, and even between being Latin1 and UC16,
523 // but the characters are always the same).
524 IrregexpPrepare(regexp, subject);
525 is_one_byte = subject->IsOneByteRepresentationUnderneath();
526 } while (true);
527 UNREACHABLE();
528 return RE_EXCEPTION;
529#else // V8_INTERPRETED_REGEXP
530
531 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
532 // We must have done EnsureCompiledIrregexp, so we can get the number of
533 // registers.
534 int number_of_capture_registers =
535 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
536 int32_t* raw_output = &output[number_of_capture_registers];
537 // We do not touch the actual capture result registers until we know there
538 // has been a match so that we can use those capture results to set the
539 // last match info.
540 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
541 raw_output[i] = -1;
542 }
543 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
544 isolate);
545
546 IrregexpResult result = IrregexpInterpreter::Match(isolate,
547 byte_codes,
548 subject,
549 raw_output,
550 index);
551 if (result == RE_SUCCESS) {
552 // Copy capture results to the start of the registers array.
553 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
554 }
555 if (result == RE_EXCEPTION) {
556 DCHECK(!isolate->has_pending_exception());
557 isolate->StackOverflow();
558 }
559 return result;
560#endif // V8_INTERPRETED_REGEXP
561}
562
563
564MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
565 Handle<String> subject,
566 int previous_index,
567 Handle<JSArray> last_match_info) {
568 Isolate* isolate = regexp->GetIsolate();
569 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
570
571 // Prepare space for the return values.
572#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
573 if (FLAG_trace_regexp_bytecodes) {
574 String* pattern = regexp->Pattern();
575 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
576 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
577 }
578#endif
579 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
580 if (required_registers < 0) {
581 // Compiling failed with an exception.
582 DCHECK(isolate->has_pending_exception());
583 return MaybeHandle<Object>();
584 }
585
586 int32_t* output_registers = NULL;
587 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
588 output_registers = NewArray<int32_t>(required_registers);
589 }
590 base::SmartArrayPointer<int32_t> auto_release(output_registers);
591 if (output_registers == NULL) {
592 output_registers = isolate->jsregexp_static_offsets_vector();
593 }
594
595 int res = RegExpImpl::IrregexpExecRaw(
596 regexp, subject, previous_index, output_registers, required_registers);
597 if (res == RE_SUCCESS) {
598 int capture_count =
599 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
600 return SetLastMatchInfo(
601 last_match_info, subject, capture_count, output_registers);
602 }
603 if (res == RE_EXCEPTION) {
604 DCHECK(isolate->has_pending_exception());
605 return MaybeHandle<Object>();
606 }
607 DCHECK(res == RE_FAILURE);
608 return isolate->factory()->null_value();
609}
610
611
612static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
613 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
614 JSArray::SetLength(array, minimum_size);
615 }
616}
617
618
619Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
620 Handle<String> subject,
621 int capture_count,
622 int32_t* match) {
623 DCHECK(last_match_info->HasFastObjectElements());
624 int capture_register_count = (capture_count + 1) * 2;
625 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
626 DisallowHeapAllocation no_allocation;
627 FixedArray* array = FixedArray::cast(last_match_info->elements());
628 if (match != NULL) {
629 for (int i = 0; i < capture_register_count; i += 2) {
630 SetCapture(array, i, match[i]);
631 SetCapture(array, i + 1, match[i + 1]);
632 }
633 }
634 SetLastCaptureCount(array, capture_register_count);
635 SetLastSubject(array, *subject);
636 SetLastInput(array, *subject);
637 return last_match_info;
638}
639
640
641RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
642 Handle<String> subject,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000643 Isolate* isolate)
644 : register_array_(NULL),
645 register_array_size_(0),
646 regexp_(regexp),
647 subject_(subject) {
648#ifdef V8_INTERPRETED_REGEXP
649 bool interpreted = true;
650#else
651 bool interpreted = false;
652#endif // V8_INTERPRETED_REGEXP
653
654 if (regexp_->TypeTag() == JSRegExp::ATOM) {
655 static const int kAtomRegistersPerMatch = 2;
656 registers_per_match_ = kAtomRegistersPerMatch;
657 // There is no distinction between interpreted and native for atom regexps.
658 interpreted = false;
659 } else {
660 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
661 if (registers_per_match_ < 0) {
662 num_matches_ = -1; // Signal exception.
663 return;
664 }
665 }
666
Ben Murdoch097c5b22016-05-18 11:27:45 +0100667 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
668 if (!interpreted) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000669 register_array_size_ =
670 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
671 max_matches_ = register_array_size_ / registers_per_match_;
672 } else {
673 // Global loop in interpreted regexp is not implemented. We choose
674 // the size of the offsets vector so that it can only store one match.
675 register_array_size_ = registers_per_match_;
676 max_matches_ = 1;
677 }
678
679 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
680 register_array_ = NewArray<int32_t>(register_array_size_);
681 } else {
682 register_array_ = isolate->jsregexp_static_offsets_vector();
683 }
684
685 // Set state so that fetching the results the first time triggers a call
686 // to the compiled regexp.
687 current_match_index_ = max_matches_ - 1;
688 num_matches_ = max_matches_;
689 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
690 DCHECK_GE(register_array_size_, registers_per_match_);
691 int32_t* last_match =
692 &register_array_[current_match_index_ * registers_per_match_];
693 last_match[0] = -1;
694 last_match[1] = 0;
695}
696
Ben Murdoch097c5b22016-05-18 11:27:45 +0100697int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
698 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
699 last_index + 1 < subject_->length() &&
700 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
701 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
702 // Advance over the surrogate pair.
703 return last_index + 2;
704 }
705 return last_index + 1;
706}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000707
708// -------------------------------------------------------------------
709// Implementation of the Irregexp regular expression engine.
710//
711// The Irregexp regular expression engine is intended to be a complete
712// implementation of ECMAScript regular expressions. It generates either
713// bytecodes or native code.
714
715// The Irregexp regexp engine is structured in three steps.
716// 1) The parser generates an abstract syntax tree. See ast.cc.
717// 2) From the AST a node network is created. The nodes are all
718// subclasses of RegExpNode. The nodes represent states when
719// executing a regular expression. Several optimizations are
720// performed on the node network.
721// 3) From the nodes we generate either byte codes or native code
722// that can actually execute the regular expression (perform
723// the search). The code generation step is described in more
724// detail below.
725
726// Code generation.
727//
728// The nodes are divided into four main categories.
729// * Choice nodes
730// These represent places where the regular expression can
731// match in more than one way. For example on entry to an
732// alternation (foo|bar) or a repetition (*, +, ? or {}).
733// * Action nodes
734// These represent places where some action should be
735// performed. Examples include recording the current position
736// in the input string to a register (in order to implement
737// captures) or other actions on register for example in order
738// to implement the counters needed for {} repetitions.
739// * Matching nodes
740// These attempt to match some element part of the input string.
741// Examples of elements include character classes, plain strings
742// or back references.
743// * End nodes
744// These are used to implement the actions required on finding
745// a successful match or failing to find a match.
746//
747// The code generated (whether as byte codes or native code) maintains
748// some state as it runs. This consists of the following elements:
749//
750// * The capture registers. Used for string captures.
751// * Other registers. Used for counters etc.
752// * The current position.
753// * The stack of backtracking information. Used when a matching node
754// fails to find a match and needs to try an alternative.
755//
756// Conceptual regular expression execution model:
757//
758// There is a simple conceptual model of regular expression execution
759// which will be presented first. The actual code generated is a more
760// efficient simulation of the simple conceptual model:
761//
762// * Choice nodes are implemented as follows:
763// For each choice except the last {
764// push current position
765// push backtrack code location
766// <generate code to test for choice>
767// backtrack code location:
768// pop current position
769// }
770// <generate code to test for last choice>
771//
772// * Actions nodes are generated as follows
773// <push affected registers on backtrack stack>
774// <generate code to perform action>
775// push backtrack code location
776// <generate code to test for following nodes>
777// backtrack code location:
778// <pop affected registers to restore their state>
779// <pop backtrack location from stack and go to it>
780//
781// * Matching nodes are generated as follows:
782// if input string matches at current position
783// update current position
784// <generate code to test for following nodes>
785// else
786// <pop backtrack location from stack and go to it>
787//
788// Thus it can be seen that the current position is saved and restored
789// by the choice nodes, whereas the registers are saved and restored by
790// by the action nodes that manipulate them.
791//
792// The other interesting aspect of this model is that nodes are generated
793// at the point where they are needed by a recursive call to Emit(). If
794// the node has already been code generated then the Emit() call will
795// generate a jump to the previously generated code instead. In order to
796// limit recursion it is possible for the Emit() function to put the node
797// on a work list for later generation and instead generate a jump. The
798// destination of the jump is resolved later when the code is generated.
799//
800// Actual regular expression code generation.
801//
802// Code generation is actually more complicated than the above. In order
803// to improve the efficiency of the generated code some optimizations are
804// performed
805//
806// * Choice nodes have 1-character lookahead.
807// A choice node looks at the following character and eliminates some of
808// the choices immediately based on that character. This is not yet
809// implemented.
810// * Simple greedy loops store reduced backtracking information.
811// A quantifier like /.*foo/m will greedily match the whole input. It will
812// then need to backtrack to a point where it can match "foo". The naive
813// implementation of this would push each character position onto the
814// backtracking stack, then pop them off one by one. This would use space
815// proportional to the length of the input string. However since the "."
816// can only match in one way and always has a constant length (in this case
817// of 1) it suffices to store the current position on the top of the stack
818// once. Matching now becomes merely incrementing the current position and
819// backtracking becomes decrementing the current position and checking the
820// result against the stored current position. This is faster and saves
821// space.
822// * The current state is virtualized.
823// This is used to defer expensive operations until it is clear that they
824// are needed and to generate code for a node more than once, allowing
825// specialized an efficient versions of the code to be created. This is
826// explained in the section below.
827//
828// Execution state virtualization.
829//
830// Instead of emitting code, nodes that manipulate the state can record their
831// manipulation in an object called the Trace. The Trace object can record a
832// current position offset, an optional backtrack code location on the top of
833// the virtualized backtrack stack and some register changes. When a node is
834// to be emitted it can flush the Trace or update it. Flushing the Trace
835// will emit code to bring the actual state into line with the virtual state.
836// Avoiding flushing the state can postpone some work (e.g. updates of capture
837// registers). Postponing work can save time when executing the regular
838// expression since it may be found that the work never has to be done as a
839// failure to match can occur. In addition it is much faster to jump to a
840// known backtrack code location than it is to pop an unknown backtrack
841// location from the stack and jump there.
842//
843// The virtual state found in the Trace affects code generation. For example
844// the virtual state contains the difference between the actual current
845// position and the virtual current position, and matching code needs to use
846// this offset to attempt a match in the correct location of the input
847// string. Therefore code generated for a non-trivial trace is specialized
848// to that trace. The code generator therefore has the ability to generate
849// code for each node several times. In order to limit the size of the
850// generated code there is an arbitrary limit on how many specialized sets of
851// code may be generated for a given node. If the limit is reached, the
852// trace is flushed and a generic version of the code for a node is emitted.
853// This is subsequently used for that node. The code emitted for non-generic
854// trace is not recorded in the node and so it cannot currently be reused in
855// the event that code generation is requested for an identical trace.
856
857
858void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
859 UNREACHABLE();
860}
861
862
863void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
864 text->AddElement(TextElement::Atom(this), zone);
865}
866
867
868void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
869 text->AddElement(TextElement::CharClass(this), zone);
870}
871
872
873void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
874 for (int i = 0; i < elements()->length(); i++)
875 text->AddElement(elements()->at(i), zone);
876}
877
878
879TextElement TextElement::Atom(RegExpAtom* atom) {
880 return TextElement(ATOM, atom);
881}
882
883
884TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
885 return TextElement(CHAR_CLASS, char_class);
886}
887
888
889int TextElement::length() const {
890 switch (text_type()) {
891 case ATOM:
892 return atom()->length();
893
894 case CHAR_CLASS:
895 return 1;
896 }
897 UNREACHABLE();
898 return 0;
899}
900
901
902DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
903 if (table_ == NULL) {
904 table_ = new(zone()) DispatchTable(zone());
905 DispatchTableConstructor cons(table_, ignore_case, zone());
906 cons.BuildTable(this);
907 }
908 return table_;
909}
910
911
912class FrequencyCollator {
913 public:
914 FrequencyCollator() : total_samples_(0) {
915 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
916 frequencies_[i] = CharacterFrequency(i);
917 }
918 }
919
920 void CountCharacter(int character) {
921 int index = (character & RegExpMacroAssembler::kTableMask);
922 frequencies_[index].Increment();
923 total_samples_++;
924 }
925
926 // Does not measure in percent, but rather per-128 (the table size from the
927 // regexp macro assembler).
928 int Frequency(int in_character) {
929 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
930 if (total_samples_ < 1) return 1; // Division by zero.
931 int freq_in_per128 =
932 (frequencies_[in_character].counter() * 128) / total_samples_;
933 return freq_in_per128;
934 }
935
936 private:
937 class CharacterFrequency {
938 public:
939 CharacterFrequency() : counter_(0), character_(-1) { }
940 explicit CharacterFrequency(int character)
941 : counter_(0), character_(character) { }
942
943 void Increment() { counter_++; }
944 int counter() { return counter_; }
945 int character() { return character_; }
946
947 private:
948 int counter_;
949 int character_;
950 };
951
952
953 private:
954 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
955 int total_samples_;
956};
957
958
959class RegExpCompiler {
960 public:
961 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100962 JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000963
964 int AllocateRegister() {
965 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
966 reg_exp_too_big_ = true;
967 return next_register_;
968 }
969 return next_register_++;
970 }
971
Ben Murdoch097c5b22016-05-18 11:27:45 +0100972 // Lookarounds to match lone surrogates for unicode character class matches
973 // are never nested. We can therefore reuse registers.
974 int UnicodeLookaroundStackRegister() {
975 if (unicode_lookaround_stack_register_ == kNoRegister) {
976 unicode_lookaround_stack_register_ = AllocateRegister();
977 }
978 return unicode_lookaround_stack_register_;
979 }
980
981 int UnicodeLookaroundPositionRegister() {
982 if (unicode_lookaround_position_register_ == kNoRegister) {
983 unicode_lookaround_position_register_ = AllocateRegister();
984 }
985 return unicode_lookaround_position_register_;
986 }
987
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000988 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
989 RegExpNode* start,
990 int capture_count,
991 Handle<String> pattern);
992
993 inline void AddWork(RegExpNode* node) {
994 if (!node->on_work_list() && !node->label()->is_bound()) {
995 node->set_on_work_list(true);
996 work_list_->Add(node);
997 }
998 }
999
1000 static const int kImplementationOffset = 0;
1001 static const int kNumberOfRegistersOffset = 0;
1002 static const int kCodeOffset = 1;
1003
1004 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1005 EndNode* accept() { return accept_; }
1006
1007 static const int kMaxRecursion = 100;
1008 inline int recursion_depth() { return recursion_depth_; }
1009 inline void IncrementRecursionDepth() { recursion_depth_++; }
1010 inline void DecrementRecursionDepth() { recursion_depth_--; }
1011
1012 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1013
Ben Murdoch097c5b22016-05-18 11:27:45 +01001014 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1015 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001016 inline bool one_byte() { return one_byte_; }
1017 inline bool optimize() { return optimize_; }
1018 inline void set_optimize(bool value) { optimize_ = value; }
1019 inline bool limiting_recursion() { return limiting_recursion_; }
1020 inline void set_limiting_recursion(bool value) {
1021 limiting_recursion_ = value;
1022 }
1023 bool read_backward() { return read_backward_; }
1024 void set_read_backward(bool value) { read_backward_ = value; }
1025 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1026
1027 int current_expansion_factor() { return current_expansion_factor_; }
1028 void set_current_expansion_factor(int value) {
1029 current_expansion_factor_ = value;
1030 }
1031
1032 Isolate* isolate() const { return isolate_; }
1033 Zone* zone() const { return zone_; }
1034
1035 static const int kNoRegister = -1;
1036
1037 private:
1038 EndNode* accept_;
1039 int next_register_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001040 int unicode_lookaround_stack_register_;
1041 int unicode_lookaround_position_register_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001042 List<RegExpNode*>* work_list_;
1043 int recursion_depth_;
1044 RegExpMacroAssembler* macro_assembler_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001045 JSRegExp::Flags flags_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001046 bool one_byte_;
1047 bool reg_exp_too_big_;
1048 bool limiting_recursion_;
1049 bool optimize_;
1050 bool read_backward_;
1051 int current_expansion_factor_;
1052 FrequencyCollator frequency_collator_;
1053 Isolate* isolate_;
1054 Zone* zone_;
1055};
1056
1057
1058class RecursionCheck {
1059 public:
1060 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1061 compiler->IncrementRecursionDepth();
1062 }
1063 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1064 private:
1065 RegExpCompiler* compiler_;
1066};
1067
1068
1069static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1070 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1071}
1072
1073
1074// Attempts to compile the regexp using an Irregexp code generator. Returns
1075// a fixed array or a null handle depending on whether it succeeded.
1076RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001077 JSRegExp::Flags flags, bool one_byte)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001078 : next_register_(2 * (capture_count + 1)),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001079 unicode_lookaround_stack_register_(kNoRegister),
1080 unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001081 work_list_(NULL),
1082 recursion_depth_(0),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001083 flags_(flags),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001084 one_byte_(one_byte),
1085 reg_exp_too_big_(false),
1086 limiting_recursion_(false),
1087 optimize_(FLAG_regexp_optimization),
1088 read_backward_(false),
1089 current_expansion_factor_(1),
1090 frequency_collator_(),
1091 isolate_(isolate),
1092 zone_(zone) {
1093 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1094 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1095}
1096
1097
1098RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1099 RegExpMacroAssembler* macro_assembler,
1100 RegExpNode* start,
1101 int capture_count,
1102 Handle<String> pattern) {
1103 Heap* heap = pattern->GetHeap();
1104
1105#ifdef DEBUG
1106 if (FLAG_trace_regexp_assembler)
1107 macro_assembler_ =
1108 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1109 else
1110#endif
1111 macro_assembler_ = macro_assembler;
1112
1113 List <RegExpNode*> work_list(0);
1114 work_list_ = &work_list;
1115 Label fail;
1116 macro_assembler_->PushBacktrack(&fail);
1117 Trace new_trace;
1118 start->Emit(this, &new_trace);
1119 macro_assembler_->Bind(&fail);
1120 macro_assembler_->Fail();
1121 while (!work_list.is_empty()) {
1122 RegExpNode* node = work_list.RemoveLast();
1123 node->set_on_work_list(false);
1124 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1125 }
1126 if (reg_exp_too_big_) {
1127 macro_assembler_->AbortedCodeGeneration();
1128 return IrregexpRegExpTooBig(isolate_);
1129 }
1130
1131 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1132 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1133 work_list_ = NULL;
1134#ifdef ENABLE_DISASSEMBLER
1135 if (FLAG_print_code) {
1136 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1137 OFStream os(trace_scope.file());
1138 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1139 }
1140#endif
1141#ifdef DEBUG
1142 if (FLAG_trace_regexp_assembler) {
1143 delete macro_assembler_;
1144 }
1145#endif
1146 return RegExpEngine::CompilationResult(*code, next_register_);
1147}
1148
1149
1150bool Trace::DeferredAction::Mentions(int that) {
1151 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1152 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1153 return range.Contains(that);
1154 } else {
1155 return reg() == that;
1156 }
1157}
1158
1159
1160bool Trace::mentions_reg(int reg) {
1161 for (DeferredAction* action = actions_;
1162 action != NULL;
1163 action = action->next()) {
1164 if (action->Mentions(reg))
1165 return true;
1166 }
1167 return false;
1168}
1169
1170
1171bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1172 DCHECK_EQ(0, *cp_offset);
1173 for (DeferredAction* action = actions_;
1174 action != NULL;
1175 action = action->next()) {
1176 if (action->Mentions(reg)) {
1177 if (action->action_type() == ActionNode::STORE_POSITION) {
1178 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1179 return true;
1180 } else {
1181 return false;
1182 }
1183 }
1184 }
1185 return false;
1186}
1187
1188
1189int Trace::FindAffectedRegisters(OutSet* affected_registers,
1190 Zone* zone) {
1191 int max_register = RegExpCompiler::kNoRegister;
1192 for (DeferredAction* action = actions_;
1193 action != NULL;
1194 action = action->next()) {
1195 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1196 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1197 for (int i = range.from(); i <= range.to(); i++)
1198 affected_registers->Set(i, zone);
1199 if (range.to() > max_register) max_register = range.to();
1200 } else {
1201 affected_registers->Set(action->reg(), zone);
1202 if (action->reg() > max_register) max_register = action->reg();
1203 }
1204 }
1205 return max_register;
1206}
1207
1208
1209void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1210 int max_register,
1211 const OutSet& registers_to_pop,
1212 const OutSet& registers_to_clear) {
1213 for (int reg = max_register; reg >= 0; reg--) {
1214 if (registers_to_pop.Get(reg)) {
1215 assembler->PopRegister(reg);
1216 } else if (registers_to_clear.Get(reg)) {
1217 int clear_to = reg;
1218 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1219 reg--;
1220 }
1221 assembler->ClearRegisters(reg, clear_to);
1222 }
1223 }
1224}
1225
1226
1227void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1228 int max_register,
1229 const OutSet& affected_registers,
1230 OutSet* registers_to_pop,
1231 OutSet* registers_to_clear,
1232 Zone* zone) {
1233 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1234 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1235
1236 // Count pushes performed to force a stack limit check occasionally.
1237 int pushes = 0;
1238
1239 for (int reg = 0; reg <= max_register; reg++) {
1240 if (!affected_registers.Get(reg)) {
1241 continue;
1242 }
1243
1244 // The chronologically first deferred action in the trace
1245 // is used to infer the action needed to restore a register
1246 // to its previous state (or not, if it's safe to ignore it).
1247 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1248 DeferredActionUndoType undo_action = IGNORE;
1249
1250 int value = 0;
1251 bool absolute = false;
1252 bool clear = false;
1253 static const int kNoStore = kMinInt;
1254 int store_position = kNoStore;
1255 // This is a little tricky because we are scanning the actions in reverse
1256 // historical order (newest first).
1257 for (DeferredAction* action = actions_;
1258 action != NULL;
1259 action = action->next()) {
1260 if (action->Mentions(reg)) {
1261 switch (action->action_type()) {
1262 case ActionNode::SET_REGISTER: {
1263 Trace::DeferredSetRegister* psr =
1264 static_cast<Trace::DeferredSetRegister*>(action);
1265 if (!absolute) {
1266 value += psr->value();
1267 absolute = true;
1268 }
1269 // SET_REGISTER is currently only used for newly introduced loop
1270 // counters. They can have a significant previous value if they
1271 // occour in a loop. TODO(lrn): Propagate this information, so
1272 // we can set undo_action to IGNORE if we know there is no value to
1273 // restore.
1274 undo_action = RESTORE;
1275 DCHECK_EQ(store_position, kNoStore);
1276 DCHECK(!clear);
1277 break;
1278 }
1279 case ActionNode::INCREMENT_REGISTER:
1280 if (!absolute) {
1281 value++;
1282 }
1283 DCHECK_EQ(store_position, kNoStore);
1284 DCHECK(!clear);
1285 undo_action = RESTORE;
1286 break;
1287 case ActionNode::STORE_POSITION: {
1288 Trace::DeferredCapture* pc =
1289 static_cast<Trace::DeferredCapture*>(action);
1290 if (!clear && store_position == kNoStore) {
1291 store_position = pc->cp_offset();
1292 }
1293
1294 // For captures we know that stores and clears alternate.
1295 // Other register, are never cleared, and if the occur
1296 // inside a loop, they might be assigned more than once.
1297 if (reg <= 1) {
1298 // Registers zero and one, aka "capture zero", is
1299 // always set correctly if we succeed. There is no
1300 // need to undo a setting on backtrack, because we
1301 // will set it again or fail.
1302 undo_action = IGNORE;
1303 } else {
1304 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1305 }
1306 DCHECK(!absolute);
1307 DCHECK_EQ(value, 0);
1308 break;
1309 }
1310 case ActionNode::CLEAR_CAPTURES: {
1311 // Since we're scanning in reverse order, if we've already
1312 // set the position we have to ignore historically earlier
1313 // clearing operations.
1314 if (store_position == kNoStore) {
1315 clear = true;
1316 }
1317 undo_action = RESTORE;
1318 DCHECK(!absolute);
1319 DCHECK_EQ(value, 0);
1320 break;
1321 }
1322 default:
1323 UNREACHABLE();
1324 break;
1325 }
1326 }
1327 }
1328 // Prepare for the undo-action (e.g., push if it's going to be popped).
1329 if (undo_action == RESTORE) {
1330 pushes++;
1331 RegExpMacroAssembler::StackCheckFlag stack_check =
1332 RegExpMacroAssembler::kNoStackLimitCheck;
1333 if (pushes == push_limit) {
1334 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1335 pushes = 0;
1336 }
1337
1338 assembler->PushRegister(reg, stack_check);
1339 registers_to_pop->Set(reg, zone);
1340 } else if (undo_action == CLEAR) {
1341 registers_to_clear->Set(reg, zone);
1342 }
1343 // Perform the chronologically last action (or accumulated increment)
1344 // for the register.
1345 if (store_position != kNoStore) {
1346 assembler->WriteCurrentPositionToRegister(reg, store_position);
1347 } else if (clear) {
1348 assembler->ClearRegisters(reg, reg);
1349 } else if (absolute) {
1350 assembler->SetRegister(reg, value);
1351 } else if (value != 0) {
1352 assembler->AdvanceRegister(reg, value);
1353 }
1354 }
1355}
1356
1357
1358// This is called as we come into a loop choice node and some other tricky
1359// nodes. It normalizes the state of the code generator to ensure we can
1360// generate generic code.
1361void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1362 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1363
1364 DCHECK(!is_trivial());
1365
1366 if (actions_ == NULL && backtrack() == NULL) {
1367 // Here we just have some deferred cp advances to fix and we are back to
1368 // a normal situation. We may also have to forget some information gained
1369 // through a quick check that was already performed.
1370 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1371 // Create a new trivial state and generate the node with that.
1372 Trace new_state;
1373 successor->Emit(compiler, &new_state);
1374 return;
1375 }
1376
1377 // Generate deferred actions here along with code to undo them again.
1378 OutSet affected_registers;
1379
1380 if (backtrack() != NULL) {
1381 // Here we have a concrete backtrack location. These are set up by choice
1382 // nodes and so they indicate that we have a deferred save of the current
1383 // position which we may need to emit here.
1384 assembler->PushCurrentPosition();
1385 }
1386
1387 int max_register = FindAffectedRegisters(&affected_registers,
1388 compiler->zone());
1389 OutSet registers_to_pop;
1390 OutSet registers_to_clear;
1391 PerformDeferredActions(assembler,
1392 max_register,
1393 affected_registers,
1394 &registers_to_pop,
1395 &registers_to_clear,
1396 compiler->zone());
1397 if (cp_offset_ != 0) {
1398 assembler->AdvanceCurrentPosition(cp_offset_);
1399 }
1400
1401 // Create a new trivial state and generate the node with that.
1402 Label undo;
1403 assembler->PushBacktrack(&undo);
1404 if (successor->KeepRecursing(compiler)) {
1405 Trace new_state;
1406 successor->Emit(compiler, &new_state);
1407 } else {
1408 compiler->AddWork(successor);
1409 assembler->GoTo(successor->label());
1410 }
1411
1412 // On backtrack we need to restore state.
1413 assembler->Bind(&undo);
1414 RestoreAffectedRegisters(assembler,
1415 max_register,
1416 registers_to_pop,
1417 registers_to_clear);
1418 if (backtrack() == NULL) {
1419 assembler->Backtrack();
1420 } else {
1421 assembler->PopCurrentPosition();
1422 assembler->GoTo(backtrack());
1423 }
1424}
1425
1426
1427void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1428 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1429
1430 // Omit flushing the trace. We discard the entire stack frame anyway.
1431
1432 if (!label()->is_bound()) {
1433 // We are completely independent of the trace, since we ignore it,
1434 // so this code can be used as the generic version.
1435 assembler->Bind(label());
1436 }
1437
1438 // Throw away everything on the backtrack stack since the start
1439 // of the negative submatch and restore the character position.
1440 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1441 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1442 if (clear_capture_count_ > 0) {
1443 // Clear any captures that might have been performed during the success
1444 // of the body of the negative look-ahead.
1445 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1446 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1447 }
1448 // Now that we have unwound the stack we find at the top of the stack the
1449 // backtrack that the BeginSubmatch node got.
1450 assembler->Backtrack();
1451}
1452
1453
1454void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1455 if (!trace->is_trivial()) {
1456 trace->Flush(compiler, this);
1457 return;
1458 }
1459 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1460 if (!label()->is_bound()) {
1461 assembler->Bind(label());
1462 }
1463 switch (action_) {
1464 case ACCEPT:
1465 assembler->Succeed();
1466 return;
1467 case BACKTRACK:
1468 assembler->GoTo(trace->backtrack());
1469 return;
1470 case NEGATIVE_SUBMATCH_SUCCESS:
1471 // This case is handled in a different virtual method.
1472 UNREACHABLE();
1473 }
1474 UNIMPLEMENTED();
1475}
1476
1477
1478void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1479 if (guards_ == NULL)
1480 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1481 guards_->Add(guard, zone);
1482}
1483
1484
1485ActionNode* ActionNode::SetRegister(int reg,
1486 int val,
1487 RegExpNode* on_success) {
1488 ActionNode* result =
1489 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1490 result->data_.u_store_register.reg = reg;
1491 result->data_.u_store_register.value = val;
1492 return result;
1493}
1494
1495
1496ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1497 ActionNode* result =
1498 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1499 result->data_.u_increment_register.reg = reg;
1500 return result;
1501}
1502
1503
1504ActionNode* ActionNode::StorePosition(int reg,
1505 bool is_capture,
1506 RegExpNode* on_success) {
1507 ActionNode* result =
1508 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1509 result->data_.u_position_register.reg = reg;
1510 result->data_.u_position_register.is_capture = is_capture;
1511 return result;
1512}
1513
1514
1515ActionNode* ActionNode::ClearCaptures(Interval range,
1516 RegExpNode* on_success) {
1517 ActionNode* result =
1518 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1519 result->data_.u_clear_captures.range_from = range.from();
1520 result->data_.u_clear_captures.range_to = range.to();
1521 return result;
1522}
1523
1524
1525ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1526 int position_reg,
1527 RegExpNode* on_success) {
1528 ActionNode* result =
1529 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1530 result->data_.u_submatch.stack_pointer_register = stack_reg;
1531 result->data_.u_submatch.current_position_register = position_reg;
1532 return result;
1533}
1534
1535
1536ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1537 int position_reg,
1538 int clear_register_count,
1539 int clear_register_from,
1540 RegExpNode* on_success) {
1541 ActionNode* result =
1542 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1543 result->data_.u_submatch.stack_pointer_register = stack_reg;
1544 result->data_.u_submatch.current_position_register = position_reg;
1545 result->data_.u_submatch.clear_register_count = clear_register_count;
1546 result->data_.u_submatch.clear_register_from = clear_register_from;
1547 return result;
1548}
1549
1550
1551ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1552 int repetition_register,
1553 int repetition_limit,
1554 RegExpNode* on_success) {
1555 ActionNode* result =
1556 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1557 result->data_.u_empty_match_check.start_register = start_register;
1558 result->data_.u_empty_match_check.repetition_register = repetition_register;
1559 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1560 return result;
1561}
1562
1563
1564#define DEFINE_ACCEPT(Type) \
1565 void Type##Node::Accept(NodeVisitor* visitor) { \
1566 visitor->Visit##Type(this); \
1567 }
1568FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1569#undef DEFINE_ACCEPT
1570
1571
1572void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1573 visitor->VisitLoopChoice(this);
1574}
1575
1576
1577// -------------------------------------------------------------------
1578// Emit code.
1579
1580
1581void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1582 Guard* guard,
1583 Trace* trace) {
1584 switch (guard->op()) {
1585 case Guard::LT:
1586 DCHECK(!trace->mentions_reg(guard->reg()));
1587 macro_assembler->IfRegisterGE(guard->reg(),
1588 guard->value(),
1589 trace->backtrack());
1590 break;
1591 case Guard::GEQ:
1592 DCHECK(!trace->mentions_reg(guard->reg()));
1593 macro_assembler->IfRegisterLT(guard->reg(),
1594 guard->value(),
1595 trace->backtrack());
1596 break;
1597 }
1598}
1599
1600
1601// Returns the number of characters in the equivalence class, omitting those
1602// that cannot occur in the source string because it is Latin1.
1603static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1604 bool one_byte_subject,
1605 unibrow::uchar* letters) {
1606 int length =
1607 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1608 // Unibrow returns 0 or 1 for characters where case independence is
1609 // trivial.
1610 if (length == 0) {
1611 letters[0] = character;
1612 length = 1;
1613 }
1614
1615 if (one_byte_subject) {
1616 int new_length = 0;
1617 for (int i = 0; i < length; i++) {
1618 if (letters[i] <= String::kMaxOneByteCharCode) {
1619 letters[new_length++] = letters[i];
1620 }
1621 }
1622 length = new_length;
1623 }
1624
1625 return length;
1626}
1627
1628
1629static inline bool EmitSimpleCharacter(Isolate* isolate,
1630 RegExpCompiler* compiler,
1631 uc16 c,
1632 Label* on_failure,
1633 int cp_offset,
1634 bool check,
1635 bool preloaded) {
1636 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1637 bool bound_checked = false;
1638 if (!preloaded) {
1639 assembler->LoadCurrentCharacter(
1640 cp_offset,
1641 on_failure,
1642 check);
1643 bound_checked = true;
1644 }
1645 assembler->CheckNotCharacter(c, on_failure);
1646 return bound_checked;
1647}
1648
1649
1650// Only emits non-letters (things that don't have case). Only used for case
1651// independent matches.
1652static inline bool EmitAtomNonLetter(Isolate* isolate,
1653 RegExpCompiler* compiler,
1654 uc16 c,
1655 Label* on_failure,
1656 int cp_offset,
1657 bool check,
1658 bool preloaded) {
1659 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1660 bool one_byte = compiler->one_byte();
1661 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1662 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1663 if (length < 1) {
1664 // This can't match. Must be an one-byte subject and a non-one-byte
1665 // character. We do not need to do anything since the one-byte pass
1666 // already handled this.
1667 return false; // Bounds not checked.
1668 }
1669 bool checked = false;
1670 // We handle the length > 1 case in a later pass.
1671 if (length == 1) {
1672 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1673 // Can't match - see above.
1674 return false; // Bounds not checked.
1675 }
1676 if (!preloaded) {
1677 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1678 checked = check;
1679 }
1680 macro_assembler->CheckNotCharacter(c, on_failure);
1681 }
1682 return checked;
1683}
1684
1685
1686static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1687 bool one_byte, uc16 c1, uc16 c2,
1688 Label* on_failure) {
1689 uc16 char_mask;
1690 if (one_byte) {
1691 char_mask = String::kMaxOneByteCharCode;
1692 } else {
1693 char_mask = String::kMaxUtf16CodeUnit;
1694 }
1695 uc16 exor = c1 ^ c2;
1696 // Check whether exor has only one bit set.
1697 if (((exor - 1) & exor) == 0) {
1698 // If c1 and c2 differ only by one bit.
1699 // Ecma262UnCanonicalize always gives the highest number last.
1700 DCHECK(c2 > c1);
1701 uc16 mask = char_mask ^ exor;
1702 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1703 return true;
1704 }
1705 DCHECK(c2 > c1);
1706 uc16 diff = c2 - c1;
1707 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1708 // If the characters differ by 2^n but don't differ by one bit then
1709 // subtract the difference from the found character, then do the or
1710 // trick. We avoid the theoretical case where negative numbers are
1711 // involved in order to simplify code generation.
1712 uc16 mask = char_mask ^ diff;
1713 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1714 diff,
1715 mask,
1716 on_failure);
1717 return true;
1718 }
1719 return false;
1720}
1721
1722
1723typedef bool EmitCharacterFunction(Isolate* isolate,
1724 RegExpCompiler* compiler,
1725 uc16 c,
1726 Label* on_failure,
1727 int cp_offset,
1728 bool check,
1729 bool preloaded);
1730
1731// Only emits letters (things that have case). Only used for case independent
1732// matches.
1733static inline bool EmitAtomLetter(Isolate* isolate,
1734 RegExpCompiler* compiler,
1735 uc16 c,
1736 Label* on_failure,
1737 int cp_offset,
1738 bool check,
1739 bool preloaded) {
1740 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1741 bool one_byte = compiler->one_byte();
1742 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1743 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1744 if (length <= 1) return false;
1745 // We may not need to check against the end of the input string
1746 // if this character lies before a character that matched.
1747 if (!preloaded) {
1748 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1749 }
1750 Label ok;
1751 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1752 switch (length) {
1753 case 2: {
1754 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1755 chars[1], on_failure)) {
1756 } else {
1757 macro_assembler->CheckCharacter(chars[0], &ok);
1758 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1759 macro_assembler->Bind(&ok);
1760 }
1761 break;
1762 }
1763 case 4:
1764 macro_assembler->CheckCharacter(chars[3], &ok);
1765 // Fall through!
1766 case 3:
1767 macro_assembler->CheckCharacter(chars[0], &ok);
1768 macro_assembler->CheckCharacter(chars[1], &ok);
1769 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1770 macro_assembler->Bind(&ok);
1771 break;
1772 default:
1773 UNREACHABLE();
1774 break;
1775 }
1776 return true;
1777}
1778
1779
1780static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1781 int border,
1782 Label* fall_through,
1783 Label* above_or_equal,
1784 Label* below) {
1785 if (below != fall_through) {
1786 masm->CheckCharacterLT(border, below);
1787 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1788 } else {
1789 masm->CheckCharacterGT(border - 1, above_or_equal);
1790 }
1791}
1792
1793
1794static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1795 int first,
1796 int last,
1797 Label* fall_through,
1798 Label* in_range,
1799 Label* out_of_range) {
1800 if (in_range == fall_through) {
1801 if (first == last) {
1802 masm->CheckNotCharacter(first, out_of_range);
1803 } else {
1804 masm->CheckCharacterNotInRange(first, last, out_of_range);
1805 }
1806 } else {
1807 if (first == last) {
1808 masm->CheckCharacter(first, in_range);
1809 } else {
1810 masm->CheckCharacterInRange(first, last, in_range);
1811 }
1812 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1813 }
1814}
1815
1816
1817// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1818// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1819static void EmitUseLookupTable(
1820 RegExpMacroAssembler* masm,
1821 ZoneList<int>* ranges,
1822 int start_index,
1823 int end_index,
1824 int min_char,
1825 Label* fall_through,
1826 Label* even_label,
1827 Label* odd_label) {
1828 static const int kSize = RegExpMacroAssembler::kTableSize;
1829 static const int kMask = RegExpMacroAssembler::kTableMask;
1830
1831 int base = (min_char & ~kMask);
1832 USE(base);
1833
1834 // Assert that everything is on one kTableSize page.
1835 for (int i = start_index; i <= end_index; i++) {
1836 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1837 }
1838 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1839
1840 char templ[kSize];
1841 Label* on_bit_set;
1842 Label* on_bit_clear;
1843 int bit;
1844 if (even_label == fall_through) {
1845 on_bit_set = odd_label;
1846 on_bit_clear = even_label;
1847 bit = 1;
1848 } else {
1849 on_bit_set = even_label;
1850 on_bit_clear = odd_label;
1851 bit = 0;
1852 }
1853 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1854 templ[i] = bit;
1855 }
1856 int j = 0;
1857 bit ^= 1;
1858 for (int i = start_index; i < end_index; i++) {
1859 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1860 templ[j] = bit;
1861 }
1862 bit ^= 1;
1863 }
1864 for (int i = j; i < kSize; i++) {
1865 templ[i] = bit;
1866 }
1867 Factory* factory = masm->isolate()->factory();
1868 // TODO(erikcorry): Cache these.
1869 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1870 for (int i = 0; i < kSize; i++) {
1871 ba->set(i, templ[i]);
1872 }
1873 masm->CheckBitInTable(ba, on_bit_set);
1874 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1875}
1876
1877
1878static void CutOutRange(RegExpMacroAssembler* masm,
1879 ZoneList<int>* ranges,
1880 int start_index,
1881 int end_index,
1882 int cut_index,
1883 Label* even_label,
1884 Label* odd_label) {
1885 bool odd = (((cut_index - start_index) & 1) == 1);
1886 Label* in_range_label = odd ? odd_label : even_label;
1887 Label dummy;
1888 EmitDoubleBoundaryTest(masm,
1889 ranges->at(cut_index),
1890 ranges->at(cut_index + 1) - 1,
1891 &dummy,
1892 in_range_label,
1893 &dummy);
1894 DCHECK(!dummy.is_linked());
1895 // Cut out the single range by rewriting the array. This creates a new
1896 // range that is a merger of the two ranges on either side of the one we
1897 // are cutting out. The oddity of the labels is preserved.
1898 for (int j = cut_index; j > start_index; j--) {
1899 ranges->at(j) = ranges->at(j - 1);
1900 }
1901 for (int j = cut_index + 1; j < end_index; j++) {
1902 ranges->at(j) = ranges->at(j + 1);
1903 }
1904}
1905
1906
1907// Unicode case. Split the search space into kSize spaces that are handled
1908// with recursion.
1909static void SplitSearchSpace(ZoneList<int>* ranges,
1910 int start_index,
1911 int end_index,
1912 int* new_start_index,
1913 int* new_end_index,
1914 int* border) {
1915 static const int kSize = RegExpMacroAssembler::kTableSize;
1916 static const int kMask = RegExpMacroAssembler::kTableMask;
1917
1918 int first = ranges->at(start_index);
1919 int last = ranges->at(end_index) - 1;
1920
1921 *new_start_index = start_index;
1922 *border = (ranges->at(start_index) & ~kMask) + kSize;
1923 while (*new_start_index < end_index) {
1924 if (ranges->at(*new_start_index) > *border) break;
1925 (*new_start_index)++;
1926 }
1927 // new_start_index is the index of the first edge that is beyond the
1928 // current kSize space.
1929
1930 // For very large search spaces we do a binary chop search of the non-Latin1
1931 // space instead of just going to the end of the current kSize space. The
1932 // heuristics are complicated a little by the fact that any 128-character
1933 // encoding space can be quickly tested with a table lookup, so we don't
1934 // wish to do binary chop search at a smaller granularity than that. A
1935 // 128-character space can take up a lot of space in the ranges array if,
1936 // for example, we only want to match every second character (eg. the lower
1937 // case characters on some Unicode pages).
1938 int binary_chop_index = (end_index + start_index) / 2;
1939 // The first test ensures that we get to the code that handles the Latin1
1940 // range with a single not-taken branch, speeding up this important
1941 // character range (even non-Latin1 charset-based text has spaces and
1942 // punctuation).
1943 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1944 end_index - start_index > (*new_start_index - start_index) * 2 &&
1945 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1946 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1947 int scan_forward_for_section_border = binary_chop_index;;
1948 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1949
1950 while (scan_forward_for_section_border < end_index) {
1951 if (ranges->at(scan_forward_for_section_border) > new_border) {
1952 *new_start_index = scan_forward_for_section_border;
1953 *border = new_border;
1954 break;
1955 }
1956 scan_forward_for_section_border++;
1957 }
1958 }
1959
1960 DCHECK(*new_start_index > start_index);
1961 *new_end_index = *new_start_index - 1;
1962 if (ranges->at(*new_end_index) == *border) {
1963 (*new_end_index)--;
1964 }
1965 if (*border >= ranges->at(end_index)) {
1966 *border = ranges->at(end_index);
1967 *new_start_index = end_index; // Won't be used.
1968 *new_end_index = end_index - 1;
1969 }
1970}
1971
1972
1973// Gets a series of segment boundaries representing a character class. If the
1974// character is in the range between an even and an odd boundary (counting from
1975// start_index) then go to even_label, otherwise go to odd_label. We already
1976// know that the character is in the range of min_char to max_char inclusive.
1977// Either label can be NULL indicating backtracking. Either label can also be
1978// equal to the fall_through label.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001979static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1980 int start_index, int end_index, uc32 min_char,
1981 uc32 max_char, Label* fall_through,
1982 Label* even_label, Label* odd_label) {
1983 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1984 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1985
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001986 int first = ranges->at(start_index);
1987 int last = ranges->at(end_index) - 1;
1988
1989 DCHECK_LT(min_char, first);
1990
1991 // Just need to test if the character is before or on-or-after
1992 // a particular character.
1993 if (start_index == end_index) {
1994 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1995 return;
1996 }
1997
1998 // Another almost trivial case: There is one interval in the middle that is
1999 // different from the end intervals.
2000 if (start_index + 1 == end_index) {
2001 EmitDoubleBoundaryTest(
2002 masm, first, last, fall_through, even_label, odd_label);
2003 return;
2004 }
2005
2006 // It's not worth using table lookup if there are very few intervals in the
2007 // character class.
2008 if (end_index - start_index <= 6) {
2009 // It is faster to test for individual characters, so we look for those
2010 // first, then try arbitrary ranges in the second round.
2011 static int kNoCutIndex = -1;
2012 int cut = kNoCutIndex;
2013 for (int i = start_index; i < end_index; i++) {
2014 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2015 cut = i;
2016 break;
2017 }
2018 }
2019 if (cut == kNoCutIndex) cut = start_index;
2020 CutOutRange(
2021 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2022 DCHECK_GE(end_index - start_index, 2);
2023 GenerateBranches(masm,
2024 ranges,
2025 start_index + 1,
2026 end_index - 1,
2027 min_char,
2028 max_char,
2029 fall_through,
2030 even_label,
2031 odd_label);
2032 return;
2033 }
2034
2035 // If there are a lot of intervals in the regexp, then we will use tables to
2036 // determine whether the character is inside or outside the character class.
2037 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2038
2039 if ((max_char >> kBits) == (min_char >> kBits)) {
2040 EmitUseLookupTable(masm,
2041 ranges,
2042 start_index,
2043 end_index,
2044 min_char,
2045 fall_through,
2046 even_label,
2047 odd_label);
2048 return;
2049 }
2050
2051 if ((min_char >> kBits) != (first >> kBits)) {
2052 masm->CheckCharacterLT(first, odd_label);
2053 GenerateBranches(masm,
2054 ranges,
2055 start_index + 1,
2056 end_index,
2057 first,
2058 max_char,
2059 fall_through,
2060 odd_label,
2061 even_label);
2062 return;
2063 }
2064
2065 int new_start_index = 0;
2066 int new_end_index = 0;
2067 int border = 0;
2068
2069 SplitSearchSpace(ranges,
2070 start_index,
2071 end_index,
2072 &new_start_index,
2073 &new_end_index,
2074 &border);
2075
2076 Label handle_rest;
2077 Label* above = &handle_rest;
2078 if (border == last + 1) {
2079 // We didn't find any section that started after the limit, so everything
2080 // above the border is one of the terminal labels.
2081 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2082 DCHECK(new_end_index == end_index - 1);
2083 }
2084
2085 DCHECK_LE(start_index, new_end_index);
2086 DCHECK_LE(new_start_index, end_index);
2087 DCHECK_LT(start_index, new_start_index);
2088 DCHECK_LT(new_end_index, end_index);
2089 DCHECK(new_end_index + 1 == new_start_index ||
2090 (new_end_index + 2 == new_start_index &&
2091 border == ranges->at(new_end_index + 1)));
2092 DCHECK_LT(min_char, border - 1);
2093 DCHECK_LT(border, max_char);
2094 DCHECK_LT(ranges->at(new_end_index), border);
2095 DCHECK(border < ranges->at(new_start_index) ||
2096 (border == ranges->at(new_start_index) &&
2097 new_start_index == end_index &&
2098 new_end_index == end_index - 1 &&
2099 border == last + 1));
2100 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2101
2102 masm->CheckCharacterGT(border - 1, above);
2103 Label dummy;
2104 GenerateBranches(masm,
2105 ranges,
2106 start_index,
2107 new_end_index,
2108 min_char,
2109 border - 1,
2110 &dummy,
2111 even_label,
2112 odd_label);
2113 if (handle_rest.is_linked()) {
2114 masm->Bind(&handle_rest);
2115 bool flip = (new_start_index & 1) != (start_index & 1);
2116 GenerateBranches(masm,
2117 ranges,
2118 new_start_index,
2119 end_index,
2120 border,
2121 max_char,
2122 &dummy,
2123 flip ? odd_label : even_label,
2124 flip ? even_label : odd_label);
2125 }
2126}
2127
2128
2129static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2130 RegExpCharacterClass* cc, bool one_byte,
2131 Label* on_failure, int cp_offset, bool check_offset,
2132 bool preloaded, Zone* zone) {
2133 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch097c5b22016-05-18 11:27:45 +01002134 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002135
2136 int max_char;
2137 if (one_byte) {
2138 max_char = String::kMaxOneByteCharCode;
2139 } else {
2140 max_char = String::kMaxUtf16CodeUnit;
2141 }
2142
2143 int range_count = ranges->length();
2144
2145 int last_valid_range = range_count - 1;
2146 while (last_valid_range >= 0) {
2147 CharacterRange& range = ranges->at(last_valid_range);
2148 if (range.from() <= max_char) {
2149 break;
2150 }
2151 last_valid_range--;
2152 }
2153
2154 if (last_valid_range < 0) {
2155 if (!cc->is_negated()) {
2156 macro_assembler->GoTo(on_failure);
2157 }
2158 if (check_offset) {
2159 macro_assembler->CheckPosition(cp_offset, on_failure);
2160 }
2161 return;
2162 }
2163
2164 if (last_valid_range == 0 &&
2165 ranges->at(0).IsEverything(max_char)) {
2166 if (cc->is_negated()) {
2167 macro_assembler->GoTo(on_failure);
2168 } else {
2169 // This is a common case hit by non-anchored expressions.
2170 if (check_offset) {
2171 macro_assembler->CheckPosition(cp_offset, on_failure);
2172 }
2173 }
2174 return;
2175 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002176
2177 if (!preloaded) {
2178 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2179 }
2180
2181 if (cc->is_standard(zone) &&
Ben Murdoch097c5b22016-05-18 11:27:45 +01002182 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2183 on_failure)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002184 return;
2185 }
2186
2187
2188 // A new list with ascending entries. Each entry is a code unit
2189 // where there is a boundary between code units that are part of
2190 // the class and code units that are not. Normally we insert an
2191 // entry at zero which goes to the failure label, but if there
2192 // was already one there we fall through for success on that entry.
2193 // Subsequent entries have alternating meaning (success/failure).
2194 ZoneList<int>* range_boundaries =
2195 new(zone) ZoneList<int>(last_valid_range, zone);
2196
2197 bool zeroth_entry_is_failure = !cc->is_negated();
2198
2199 for (int i = 0; i <= last_valid_range; i++) {
2200 CharacterRange& range = ranges->at(i);
2201 if (range.from() == 0) {
2202 DCHECK_EQ(i, 0);
2203 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2204 } else {
2205 range_boundaries->Add(range.from(), zone);
2206 }
2207 range_boundaries->Add(range.to() + 1, zone);
2208 }
2209 int end_index = range_boundaries->length() - 1;
2210 if (range_boundaries->at(end_index) > max_char) {
2211 end_index--;
2212 }
2213
2214 Label fall_through;
2215 GenerateBranches(macro_assembler,
2216 range_boundaries,
2217 0, // start_index.
2218 end_index,
2219 0, // min_char.
2220 max_char,
2221 &fall_through,
2222 zeroth_entry_is_failure ? &fall_through : on_failure,
2223 zeroth_entry_is_failure ? on_failure : &fall_through);
2224 macro_assembler->Bind(&fall_through);
2225}
2226
2227
2228RegExpNode::~RegExpNode() {
2229}
2230
2231
2232RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2233 Trace* trace) {
2234 // If we are generating a greedy loop then don't stop and don't reuse code.
2235 if (trace->stop_node() != NULL) {
2236 return CONTINUE;
2237 }
2238
2239 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2240 if (trace->is_trivial()) {
2241 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2242 // If a generic version is already scheduled to be generated or we have
2243 // recursed too deeply then just generate a jump to that code.
2244 macro_assembler->GoTo(&label_);
2245 // This will queue it up for generation of a generic version if it hasn't
2246 // already been queued.
2247 compiler->AddWork(this);
2248 return DONE;
2249 }
2250 // Generate generic version of the node and bind the label for later use.
2251 macro_assembler->Bind(&label_);
2252 return CONTINUE;
2253 }
2254
2255 // We are being asked to make a non-generic version. Keep track of how many
2256 // non-generic versions we generate so as not to overdo it.
2257 trace_count_++;
2258 if (KeepRecursing(compiler) && compiler->optimize() &&
2259 trace_count_ < kMaxCopiesCodeGenerated) {
2260 return CONTINUE;
2261 }
2262
2263 // If we get here code has been generated for this node too many times or
2264 // recursion is too deep. Time to switch to a generic version. The code for
2265 // generic versions above can handle deep recursion properly.
2266 bool was_limiting = compiler->limiting_recursion();
2267 compiler->set_limiting_recursion(true);
2268 trace->Flush(compiler, this);
2269 compiler->set_limiting_recursion(was_limiting);
2270 return DONE;
2271}
2272
2273
2274bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2275 return !compiler->limiting_recursion() &&
2276 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2277}
2278
2279
2280int ActionNode::EatsAtLeast(int still_to_find,
2281 int budget,
2282 bool not_at_start) {
2283 if (budget <= 0) return 0;
2284 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2285 return on_success()->EatsAtLeast(still_to_find,
2286 budget - 1,
2287 not_at_start);
2288}
2289
2290
2291void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2292 BoyerMooreLookahead* bm, bool not_at_start) {
2293 if (action_type_ == BEGIN_SUBMATCH) {
2294 bm->SetRest(offset);
2295 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2296 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2297 }
2298 SaveBMInfo(bm, not_at_start, offset);
2299}
2300
2301
2302int AssertionNode::EatsAtLeast(int still_to_find,
2303 int budget,
2304 bool not_at_start) {
2305 if (budget <= 0) return 0;
2306 // If we know we are not at the start and we are asked "how many characters
2307 // will you match if you succeed?" then we can answer anything since false
2308 // implies false. So lets just return the max answer (still_to_find) since
2309 // that won't prevent us from preloading a lot of characters for the other
2310 // branches in the node graph.
2311 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2312 return on_success()->EatsAtLeast(still_to_find,
2313 budget - 1,
2314 not_at_start);
2315}
2316
2317
2318void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2319 BoyerMooreLookahead* bm, bool not_at_start) {
2320 // Match the behaviour of EatsAtLeast on this node.
2321 if (assertion_type() == AT_START && not_at_start) return;
2322 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2323 SaveBMInfo(bm, not_at_start, offset);
2324}
2325
2326
2327int BackReferenceNode::EatsAtLeast(int still_to_find,
2328 int budget,
2329 bool not_at_start) {
2330 if (read_backward()) return 0;
2331 if (budget <= 0) return 0;
2332 return on_success()->EatsAtLeast(still_to_find,
2333 budget - 1,
2334 not_at_start);
2335}
2336
2337
2338int TextNode::EatsAtLeast(int still_to_find,
2339 int budget,
2340 bool not_at_start) {
2341 if (read_backward()) return 0;
2342 int answer = Length();
2343 if (answer >= still_to_find) return answer;
2344 if (budget <= 0) return answer;
2345 // We are not at start after this node so we set the last argument to 'true'.
2346 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2347 budget - 1,
2348 true);
2349}
2350
2351
2352int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2353 bool not_at_start) {
2354 if (budget <= 0) return 0;
2355 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2356 // afterwards.
2357 RegExpNode* node = alternatives_->at(1).node();
2358 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2359}
2360
2361
2362void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2363 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2364 bool not_at_start) {
2365 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2366 // afterwards.
2367 RegExpNode* node = alternatives_->at(1).node();
2368 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2369}
2370
2371
2372int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2373 int budget,
2374 RegExpNode* ignore_this_node,
2375 bool not_at_start) {
2376 if (budget <= 0) return 0;
2377 int min = 100;
2378 int choice_count = alternatives_->length();
2379 budget = (budget - 1) / choice_count;
2380 for (int i = 0; i < choice_count; i++) {
2381 RegExpNode* node = alternatives_->at(i).node();
2382 if (node == ignore_this_node) continue;
2383 int node_eats_at_least =
2384 node->EatsAtLeast(still_to_find, budget, not_at_start);
2385 if (node_eats_at_least < min) min = node_eats_at_least;
2386 if (min == 0) return 0;
2387 }
2388 return min;
2389}
2390
2391
2392int LoopChoiceNode::EatsAtLeast(int still_to_find,
2393 int budget,
2394 bool not_at_start) {
2395 return EatsAtLeastHelper(still_to_find,
2396 budget - 1,
2397 loop_node_,
2398 not_at_start);
2399}
2400
2401
2402int ChoiceNode::EatsAtLeast(int still_to_find,
2403 int budget,
2404 bool not_at_start) {
2405 return EatsAtLeastHelper(still_to_find,
2406 budget,
2407 NULL,
2408 not_at_start);
2409}
2410
2411
2412// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2413static inline uint32_t SmearBitsRight(uint32_t v) {
2414 v |= v >> 1;
2415 v |= v >> 2;
2416 v |= v >> 4;
2417 v |= v >> 8;
2418 v |= v >> 16;
2419 return v;
2420}
2421
2422
2423bool QuickCheckDetails::Rationalize(bool asc) {
2424 bool found_useful_op = false;
2425 uint32_t char_mask;
2426 if (asc) {
2427 char_mask = String::kMaxOneByteCharCode;
2428 } else {
2429 char_mask = String::kMaxUtf16CodeUnit;
2430 }
2431 mask_ = 0;
2432 value_ = 0;
2433 int char_shift = 0;
2434 for (int i = 0; i < characters_; i++) {
2435 Position* pos = &positions_[i];
2436 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2437 found_useful_op = true;
2438 }
2439 mask_ |= (pos->mask & char_mask) << char_shift;
2440 value_ |= (pos->value & char_mask) << char_shift;
2441 char_shift += asc ? 8 : 16;
2442 }
2443 return found_useful_op;
2444}
2445
2446
2447bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2448 Trace* bounds_check_trace,
2449 Trace* trace,
2450 bool preload_has_checked_bounds,
2451 Label* on_possible_success,
2452 QuickCheckDetails* details,
2453 bool fall_through_on_failure) {
2454 if (details->characters() == 0) return false;
2455 GetQuickCheckDetails(
2456 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2457 if (details->cannot_match()) return false;
2458 if (!details->Rationalize(compiler->one_byte())) return false;
2459 DCHECK(details->characters() == 1 ||
2460 compiler->macro_assembler()->CanReadUnaligned());
2461 uint32_t mask = details->mask();
2462 uint32_t value = details->value();
2463
2464 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2465
2466 if (trace->characters_preloaded() != details->characters()) {
2467 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2468 // We are attempting to preload the minimum number of characters
2469 // any choice would eat, so if the bounds check fails, then none of the
2470 // choices can succeed, so we can just immediately backtrack, rather
2471 // than go to the next choice.
2472 assembler->LoadCurrentCharacter(trace->cp_offset(),
2473 bounds_check_trace->backtrack(),
2474 !preload_has_checked_bounds,
2475 details->characters());
2476 }
2477
2478
2479 bool need_mask = true;
2480
2481 if (details->characters() == 1) {
2482 // If number of characters preloaded is 1 then we used a byte or 16 bit
2483 // load so the value is already masked down.
2484 uint32_t char_mask;
2485 if (compiler->one_byte()) {
2486 char_mask = String::kMaxOneByteCharCode;
2487 } else {
2488 char_mask = String::kMaxUtf16CodeUnit;
2489 }
2490 if ((mask & char_mask) == char_mask) need_mask = false;
2491 mask &= char_mask;
2492 } else {
2493 // For 2-character preloads in one-byte mode or 1-character preloads in
2494 // two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch097c5b22016-05-18 11:27:45 +01002495 static const uint32_t kTwoByteMask = 0xffff;
2496 static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002497 if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002498 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002499 } else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002500 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002501 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002502 if (mask == kFourByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002503 }
2504 }
2505
2506 if (fall_through_on_failure) {
2507 if (need_mask) {
2508 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2509 } else {
2510 assembler->CheckCharacter(value, on_possible_success);
2511 }
2512 } else {
2513 if (need_mask) {
2514 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2515 } else {
2516 assembler->CheckNotCharacter(value, trace->backtrack());
2517 }
2518 }
2519 return true;
2520}
2521
2522
2523// Here is the meat of GetQuickCheckDetails (see also the comment on the
2524// super-class in the .h file).
2525//
2526// We iterate along the text object, building up for each character a
2527// mask and value that can be used to test for a quick failure to match.
2528// The masks and values for the positions will be combined into a single
2529// machine word for the current character width in order to be used in
2530// generating a quick check.
2531void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2532 RegExpCompiler* compiler,
2533 int characters_filled_in,
2534 bool not_at_start) {
2535 // Do not collect any quick check details if the text node reads backward,
2536 // since it reads in the opposite direction than we use for quick checks.
2537 if (read_backward()) return;
2538 Isolate* isolate = compiler->macro_assembler()->isolate();
2539 DCHECK(characters_filled_in < details->characters());
2540 int characters = details->characters();
2541 int char_mask;
2542 if (compiler->one_byte()) {
2543 char_mask = String::kMaxOneByteCharCode;
2544 } else {
2545 char_mask = String::kMaxUtf16CodeUnit;
2546 }
2547 for (int k = 0; k < elements()->length(); k++) {
2548 TextElement elm = elements()->at(k);
2549 if (elm.text_type() == TextElement::ATOM) {
2550 Vector<const uc16> quarks = elm.atom()->data();
2551 for (int i = 0; i < characters && i < quarks.length(); i++) {
2552 QuickCheckDetails::Position* pos =
2553 details->positions(characters_filled_in);
2554 uc16 c = quarks[i];
2555 if (compiler->ignore_case()) {
2556 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2557 int length = GetCaseIndependentLetters(isolate, c,
2558 compiler->one_byte(), chars);
2559 if (length == 0) {
2560 // This can happen because all case variants are non-Latin1, but we
2561 // know the input is Latin1.
2562 details->set_cannot_match();
2563 pos->determines_perfectly = false;
2564 return;
2565 }
2566 if (length == 1) {
2567 // This letter has no case equivalents, so it's nice and simple
2568 // and the mask-compare will determine definitely whether we have
2569 // a match at this character position.
2570 pos->mask = char_mask;
2571 pos->value = c;
2572 pos->determines_perfectly = true;
2573 } else {
2574 uint32_t common_bits = char_mask;
2575 uint32_t bits = chars[0];
2576 for (int j = 1; j < length; j++) {
2577 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2578 common_bits ^= differing_bits;
2579 bits &= common_bits;
2580 }
2581 // If length is 2 and common bits has only one zero in it then
2582 // our mask and compare instruction will determine definitely
2583 // whether we have a match at this character position. Otherwise
2584 // it can only be an approximate check.
2585 uint32_t one_zero = (common_bits | ~char_mask);
2586 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2587 pos->determines_perfectly = true;
2588 }
2589 pos->mask = common_bits;
2590 pos->value = bits;
2591 }
2592 } else {
2593 // Don't ignore case. Nice simple case where the mask-compare will
2594 // determine definitely whether we have a match at this character
2595 // position.
2596 if (c > char_mask) {
2597 details->set_cannot_match();
2598 pos->determines_perfectly = false;
2599 return;
2600 }
2601 pos->mask = char_mask;
2602 pos->value = c;
2603 pos->determines_perfectly = true;
2604 }
2605 characters_filled_in++;
2606 DCHECK(characters_filled_in <= details->characters());
2607 if (characters_filled_in == details->characters()) {
2608 return;
2609 }
2610 }
2611 } else {
2612 QuickCheckDetails::Position* pos =
2613 details->positions(characters_filled_in);
2614 RegExpCharacterClass* tree = elm.char_class();
2615 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2616 if (tree->is_negated()) {
2617 // A quick check uses multi-character mask and compare. There is no
2618 // useful way to incorporate a negative char class into this scheme
2619 // so we just conservatively create a mask and value that will always
2620 // succeed.
2621 pos->mask = 0;
2622 pos->value = 0;
2623 } else {
2624 int first_range = 0;
2625 while (ranges->at(first_range).from() > char_mask) {
2626 first_range++;
2627 if (first_range == ranges->length()) {
2628 details->set_cannot_match();
2629 pos->determines_perfectly = false;
2630 return;
2631 }
2632 }
2633 CharacterRange range = ranges->at(first_range);
2634 uc16 from = range.from();
2635 uc16 to = range.to();
2636 if (to > char_mask) {
2637 to = char_mask;
2638 }
2639 uint32_t differing_bits = (from ^ to);
2640 // A mask and compare is only perfect if the differing bits form a
2641 // number like 00011111 with one single block of trailing 1s.
2642 if ((differing_bits & (differing_bits + 1)) == 0 &&
2643 from + differing_bits == to) {
2644 pos->determines_perfectly = true;
2645 }
2646 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2647 uint32_t bits = (from & common_bits);
2648 for (int i = first_range + 1; i < ranges->length(); i++) {
2649 CharacterRange range = ranges->at(i);
2650 uc16 from = range.from();
2651 uc16 to = range.to();
2652 if (from > char_mask) continue;
2653 if (to > char_mask) to = char_mask;
2654 // Here we are combining more ranges into the mask and compare
2655 // value. With each new range the mask becomes more sparse and
2656 // so the chances of a false positive rise. A character class
2657 // with multiple ranges is assumed never to be equivalent to a
2658 // mask and compare operation.
2659 pos->determines_perfectly = false;
2660 uint32_t new_common_bits = (from ^ to);
2661 new_common_bits = ~SmearBitsRight(new_common_bits);
2662 common_bits &= new_common_bits;
2663 bits &= new_common_bits;
2664 uint32_t differing_bits = (from & common_bits) ^ bits;
2665 common_bits ^= differing_bits;
2666 bits &= common_bits;
2667 }
2668 pos->mask = common_bits;
2669 pos->value = bits;
2670 }
2671 characters_filled_in++;
2672 DCHECK(characters_filled_in <= details->characters());
2673 if (characters_filled_in == details->characters()) {
2674 return;
2675 }
2676 }
2677 }
2678 DCHECK(characters_filled_in != details->characters());
2679 if (!details->cannot_match()) {
2680 on_success()-> GetQuickCheckDetails(details,
2681 compiler,
2682 characters_filled_in,
2683 true);
2684 }
2685}
2686
2687
2688void QuickCheckDetails::Clear() {
2689 for (int i = 0; i < characters_; i++) {
2690 positions_[i].mask = 0;
2691 positions_[i].value = 0;
2692 positions_[i].determines_perfectly = false;
2693 }
2694 characters_ = 0;
2695}
2696
2697
2698void QuickCheckDetails::Advance(int by, bool one_byte) {
2699 if (by >= characters_ || by < 0) {
2700 DCHECK_IMPLIES(by < 0, characters_ == 0);
2701 Clear();
2702 return;
2703 }
2704 DCHECK_LE(characters_ - by, 4);
2705 DCHECK_LE(characters_, 4);
2706 for (int i = 0; i < characters_ - by; i++) {
2707 positions_[i] = positions_[by + i];
2708 }
2709 for (int i = characters_ - by; i < characters_; i++) {
2710 positions_[i].mask = 0;
2711 positions_[i].value = 0;
2712 positions_[i].determines_perfectly = false;
2713 }
2714 characters_ -= by;
2715 // We could change mask_ and value_ here but we would never advance unless
2716 // they had already been used in a check and they won't be used again because
2717 // it would gain us nothing. So there's no point.
2718}
2719
2720
2721void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2722 DCHECK(characters_ == other->characters_);
2723 if (other->cannot_match_) {
2724 return;
2725 }
2726 if (cannot_match_) {
2727 *this = *other;
2728 return;
2729 }
2730 for (int i = from_index; i < characters_; i++) {
2731 QuickCheckDetails::Position* pos = positions(i);
2732 QuickCheckDetails::Position* other_pos = other->positions(i);
2733 if (pos->mask != other_pos->mask ||
2734 pos->value != other_pos->value ||
2735 !other_pos->determines_perfectly) {
2736 // Our mask-compare operation will be approximate unless we have the
2737 // exact same operation on both sides of the alternation.
2738 pos->determines_perfectly = false;
2739 }
2740 pos->mask &= other_pos->mask;
2741 pos->value &= pos->mask;
2742 other_pos->value &= pos->mask;
2743 uc16 differing_bits = (pos->value ^ other_pos->value);
2744 pos->mask &= ~differing_bits;
2745 pos->value &= pos->mask;
2746 }
2747}
2748
2749
2750class VisitMarker {
2751 public:
2752 explicit VisitMarker(NodeInfo* info) : info_(info) {
2753 DCHECK(!info->visited);
2754 info->visited = true;
2755 }
2756 ~VisitMarker() {
2757 info_->visited = false;
2758 }
2759 private:
2760 NodeInfo* info_;
2761};
2762
2763
2764RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2765 if (info()->replacement_calculated) return replacement();
2766 if (depth < 0) return this;
2767 DCHECK(!info()->visited);
2768 VisitMarker marker(info());
2769 return FilterSuccessor(depth - 1, ignore_case);
2770}
2771
2772
2773RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2774 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2775 if (next == NULL) return set_replacement(NULL);
2776 on_success_ = next;
2777 return set_replacement(this);
2778}
2779
2780
2781// We need to check for the following characters: 0x39c 0x3bc 0x178.
2782static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2783 // TODO(dcarney): this could be a lot more efficient.
2784 return range.Contains(0x39c) ||
2785 range.Contains(0x3bc) || range.Contains(0x178);
2786}
2787
2788
2789static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2790 for (int i = 0; i < ranges->length(); i++) {
2791 // TODO(dcarney): this could be a lot more efficient.
2792 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2793 }
2794 return false;
2795}
2796
2797
2798RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2799 if (info()->replacement_calculated) return replacement();
2800 if (depth < 0) return this;
2801 DCHECK(!info()->visited);
2802 VisitMarker marker(info());
2803 int element_count = elements()->length();
2804 for (int i = 0; i < element_count; i++) {
2805 TextElement elm = elements()->at(i);
2806 if (elm.text_type() == TextElement::ATOM) {
2807 Vector<const uc16> quarks = elm.atom()->data();
2808 for (int j = 0; j < quarks.length(); j++) {
2809 uint16_t c = quarks[j];
2810 if (c <= String::kMaxOneByteCharCode) continue;
2811 if (!ignore_case) return set_replacement(NULL);
2812 // Here, we need to check for characters whose upper and lower cases
2813 // are outside the Latin-1 range.
2814 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2815 // Character is outside Latin-1 completely
2816 if (converted == 0) return set_replacement(NULL);
2817 // Convert quark to Latin-1 in place.
2818 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2819 copy[j] = converted;
2820 }
2821 } else {
2822 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2823 RegExpCharacterClass* cc = elm.char_class();
2824 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01002825 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002826 // Now they are in order so we only need to look at the first.
2827 int range_count = ranges->length();
2828 if (cc->is_negated()) {
2829 if (range_count != 0 &&
2830 ranges->at(0).from() == 0 &&
2831 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2832 // This will be handled in a later filter.
2833 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2834 return set_replacement(NULL);
2835 }
2836 } else {
2837 if (range_count == 0 ||
2838 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2839 // This will be handled in a later filter.
2840 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2841 return set_replacement(NULL);
2842 }
2843 }
2844 }
2845 }
2846 return FilterSuccessor(depth - 1, ignore_case);
2847}
2848
2849
2850RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2851 if (info()->replacement_calculated) return replacement();
2852 if (depth < 0) return this;
2853 if (info()->visited) return this;
2854 {
2855 VisitMarker marker(info());
2856
2857 RegExpNode* continue_replacement =
2858 continue_node_->FilterOneByte(depth - 1, ignore_case);
2859 // If we can't continue after the loop then there is no sense in doing the
2860 // loop.
2861 if (continue_replacement == NULL) return set_replacement(NULL);
2862 }
2863
2864 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2865}
2866
2867
2868RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2869 if (info()->replacement_calculated) return replacement();
2870 if (depth < 0) return this;
2871 if (info()->visited) return this;
2872 VisitMarker marker(info());
2873 int choice_count = alternatives_->length();
2874
2875 for (int i = 0; i < choice_count; i++) {
2876 GuardedAlternative alternative = alternatives_->at(i);
2877 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2878 set_replacement(this);
2879 return this;
2880 }
2881 }
2882
2883 int surviving = 0;
2884 RegExpNode* survivor = NULL;
2885 for (int i = 0; i < choice_count; i++) {
2886 GuardedAlternative alternative = alternatives_->at(i);
2887 RegExpNode* replacement =
2888 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2889 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2890 if (replacement != NULL) {
2891 alternatives_->at(i).set_node(replacement);
2892 surviving++;
2893 survivor = replacement;
2894 }
2895 }
2896 if (surviving < 2) return set_replacement(survivor);
2897
2898 set_replacement(this);
2899 if (surviving == choice_count) {
2900 return this;
2901 }
2902 // Only some of the nodes survived the filtering. We need to rebuild the
2903 // alternatives list.
2904 ZoneList<GuardedAlternative>* new_alternatives =
2905 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2906 for (int i = 0; i < choice_count; i++) {
2907 RegExpNode* replacement =
2908 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2909 if (replacement != NULL) {
2910 alternatives_->at(i).set_node(replacement);
2911 new_alternatives->Add(alternatives_->at(i), zone());
2912 }
2913 }
2914 alternatives_ = new_alternatives;
2915 return this;
2916}
2917
2918
2919RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2920 bool ignore_case) {
2921 if (info()->replacement_calculated) return replacement();
2922 if (depth < 0) return this;
2923 if (info()->visited) return this;
2924 VisitMarker marker(info());
2925 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2926 // afterwards.
2927 RegExpNode* node = alternatives_->at(1).node();
2928 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2929 if (replacement == NULL) return set_replacement(NULL);
2930 alternatives_->at(1).set_node(replacement);
2931
2932 RegExpNode* neg_node = alternatives_->at(0).node();
2933 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2934 // If the negative lookahead is always going to fail then
2935 // we don't need to check it.
2936 if (neg_replacement == NULL) return set_replacement(replacement);
2937 alternatives_->at(0).set_node(neg_replacement);
2938 return set_replacement(this);
2939}
2940
2941
2942void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2943 RegExpCompiler* compiler,
2944 int characters_filled_in,
2945 bool not_at_start) {
2946 if (body_can_be_zero_length_ || info()->visited) return;
2947 VisitMarker marker(info());
2948 return ChoiceNode::GetQuickCheckDetails(details,
2949 compiler,
2950 characters_filled_in,
2951 not_at_start);
2952}
2953
2954
2955void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2956 BoyerMooreLookahead* bm, bool not_at_start) {
2957 if (body_can_be_zero_length_ || budget <= 0) {
2958 bm->SetRest(offset);
2959 SaveBMInfo(bm, not_at_start, offset);
2960 return;
2961 }
2962 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2963 SaveBMInfo(bm, not_at_start, offset);
2964}
2965
2966
2967void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2968 RegExpCompiler* compiler,
2969 int characters_filled_in,
2970 bool not_at_start) {
2971 not_at_start = (not_at_start || not_at_start_);
2972 int choice_count = alternatives_->length();
2973 DCHECK(choice_count > 0);
2974 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2975 compiler,
2976 characters_filled_in,
2977 not_at_start);
2978 for (int i = 1; i < choice_count; i++) {
2979 QuickCheckDetails new_details(details->characters());
2980 RegExpNode* node = alternatives_->at(i).node();
2981 node->GetQuickCheckDetails(&new_details, compiler,
2982 characters_filled_in,
2983 not_at_start);
2984 // Here we merge the quick match details of the two branches.
2985 details->Merge(&new_details, characters_filled_in);
2986 }
2987}
2988
2989
2990// Check for [0-9A-Z_a-z].
2991static void EmitWordCheck(RegExpMacroAssembler* assembler,
2992 Label* word,
2993 Label* non_word,
2994 bool fall_through_on_word) {
2995 if (assembler->CheckSpecialCharacterClass(
2996 fall_through_on_word ? 'w' : 'W',
2997 fall_through_on_word ? non_word : word)) {
2998 // Optimized implementation available.
2999 return;
3000 }
3001 assembler->CheckCharacterGT('z', non_word);
3002 assembler->CheckCharacterLT('0', non_word);
3003 assembler->CheckCharacterGT('a' - 1, word);
3004 assembler->CheckCharacterLT('9' + 1, word);
3005 assembler->CheckCharacterLT('A', non_word);
3006 assembler->CheckCharacterLT('Z' + 1, word);
3007 if (fall_through_on_word) {
3008 assembler->CheckNotCharacter('_', non_word);
3009 } else {
3010 assembler->CheckCharacter('_', word);
3011 }
3012}
3013
3014
3015// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3016// that matches newline or the start of input).
3017static void EmitHat(RegExpCompiler* compiler,
3018 RegExpNode* on_success,
3019 Trace* trace) {
3020 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3021 // We will be loading the previous character into the current character
3022 // register.
3023 Trace new_trace(*trace);
3024 new_trace.InvalidateCurrentCharacter();
3025
3026 Label ok;
3027 if (new_trace.cp_offset() == 0) {
3028 // The start of input counts as a newline in this context, so skip to
3029 // ok if we are at the start.
3030 assembler->CheckAtStart(&ok);
3031 }
3032 // We already checked that we are not at the start of input so it must be
3033 // OK to load the previous character.
3034 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3035 new_trace.backtrack(),
3036 false);
3037 if (!assembler->CheckSpecialCharacterClass('n',
3038 new_trace.backtrack())) {
3039 // Newline means \n, \r, 0x2028 or 0x2029.
3040 if (!compiler->one_byte()) {
3041 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3042 }
3043 assembler->CheckCharacter('\n', &ok);
3044 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3045 }
3046 assembler->Bind(&ok);
3047 on_success->Emit(compiler, &new_trace);
3048}
3049
3050
3051// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3052void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3053 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3054 Isolate* isolate = assembler->isolate();
3055 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3056 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3057 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3058 if (lookahead == NULL) {
3059 int eats_at_least =
3060 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3061 kRecursionBudget,
3062 not_at_start));
3063 if (eats_at_least >= 1) {
3064 BoyerMooreLookahead* bm =
3065 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3066 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3067 if (bm->at(0)->is_non_word())
3068 next_is_word_character = Trace::FALSE_VALUE;
3069 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3070 }
3071 } else {
3072 if (lookahead->at(0)->is_non_word())
3073 next_is_word_character = Trace::FALSE_VALUE;
3074 if (lookahead->at(0)->is_word())
3075 next_is_word_character = Trace::TRUE_VALUE;
3076 }
3077 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3078 if (next_is_word_character == Trace::UNKNOWN) {
3079 Label before_non_word;
3080 Label before_word;
3081 if (trace->characters_preloaded() != 1) {
3082 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3083 }
3084 // Fall through on non-word.
3085 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3086 // Next character is not a word character.
3087 assembler->Bind(&before_non_word);
3088 Label ok;
3089 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3090 assembler->GoTo(&ok);
3091
3092 assembler->Bind(&before_word);
3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3094 assembler->Bind(&ok);
3095 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3096 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3097 } else {
3098 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3099 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3100 }
3101}
3102
3103
3104void AssertionNode::BacktrackIfPrevious(
3105 RegExpCompiler* compiler,
3106 Trace* trace,
3107 AssertionNode::IfPrevious backtrack_if_previous) {
3108 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3109 Trace new_trace(*trace);
3110 new_trace.InvalidateCurrentCharacter();
3111
3112 Label fall_through, dummy;
3113
3114 Label* non_word = backtrack_if_previous == kIsNonWord ?
3115 new_trace.backtrack() :
3116 &fall_through;
3117 Label* word = backtrack_if_previous == kIsNonWord ?
3118 &fall_through :
3119 new_trace.backtrack();
3120
3121 if (new_trace.cp_offset() == 0) {
3122 // The start of input counts as a non-word character, so the question is
3123 // decided if we are at the start.
3124 assembler->CheckAtStart(non_word);
3125 }
3126 // We already checked that we are not at the start of input so it must be
3127 // OK to load the previous character.
3128 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3129 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3130
3131 assembler->Bind(&fall_through);
3132 on_success()->Emit(compiler, &new_trace);
3133}
3134
3135
3136void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3137 RegExpCompiler* compiler,
3138 int filled_in,
3139 bool not_at_start) {
3140 if (assertion_type_ == AT_START && not_at_start) {
3141 details->set_cannot_match();
3142 return;
3143 }
3144 return on_success()->GetQuickCheckDetails(details,
3145 compiler,
3146 filled_in,
3147 not_at_start);
3148}
3149
3150
3151void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3152 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3153 switch (assertion_type_) {
3154 case AT_END: {
3155 Label ok;
3156 assembler->CheckPosition(trace->cp_offset(), &ok);
3157 assembler->GoTo(trace->backtrack());
3158 assembler->Bind(&ok);
3159 break;
3160 }
3161 case AT_START: {
3162 if (trace->at_start() == Trace::FALSE_VALUE) {
3163 assembler->GoTo(trace->backtrack());
3164 return;
3165 }
3166 if (trace->at_start() == Trace::UNKNOWN) {
3167 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3168 Trace at_start_trace = *trace;
3169 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3170 on_success()->Emit(compiler, &at_start_trace);
3171 return;
3172 }
3173 }
3174 break;
3175 case AFTER_NEWLINE:
3176 EmitHat(compiler, on_success(), trace);
3177 return;
3178 case AT_BOUNDARY:
3179 case AT_NON_BOUNDARY: {
3180 EmitBoundaryCheck(compiler, trace);
3181 return;
3182 }
3183 }
3184 on_success()->Emit(compiler, trace);
3185}
3186
3187
3188static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3189 if (quick_check == NULL) return false;
3190 if (offset >= quick_check->characters()) return false;
3191 return quick_check->positions(offset)->determines_perfectly;
3192}
3193
3194
3195static void UpdateBoundsCheck(int index, int* checked_up_to) {
3196 if (index > *checked_up_to) {
3197 *checked_up_to = index;
3198 }
3199}
3200
3201
3202// We call this repeatedly to generate code for each pass over the text node.
3203// The passes are in increasing order of difficulty because we hope one
3204// of the first passes will fail in which case we are saved the work of the
3205// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3206// we will check the '%' in the first pass, the case independent 'a' in the
3207// second pass and the character class in the last pass.
3208//
3209// The passes are done from right to left, so for example to test for /bar/
3210// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3211// and then a 'b' with offset 0. This means we can avoid the end-of-input
3212// bounds check most of the time. In the example we only need to check for
3213// end-of-input when loading the putative 'r'.
3214//
3215// A slight complication involves the fact that the first character may already
3216// be fetched into a register by the previous node. In this case we want to
3217// do the test for that character first. We do this in separate passes. The
3218// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3219// pass has been performed then subsequent passes will have true in
3220// first_element_checked to indicate that that character does not need to be
3221// checked again.
3222//
3223// In addition to all this we are passed a Trace, which can
3224// contain an AlternativeGeneration object. In this AlternativeGeneration
3225// object we can see details of any quick check that was already passed in
3226// order to get to the code we are now generating. The quick check can involve
3227// loading characters, which means we do not need to recheck the bounds
3228// up to the limit the quick check already checked. In addition the quick
3229// check can have involved a mask and compare operation which may simplify
3230// or obviate the need for further checks at some character positions.
3231void TextNode::TextEmitPass(RegExpCompiler* compiler,
3232 TextEmitPassType pass,
3233 bool preloaded,
3234 Trace* trace,
3235 bool first_element_checked,
3236 int* checked_up_to) {
3237 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3238 Isolate* isolate = assembler->isolate();
3239 bool one_byte = compiler->one_byte();
3240 Label* backtrack = trace->backtrack();
3241 QuickCheckDetails* quick_check = trace->quick_check_performed();
3242 int element_count = elements()->length();
3243 int backward_offset = read_backward() ? -Length() : 0;
3244 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3245 TextElement elm = elements()->at(i);
3246 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3247 if (elm.text_type() == TextElement::ATOM) {
3248 Vector<const uc16> quarks = elm.atom()->data();
3249 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3250 if (first_element_checked && i == 0 && j == 0) continue;
3251 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3252 EmitCharacterFunction* emit_function = NULL;
3253 switch (pass) {
3254 case NON_LATIN1_MATCH:
3255 DCHECK(one_byte);
3256 if (quarks[j] > String::kMaxOneByteCharCode) {
3257 assembler->GoTo(backtrack);
3258 return;
3259 }
3260 break;
3261 case NON_LETTER_CHARACTER_MATCH:
3262 emit_function = &EmitAtomNonLetter;
3263 break;
3264 case SIMPLE_CHARACTER_MATCH:
3265 emit_function = &EmitSimpleCharacter;
3266 break;
3267 case CASE_CHARACTER_MATCH:
3268 emit_function = &EmitAtomLetter;
3269 break;
3270 default:
3271 break;
3272 }
3273 if (emit_function != NULL) {
3274 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3275 bool bound_checked =
3276 emit_function(isolate, compiler, quarks[j], backtrack,
3277 cp_offset + j, bounds_check, preloaded);
3278 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3279 }
3280 }
3281 } else {
3282 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3283 if (pass == CHARACTER_CLASS_MATCH) {
3284 if (first_element_checked && i == 0) continue;
3285 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3286 RegExpCharacterClass* cc = elm.char_class();
3287 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3288 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3289 bounds_check, preloaded, zone());
3290 UpdateBoundsCheck(cp_offset, checked_up_to);
3291 }
3292 }
3293 }
3294}
3295
3296
3297int TextNode::Length() {
3298 TextElement elm = elements()->last();
3299 DCHECK(elm.cp_offset() >= 0);
3300 return elm.cp_offset() + elm.length();
3301}
3302
3303
3304bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3305 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3306 if (ignore_case) {
3307 return pass == SIMPLE_CHARACTER_MATCH;
3308 } else {
3309 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3310 }
3311}
3312
3313
Ben Murdoch097c5b22016-05-18 11:27:45 +01003314TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3315 ZoneList<CharacterRange>* ranges,
3316 bool read_backward,
3317 RegExpNode* on_success) {
3318 DCHECK_NOT_NULL(ranges);
3319 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3320 elms->Add(
3321 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3322 zone);
3323 return new (zone) TextNode(elms, read_backward, on_success);
3324}
3325
3326
3327TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3328 CharacterRange trail,
3329 bool read_backward,
3330 RegExpNode* on_success) {
3331 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3332 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3333 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3334 elms->Add(TextElement::CharClass(
3335 new (zone) RegExpCharacterClass(lead_ranges, false)),
3336 zone);
3337 elms->Add(TextElement::CharClass(
3338 new (zone) RegExpCharacterClass(trail_ranges, false)),
3339 zone);
3340 return new (zone) TextNode(elms, read_backward, on_success);
3341}
3342
3343
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003344// This generates the code to match a text node. A text node can contain
3345// straight character sequences (possibly to be matched in a case-independent
3346// way) and character classes. For efficiency we do not do this in a single
3347// pass from left to right. Instead we pass over the text node several times,
3348// emitting code for some character positions every time. See the comment on
3349// TextEmitPass for details.
3350void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3351 LimitResult limit_result = LimitVersions(compiler, trace);
3352 if (limit_result == DONE) return;
3353 DCHECK(limit_result == CONTINUE);
3354
3355 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3356 compiler->SetRegExpTooBig();
3357 return;
3358 }
3359
3360 if (compiler->one_byte()) {
3361 int dummy = 0;
3362 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3363 }
3364
3365 bool first_elt_done = false;
3366 int bound_checked_to = trace->cp_offset() - 1;
3367 bound_checked_to += trace->bound_checked_up_to();
3368
3369 // If a character is preloaded into the current character register then
3370 // check that now.
3371 if (trace->characters_preloaded() == 1) {
3372 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3373 if (!SkipPass(pass, compiler->ignore_case())) {
3374 TextEmitPass(compiler,
3375 static_cast<TextEmitPassType>(pass),
3376 true,
3377 trace,
3378 false,
3379 &bound_checked_to);
3380 }
3381 }
3382 first_elt_done = true;
3383 }
3384
3385 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3386 if (!SkipPass(pass, compiler->ignore_case())) {
3387 TextEmitPass(compiler,
3388 static_cast<TextEmitPassType>(pass),
3389 false,
3390 trace,
3391 first_elt_done,
3392 &bound_checked_to);
3393 }
3394 }
3395
3396 Trace successor_trace(*trace);
3397 // If we advance backward, we may end up at the start.
3398 successor_trace.AdvanceCurrentPositionInTrace(
3399 read_backward() ? -Length() : Length(), compiler);
3400 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3401 : Trace::FALSE_VALUE);
3402 RecursionCheck rc(compiler);
3403 on_success()->Emit(compiler, &successor_trace);
3404}
3405
3406
3407void Trace::InvalidateCurrentCharacter() {
3408 characters_preloaded_ = 0;
3409}
3410
3411
3412void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3413 // We don't have an instruction for shifting the current character register
3414 // down or for using a shifted value for anything so lets just forget that
3415 // we preloaded any characters into it.
3416 characters_preloaded_ = 0;
3417 // Adjust the offsets of the quick check performed information. This
3418 // information is used to find out what we already determined about the
3419 // characters by means of mask and compare.
3420 quick_check_performed_.Advance(by, compiler->one_byte());
3421 cp_offset_ += by;
3422 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3423 compiler->SetRegExpTooBig();
3424 cp_offset_ = 0;
3425 }
3426 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3427}
3428
3429
3430void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3431 int element_count = elements()->length();
3432 for (int i = 0; i < element_count; i++) {
3433 TextElement elm = elements()->at(i);
3434 if (elm.text_type() == TextElement::CHAR_CLASS) {
3435 RegExpCharacterClass* cc = elm.char_class();
3436 // None of the standard character classes is different in the case
3437 // independent case and it slows us down if we don't know that.
3438 if (cc->is_standard(zone())) continue;
3439 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003440 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003441 }
3442 }
3443}
3444
3445
3446int TextNode::GreedyLoopTextLength() { return Length(); }
3447
3448
3449RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3450 RegExpCompiler* compiler) {
3451 if (read_backward()) return NULL;
3452 if (elements()->length() != 1) return NULL;
3453 TextElement elm = elements()->at(0);
3454 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3455 RegExpCharacterClass* node = elm.char_class();
3456 ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003457 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003458 if (node->is_negated()) {
3459 return ranges->length() == 0 ? on_success() : NULL;
3460 }
3461 if (ranges->length() != 1) return NULL;
3462 uint32_t max_char;
3463 if (compiler->one_byte()) {
3464 max_char = String::kMaxOneByteCharCode;
3465 } else {
3466 max_char = String::kMaxUtf16CodeUnit;
3467 }
3468 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3469}
3470
3471
3472// Finds the fixed match length of a sequence of nodes that goes from
3473// this alternative and back to this choice node. If there are variable
3474// length nodes or other complications in the way then return a sentinel
3475// value indicating that a greedy loop cannot be constructed.
3476int ChoiceNode::GreedyLoopTextLengthForAlternative(
3477 GuardedAlternative* alternative) {
3478 int length = 0;
3479 RegExpNode* node = alternative->node();
3480 // Later we will generate code for all these text nodes using recursion
3481 // so we have to limit the max number.
3482 int recursion_depth = 0;
3483 while (node != this) {
3484 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3485 return kNodeIsTooComplexForGreedyLoops;
3486 }
3487 int node_length = node->GreedyLoopTextLength();
3488 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3489 return kNodeIsTooComplexForGreedyLoops;
3490 }
3491 length += node_length;
3492 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3493 node = seq_node->on_success();
3494 }
3495 return read_backward() ? -length : length;
3496}
3497
3498
3499void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3500 DCHECK_NULL(loop_node_);
3501 AddAlternative(alt);
3502 loop_node_ = alt.node();
3503}
3504
3505
3506void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3507 DCHECK_NULL(continue_node_);
3508 AddAlternative(alt);
3509 continue_node_ = alt.node();
3510}
3511
3512
3513void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3514 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3515 if (trace->stop_node() == this) {
3516 // Back edge of greedy optimized loop node graph.
3517 int text_length =
3518 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3519 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3520 // Update the counter-based backtracking info on the stack. This is an
3521 // optimization for greedy loops (see below).
3522 DCHECK(trace->cp_offset() == text_length);
3523 macro_assembler->AdvanceCurrentPosition(text_length);
3524 macro_assembler->GoTo(trace->loop_label());
3525 return;
3526 }
3527 DCHECK_NULL(trace->stop_node());
3528 if (!trace->is_trivial()) {
3529 trace->Flush(compiler, this);
3530 return;
3531 }
3532 ChoiceNode::Emit(compiler, trace);
3533}
3534
3535
3536int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3537 int eats_at_least) {
3538 int preload_characters = Min(4, eats_at_least);
3539 if (compiler->macro_assembler()->CanReadUnaligned()) {
3540 bool one_byte = compiler->one_byte();
3541 if (one_byte) {
3542 if (preload_characters > 4) preload_characters = 4;
3543 // We can't preload 3 characters because there is no machine instruction
3544 // to do that. We can't just load 4 because we could be reading
3545 // beyond the end of the string, which could cause a memory fault.
3546 if (preload_characters == 3) preload_characters = 2;
3547 } else {
3548 if (preload_characters > 2) preload_characters = 2;
3549 }
3550 } else {
3551 if (preload_characters > 1) preload_characters = 1;
3552 }
3553 return preload_characters;
3554}
3555
3556
3557// This class is used when generating the alternatives in a choice node. It
3558// records the way the alternative is being code generated.
3559class AlternativeGeneration: public Malloced {
3560 public:
3561 AlternativeGeneration()
3562 : possible_success(),
3563 expects_preload(false),
3564 after(),
3565 quick_check_details() { }
3566 Label possible_success;
3567 bool expects_preload;
3568 Label after;
3569 QuickCheckDetails quick_check_details;
3570};
3571
3572
3573// Creates a list of AlternativeGenerations. If the list has a reasonable
3574// size then it is on the stack, otherwise the excess is on the heap.
3575class AlternativeGenerationList {
3576 public:
3577 AlternativeGenerationList(int count, Zone* zone)
3578 : alt_gens_(count, zone) {
3579 for (int i = 0; i < count && i < kAFew; i++) {
3580 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3581 }
3582 for (int i = kAFew; i < count; i++) {
3583 alt_gens_.Add(new AlternativeGeneration(), zone);
3584 }
3585 }
3586 ~AlternativeGenerationList() {
3587 for (int i = kAFew; i < alt_gens_.length(); i++) {
3588 delete alt_gens_[i];
3589 alt_gens_[i] = NULL;
3590 }
3591 }
3592
3593 AlternativeGeneration* at(int i) {
3594 return alt_gens_[i];
3595 }
3596
3597 private:
3598 static const int kAFew = 10;
3599 ZoneList<AlternativeGeneration*> alt_gens_;
3600 AlternativeGeneration a_few_alt_gens_[kAFew];
3601};
3602
3603
Ben Murdoch097c5b22016-05-18 11:27:45 +01003604static const uc32 kRangeEndMarker = 0x110000;
3605
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003606// The '2' variant is has inclusive from and exclusive to.
3607// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3608// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch097c5b22016-05-18 11:27:45 +01003609static const int kSpaceRanges[] = {
3610 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3611 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3612 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003613static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3614
3615static const int kWordRanges[] = {
Ben Murdoch097c5b22016-05-18 11:27:45 +01003616 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003617static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003618static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003619static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003620static const int kSurrogateRanges[] = {
3621 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003622static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003623static const int kLineTerminatorRanges[] = {
3624 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003625static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3626
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003627void BoyerMoorePositionInfo::Set(int character) {
3628 SetInterval(Interval(character, character));
3629}
3630
3631
3632void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3633 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3634 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3635 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3636 surrogate_ =
3637 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3638 if (interval.to() - interval.from() >= kMapSize - 1) {
3639 if (map_count_ != kMapSize) {
3640 map_count_ = kMapSize;
3641 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3642 }
3643 return;
3644 }
3645 for (int i = interval.from(); i <= interval.to(); i++) {
3646 int mod_character = (i & kMask);
3647 if (!map_->at(mod_character)) {
3648 map_count_++;
3649 map_->at(mod_character) = true;
3650 }
3651 if (map_count_ == kMapSize) return;
3652 }
3653}
3654
3655
3656void BoyerMoorePositionInfo::SetAll() {
3657 s_ = w_ = d_ = kLatticeUnknown;
3658 if (map_count_ != kMapSize) {
3659 map_count_ = kMapSize;
3660 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3661 }
3662}
3663
3664
3665BoyerMooreLookahead::BoyerMooreLookahead(
3666 int length, RegExpCompiler* compiler, Zone* zone)
3667 : length_(length),
3668 compiler_(compiler) {
3669 if (compiler->one_byte()) {
3670 max_char_ = String::kMaxOneByteCharCode;
3671 } else {
3672 max_char_ = String::kMaxUtf16CodeUnit;
3673 }
3674 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3675 for (int i = 0; i < length; i++) {
3676 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3677 }
3678}
3679
3680
3681// Find the longest range of lookahead that has the fewest number of different
3682// characters that can occur at a given position. Since we are optimizing two
3683// different parameters at once this is a tradeoff.
3684bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3685 int biggest_points = 0;
3686 // If more than 32 characters out of 128 can occur it is unlikely that we can
3687 // be lucky enough to step forwards much of the time.
3688 const int kMaxMax = 32;
3689 for (int max_number_of_chars = 4;
3690 max_number_of_chars < kMaxMax;
3691 max_number_of_chars *= 2) {
3692 biggest_points =
3693 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3694 }
3695 if (biggest_points == 0) return false;
3696 return true;
3697}
3698
3699
3700// Find the highest-points range between 0 and length_ where the character
3701// information is not too vague. 'Too vague' means that there are more than
3702// max_number_of_chars that can occur at this position. Calculates the number
3703// of points as the product of width-of-the-range and
3704// probability-of-finding-one-of-the-characters, where the probability is
3705// calculated using the frequency distribution of the sample subject string.
3706int BoyerMooreLookahead::FindBestInterval(
3707 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3708 int biggest_points = old_biggest_points;
3709 static const int kSize = RegExpMacroAssembler::kTableSize;
3710 for (int i = 0; i < length_; ) {
3711 while (i < length_ && Count(i) > max_number_of_chars) i++;
3712 if (i == length_) break;
3713 int remembered_from = i;
3714 bool union_map[kSize];
3715 for (int j = 0; j < kSize; j++) union_map[j] = false;
3716 while (i < length_ && Count(i) <= max_number_of_chars) {
3717 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3718 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3719 i++;
3720 }
3721 int frequency = 0;
3722 for (int j = 0; j < kSize; j++) {
3723 if (union_map[j]) {
3724 // Add 1 to the frequency to give a small per-character boost for
3725 // the cases where our sampling is not good enough and many
3726 // characters have a frequency of zero. This means the frequency
3727 // can theoretically be up to 2*kSize though we treat it mostly as
3728 // a fraction of kSize.
3729 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3730 }
3731 }
3732 // We use the probability of skipping times the distance we are skipping to
3733 // judge the effectiveness of this. Actually we have a cut-off: By
3734 // dividing by 2 we switch off the skipping if the probability of skipping
3735 // is less than 50%. This is because the multibyte mask-and-compare
3736 // skipping in quickcheck is more likely to do well on this case.
3737 bool in_quickcheck_range =
3738 ((i - remembered_from < 4) ||
3739 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3740 // Called 'probability' but it is only a rough estimate and can actually
3741 // be outside the 0-kSize range.
3742 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3743 int points = (i - remembered_from) * probability;
3744 if (points > biggest_points) {
3745 *from = remembered_from;
3746 *to = i - 1;
3747 biggest_points = points;
3748 }
3749 }
3750 return biggest_points;
3751}
3752
3753
3754// Take all the characters that will not prevent a successful match if they
3755// occur in the subject string in the range between min_lookahead and
3756// max_lookahead (inclusive) measured from the current position. If the
3757// character at max_lookahead offset is not one of these characters, then we
3758// can safely skip forwards by the number of characters in the range.
3759int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3760 int max_lookahead,
3761 Handle<ByteArray> boolean_skip_table) {
3762 const int kSize = RegExpMacroAssembler::kTableSize;
3763
3764 const int kSkipArrayEntry = 0;
3765 const int kDontSkipArrayEntry = 1;
3766
3767 for (int i = 0; i < kSize; i++) {
3768 boolean_skip_table->set(i, kSkipArrayEntry);
3769 }
3770 int skip = max_lookahead + 1 - min_lookahead;
3771
3772 for (int i = max_lookahead; i >= min_lookahead; i--) {
3773 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3774 for (int j = 0; j < kSize; j++) {
3775 if (map->at(j)) {
3776 boolean_skip_table->set(j, kDontSkipArrayEntry);
3777 }
3778 }
3779 }
3780
3781 return skip;
3782}
3783
3784
3785// See comment above on the implementation of GetSkipTable.
3786void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3787 const int kSize = RegExpMacroAssembler::kTableSize;
3788
3789 int min_lookahead = 0;
3790 int max_lookahead = 0;
3791
3792 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3793
3794 bool found_single_character = false;
3795 int single_character = 0;
3796 for (int i = max_lookahead; i >= min_lookahead; i--) {
3797 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3798 if (map->map_count() > 1 ||
3799 (found_single_character && map->map_count() != 0)) {
3800 found_single_character = false;
3801 break;
3802 }
3803 for (int j = 0; j < kSize; j++) {
3804 if (map->at(j)) {
3805 found_single_character = true;
3806 single_character = j;
3807 break;
3808 }
3809 }
3810 }
3811
3812 int lookahead_width = max_lookahead + 1 - min_lookahead;
3813
3814 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3815 // The mask-compare can probably handle this better.
3816 return;
3817 }
3818
3819 if (found_single_character) {
3820 Label cont, again;
3821 masm->Bind(&again);
3822 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3823 if (max_char_ > kSize) {
3824 masm->CheckCharacterAfterAnd(single_character,
3825 RegExpMacroAssembler::kTableMask,
3826 &cont);
3827 } else {
3828 masm->CheckCharacter(single_character, &cont);
3829 }
3830 masm->AdvanceCurrentPosition(lookahead_width);
3831 masm->GoTo(&again);
3832 masm->Bind(&cont);
3833 return;
3834 }
3835
3836 Factory* factory = masm->isolate()->factory();
3837 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3838 int skip_distance = GetSkipTable(
3839 min_lookahead, max_lookahead, boolean_skip_table);
3840 DCHECK(skip_distance != 0);
3841
3842 Label cont, again;
3843 masm->Bind(&again);
3844 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3845 masm->CheckBitInTable(boolean_skip_table, &cont);
3846 masm->AdvanceCurrentPosition(skip_distance);
3847 masm->GoTo(&again);
3848 masm->Bind(&cont);
3849}
3850
3851
3852/* Code generation for choice nodes.
3853 *
3854 * We generate quick checks that do a mask and compare to eliminate a
3855 * choice. If the quick check succeeds then it jumps to the continuation to
3856 * do slow checks and check subsequent nodes. If it fails (the common case)
3857 * it falls through to the next choice.
3858 *
3859 * Here is the desired flow graph. Nodes directly below each other imply
3860 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3861 * 3 doesn't have a quick check so we have to call the slow check.
3862 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3863 * regexp continuation is generated directly after the Sn node, up to the
3864 * next GoTo if we decide to reuse some already generated code. Some
3865 * nodes expect preload_characters to be preloaded into the current
3866 * character register. R nodes do this preloading. Vertices are marked
3867 * F for failures and S for success (possible success in the case of quick
3868 * nodes). L, V, < and > are used as arrow heads.
3869 *
3870 * ----------> R
3871 * |
3872 * V
3873 * Q1 -----> S1
3874 * | S /
3875 * F| /
3876 * | F/
3877 * | /
3878 * | R
3879 * | /
3880 * V L
3881 * Q2 -----> S2
3882 * | S /
3883 * F| /
3884 * | F/
3885 * | /
3886 * | R
3887 * | /
3888 * V L
3889 * S3
3890 * |
3891 * F|
3892 * |
3893 * R
3894 * |
3895 * backtrack V
3896 * <----------Q4
3897 * \ F |
3898 * \ |S
3899 * \ F V
3900 * \-----S4
3901 *
3902 * For greedy loops we push the current position, then generate the code that
3903 * eats the input specially in EmitGreedyLoop. The other choice (the
3904 * continuation) is generated by the normal code in EmitChoices, and steps back
3905 * in the input to the starting position when it fails to match. The loop code
3906 * looks like this (U is the unwind code that steps back in the greedy loop).
3907 *
3908 * _____
3909 * / \
3910 * V |
3911 * ----------> S1 |
3912 * /| |
3913 * / |S |
3914 * F/ \_____/
3915 * /
3916 * |<-----
3917 * | \
3918 * V |S
3919 * Q2 ---> U----->backtrack
3920 * | F /
3921 * S| /
3922 * V F /
3923 * S2--/
3924 */
3925
3926GreedyLoopState::GreedyLoopState(bool not_at_start) {
3927 counter_backtrack_trace_.set_backtrack(&label_);
3928 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3929}
3930
3931
3932void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3933#ifdef DEBUG
3934 int choice_count = alternatives_->length();
3935 for (int i = 0; i < choice_count - 1; i++) {
3936 GuardedAlternative alternative = alternatives_->at(i);
3937 ZoneList<Guard*>* guards = alternative.guards();
3938 int guard_count = (guards == NULL) ? 0 : guards->length();
3939 for (int j = 0; j < guard_count; j++) {
3940 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3941 }
3942 }
3943#endif
3944}
3945
3946
3947void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3948 Trace* current_trace,
3949 PreloadState* state) {
3950 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3951 // Save some time by looking at most one machine word ahead.
3952 state->eats_at_least_ =
3953 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3954 current_trace->at_start() == Trace::FALSE_VALUE);
3955 }
3956 state->preload_characters_ =
3957 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3958
3959 state->preload_is_current_ =
3960 (current_trace->characters_preloaded() == state->preload_characters_);
3961 state->preload_has_checked_bounds_ = state->preload_is_current_;
3962}
3963
3964
3965void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3966 int choice_count = alternatives_->length();
3967
Ben Murdoch097c5b22016-05-18 11:27:45 +01003968 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3969 alternatives_->at(0).node()->Emit(compiler, trace);
3970 return;
3971 }
3972
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003973 AssertGuardsMentionRegisters(trace);
3974
3975 LimitResult limit_result = LimitVersions(compiler, trace);
3976 if (limit_result == DONE) return;
3977 DCHECK(limit_result == CONTINUE);
3978
3979 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3980 // other choice nodes we only flush if we are out of code size budget.
3981 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3982 trace->Flush(compiler, this);
3983 return;
3984 }
3985
3986 RecursionCheck rc(compiler);
3987
3988 PreloadState preload;
3989 preload.init();
3990 GreedyLoopState greedy_loop_state(not_at_start());
3991
3992 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3993 AlternativeGenerationList alt_gens(choice_count, zone());
3994
3995 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3996 trace = EmitGreedyLoop(compiler,
3997 trace,
3998 &alt_gens,
3999 &preload,
4000 &greedy_loop_state,
4001 text_length);
4002 } else {
4003 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4004 // match the traces produced pre-cleanup.
4005 Label second_choice;
4006 compiler->macro_assembler()->Bind(&second_choice);
4007
4008 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4009
4010 EmitChoices(compiler,
4011 &alt_gens,
4012 0,
4013 trace,
4014 &preload);
4015 }
4016
4017 // At this point we need to generate slow checks for the alternatives where
4018 // the quick check was inlined. We can recognize these because the associated
4019 // label was bound.
4020 int new_flush_budget = trace->flush_budget() / choice_count;
4021 for (int i = 0; i < choice_count; i++) {
4022 AlternativeGeneration* alt_gen = alt_gens.at(i);
4023 Trace new_trace(*trace);
4024 // If there are actions to be flushed we have to limit how many times
4025 // they are flushed. Take the budget of the parent trace and distribute
4026 // it fairly amongst the children.
4027 if (new_trace.actions() != NULL) {
4028 new_trace.set_flush_budget(new_flush_budget);
4029 }
4030 bool next_expects_preload =
4031 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4032 EmitOutOfLineContinuation(compiler,
4033 &new_trace,
4034 alternatives_->at(i),
4035 alt_gen,
4036 preload.preload_characters_,
4037 next_expects_preload);
4038 }
4039}
4040
4041
4042Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4043 Trace* trace,
4044 AlternativeGenerationList* alt_gens,
4045 PreloadState* preload,
4046 GreedyLoopState* greedy_loop_state,
4047 int text_length) {
4048 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4049 // Here we have special handling for greedy loops containing only text nodes
4050 // and other simple nodes. These are handled by pushing the current
4051 // position on the stack and then incrementing the current position each
4052 // time around the switch. On backtrack we decrement the current position
4053 // and check it against the pushed value. This avoids pushing backtrack
4054 // information for each iteration of the loop, which could take up a lot of
4055 // space.
4056 DCHECK(trace->stop_node() == NULL);
4057 macro_assembler->PushCurrentPosition();
4058 Label greedy_match_failed;
4059 Trace greedy_match_trace;
4060 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4061 greedy_match_trace.set_backtrack(&greedy_match_failed);
4062 Label loop_label;
4063 macro_assembler->Bind(&loop_label);
4064 greedy_match_trace.set_stop_node(this);
4065 greedy_match_trace.set_loop_label(&loop_label);
4066 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4067 macro_assembler->Bind(&greedy_match_failed);
4068
4069 Label second_choice; // For use in greedy matches.
4070 macro_assembler->Bind(&second_choice);
4071
4072 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4073
4074 EmitChoices(compiler,
4075 alt_gens,
4076 1,
4077 new_trace,
4078 preload);
4079
4080 macro_assembler->Bind(greedy_loop_state->label());
4081 // If we have unwound to the bottom then backtrack.
4082 macro_assembler->CheckGreedyLoop(trace->backtrack());
4083 // Otherwise try the second priority at an earlier position.
4084 macro_assembler->AdvanceCurrentPosition(-text_length);
4085 macro_assembler->GoTo(&second_choice);
4086 return new_trace;
4087}
4088
4089int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4090 Trace* trace) {
4091 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4092 if (alternatives_->length() != 2) return eats_at_least;
4093
4094 GuardedAlternative alt1 = alternatives_->at(1);
4095 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4096 return eats_at_least;
4097 }
4098 RegExpNode* eats_anything_node = alt1.node();
4099 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4100 return eats_at_least;
4101 }
4102
4103 // Really we should be creating a new trace when we execute this function,
4104 // but there is no need, because the code it generates cannot backtrack, and
4105 // we always arrive here with a trivial trace (since it's the entry to a
4106 // loop. That also implies that there are no preloaded characters, which is
4107 // good, because it means we won't be violating any assumptions by
4108 // overwriting those characters with new load instructions.
4109 DCHECK(trace->is_trivial());
4110
4111 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4112 Isolate* isolate = macro_assembler->isolate();
4113 // At this point we know that we are at a non-greedy loop that will eat
4114 // any character one at a time. Any non-anchored regexp has such a
4115 // loop prepended to it in order to find where it starts. We look for
4116 // a pattern of the form ...abc... where we can look 6 characters ahead
4117 // and step forwards 3 if the character is not one of abc. Abc need
4118 // not be atoms, they can be any reasonably limited character class or
4119 // small alternation.
4120 BoyerMooreLookahead* bm = bm_info(false);
4121 if (bm == NULL) {
4122 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4123 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4124 kRecursionBudget,
4125 false));
4126 if (eats_at_least >= 1) {
4127 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4128 compiler,
4129 zone());
4130 GuardedAlternative alt0 = alternatives_->at(0);
4131 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4132 }
4133 }
4134 if (bm != NULL) {
4135 bm->EmitSkipInstructions(macro_assembler);
4136 }
4137 return eats_at_least;
4138}
4139
4140
4141void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4142 AlternativeGenerationList* alt_gens,
4143 int first_choice,
4144 Trace* trace,
4145 PreloadState* preload) {
4146 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4147 SetUpPreLoad(compiler, trace, preload);
4148
4149 // For now we just call all choices one after the other. The idea ultimately
4150 // is to use the Dispatch table to try only the relevant ones.
4151 int choice_count = alternatives_->length();
4152
4153 int new_flush_budget = trace->flush_budget() / choice_count;
4154
4155 for (int i = first_choice; i < choice_count; i++) {
4156 bool is_last = i == choice_count - 1;
4157 bool fall_through_on_failure = !is_last;
4158 GuardedAlternative alternative = alternatives_->at(i);
4159 AlternativeGeneration* alt_gen = alt_gens->at(i);
4160 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4161 ZoneList<Guard*>* guards = alternative.guards();
4162 int guard_count = (guards == NULL) ? 0 : guards->length();
4163 Trace new_trace(*trace);
4164 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4165 preload->preload_characters_ :
4166 0);
4167 if (preload->preload_has_checked_bounds_) {
4168 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4169 }
4170 new_trace.quick_check_performed()->Clear();
4171 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4172 if (!is_last) {
4173 new_trace.set_backtrack(&alt_gen->after);
4174 }
4175 alt_gen->expects_preload = preload->preload_is_current_;
4176 bool generate_full_check_inline = false;
4177 if (compiler->optimize() &&
4178 try_to_emit_quick_check_for_alternative(i == 0) &&
4179 alternative.node()->EmitQuickCheck(
4180 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4181 &alt_gen->possible_success, &alt_gen->quick_check_details,
4182 fall_through_on_failure)) {
4183 // Quick check was generated for this choice.
4184 preload->preload_is_current_ = true;
4185 preload->preload_has_checked_bounds_ = true;
4186 // If we generated the quick check to fall through on possible success,
4187 // we now need to generate the full check inline.
4188 if (!fall_through_on_failure) {
4189 macro_assembler->Bind(&alt_gen->possible_success);
4190 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4191 new_trace.set_characters_preloaded(preload->preload_characters_);
4192 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4193 generate_full_check_inline = true;
4194 }
4195 } else if (alt_gen->quick_check_details.cannot_match()) {
4196 if (!fall_through_on_failure) {
4197 macro_assembler->GoTo(trace->backtrack());
4198 }
4199 continue;
4200 } else {
4201 // No quick check was generated. Put the full code here.
4202 // If this is not the first choice then there could be slow checks from
4203 // previous cases that go here when they fail. There's no reason to
4204 // insist that they preload characters since the slow check we are about
4205 // to generate probably can't use it.
4206 if (i != first_choice) {
4207 alt_gen->expects_preload = false;
4208 new_trace.InvalidateCurrentCharacter();
4209 }
4210 generate_full_check_inline = true;
4211 }
4212 if (generate_full_check_inline) {
4213 if (new_trace.actions() != NULL) {
4214 new_trace.set_flush_budget(new_flush_budget);
4215 }
4216 for (int j = 0; j < guard_count; j++) {
4217 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4218 }
4219 alternative.node()->Emit(compiler, &new_trace);
4220 preload->preload_is_current_ = false;
4221 }
4222 macro_assembler->Bind(&alt_gen->after);
4223 }
4224}
4225
4226
4227void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4228 Trace* trace,
4229 GuardedAlternative alternative,
4230 AlternativeGeneration* alt_gen,
4231 int preload_characters,
4232 bool next_expects_preload) {
4233 if (!alt_gen->possible_success.is_linked()) return;
4234
4235 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4236 macro_assembler->Bind(&alt_gen->possible_success);
4237 Trace out_of_line_trace(*trace);
4238 out_of_line_trace.set_characters_preloaded(preload_characters);
4239 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4240 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4241 ZoneList<Guard*>* guards = alternative.guards();
4242 int guard_count = (guards == NULL) ? 0 : guards->length();
4243 if (next_expects_preload) {
4244 Label reload_current_char;
4245 out_of_line_trace.set_backtrack(&reload_current_char);
4246 for (int j = 0; j < guard_count; j++) {
4247 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4248 }
4249 alternative.node()->Emit(compiler, &out_of_line_trace);
4250 macro_assembler->Bind(&reload_current_char);
4251 // Reload the current character, since the next quick check expects that.
4252 // We don't need to check bounds here because we only get into this
4253 // code through a quick check which already did the checked load.
4254 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4255 NULL,
4256 false,
4257 preload_characters);
4258 macro_assembler->GoTo(&(alt_gen->after));
4259 } else {
4260 out_of_line_trace.set_backtrack(&(alt_gen->after));
4261 for (int j = 0; j < guard_count; j++) {
4262 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4263 }
4264 alternative.node()->Emit(compiler, &out_of_line_trace);
4265 }
4266}
4267
4268
4269void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4270 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4271 LimitResult limit_result = LimitVersions(compiler, trace);
4272 if (limit_result == DONE) return;
4273 DCHECK(limit_result == CONTINUE);
4274
4275 RecursionCheck rc(compiler);
4276
4277 switch (action_type_) {
4278 case STORE_POSITION: {
4279 Trace::DeferredCapture
4280 new_capture(data_.u_position_register.reg,
4281 data_.u_position_register.is_capture,
4282 trace);
4283 Trace new_trace = *trace;
4284 new_trace.add_action(&new_capture);
4285 on_success()->Emit(compiler, &new_trace);
4286 break;
4287 }
4288 case INCREMENT_REGISTER: {
4289 Trace::DeferredIncrementRegister
4290 new_increment(data_.u_increment_register.reg);
4291 Trace new_trace = *trace;
4292 new_trace.add_action(&new_increment);
4293 on_success()->Emit(compiler, &new_trace);
4294 break;
4295 }
4296 case SET_REGISTER: {
4297 Trace::DeferredSetRegister
4298 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4299 Trace new_trace = *trace;
4300 new_trace.add_action(&new_set);
4301 on_success()->Emit(compiler, &new_trace);
4302 break;
4303 }
4304 case CLEAR_CAPTURES: {
4305 Trace::DeferredClearCaptures
4306 new_capture(Interval(data_.u_clear_captures.range_from,
4307 data_.u_clear_captures.range_to));
4308 Trace new_trace = *trace;
4309 new_trace.add_action(&new_capture);
4310 on_success()->Emit(compiler, &new_trace);
4311 break;
4312 }
4313 case BEGIN_SUBMATCH:
4314 if (!trace->is_trivial()) {
4315 trace->Flush(compiler, this);
4316 } else {
4317 assembler->WriteCurrentPositionToRegister(
4318 data_.u_submatch.current_position_register, 0);
4319 assembler->WriteStackPointerToRegister(
4320 data_.u_submatch.stack_pointer_register);
4321 on_success()->Emit(compiler, trace);
4322 }
4323 break;
4324 case EMPTY_MATCH_CHECK: {
4325 int start_pos_reg = data_.u_empty_match_check.start_register;
4326 int stored_pos = 0;
4327 int rep_reg = data_.u_empty_match_check.repetition_register;
4328 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4329 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4330 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4331 // If we know we haven't advanced and there is no minimum we
4332 // can just backtrack immediately.
4333 assembler->GoTo(trace->backtrack());
4334 } else if (know_dist && stored_pos < trace->cp_offset()) {
4335 // If we know we've advanced we can generate the continuation
4336 // immediately.
4337 on_success()->Emit(compiler, trace);
4338 } else if (!trace->is_trivial()) {
4339 trace->Flush(compiler, this);
4340 } else {
4341 Label skip_empty_check;
4342 // If we have a minimum number of repetitions we check the current
4343 // number first and skip the empty check if it's not enough.
4344 if (has_minimum) {
4345 int limit = data_.u_empty_match_check.repetition_limit;
4346 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4347 }
4348 // If the match is empty we bail out, otherwise we fall through
4349 // to the on-success continuation.
4350 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4351 trace->backtrack());
4352 assembler->Bind(&skip_empty_check);
4353 on_success()->Emit(compiler, trace);
4354 }
4355 break;
4356 }
4357 case POSITIVE_SUBMATCH_SUCCESS: {
4358 if (!trace->is_trivial()) {
4359 trace->Flush(compiler, this);
4360 return;
4361 }
4362 assembler->ReadCurrentPositionFromRegister(
4363 data_.u_submatch.current_position_register);
4364 assembler->ReadStackPointerFromRegister(
4365 data_.u_submatch.stack_pointer_register);
4366 int clear_register_count = data_.u_submatch.clear_register_count;
4367 if (clear_register_count == 0) {
4368 on_success()->Emit(compiler, trace);
4369 return;
4370 }
4371 int clear_registers_from = data_.u_submatch.clear_register_from;
4372 Label clear_registers_backtrack;
4373 Trace new_trace = *trace;
4374 new_trace.set_backtrack(&clear_registers_backtrack);
4375 on_success()->Emit(compiler, &new_trace);
4376
4377 assembler->Bind(&clear_registers_backtrack);
4378 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4379 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4380
4381 DCHECK(trace->backtrack() == NULL);
4382 assembler->Backtrack();
4383 return;
4384 }
4385 default:
4386 UNREACHABLE();
4387 }
4388}
4389
4390
4391void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4392 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4393 if (!trace->is_trivial()) {
4394 trace->Flush(compiler, this);
4395 return;
4396 }
4397
4398 LimitResult limit_result = LimitVersions(compiler, trace);
4399 if (limit_result == DONE) return;
4400 DCHECK(limit_result == CONTINUE);
4401
4402 RecursionCheck rc(compiler);
4403
4404 DCHECK_EQ(start_reg_ + 1, end_reg_);
4405 if (compiler->ignore_case()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004406 assembler->CheckNotBackReferenceIgnoreCase(
4407 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004408 } else {
4409 assembler->CheckNotBackReference(start_reg_, read_backward(),
4410 trace->backtrack());
4411 }
4412 // We are going to advance backward, so we may end up at the start.
4413 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch097c5b22016-05-18 11:27:45 +01004414
4415 // Check that the back reference does not end inside a surrogate pair.
4416 if (compiler->unicode() && !compiler->one_byte()) {
4417 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4418 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004419 on_success()->Emit(compiler, trace);
4420}
4421
4422
4423// -------------------------------------------------------------------
4424// Dot/dotty output
4425
4426
4427#ifdef DEBUG
4428
4429
4430class DotPrinter: public NodeVisitor {
4431 public:
4432 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4433 : os_(os),
4434 ignore_case_(ignore_case) {}
4435 void PrintNode(const char* label, RegExpNode* node);
4436 void Visit(RegExpNode* node);
4437 void PrintAttributes(RegExpNode* from);
4438 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4439#define DECLARE_VISIT(Type) \
4440 virtual void Visit##Type(Type##Node* that);
4441FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4442#undef DECLARE_VISIT
4443 private:
4444 std::ostream& os_;
4445 bool ignore_case_;
4446};
4447
4448
4449void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4450 os_ << "digraph G {\n graph [label=\"";
4451 for (int i = 0; label[i]; i++) {
4452 switch (label[i]) {
4453 case '\\':
4454 os_ << "\\\\";
4455 break;
4456 case '"':
4457 os_ << "\"";
4458 break;
4459 default:
4460 os_ << label[i];
4461 break;
4462 }
4463 }
4464 os_ << "\"];\n";
4465 Visit(node);
4466 os_ << "}" << std::endl;
4467}
4468
4469
4470void DotPrinter::Visit(RegExpNode* node) {
4471 if (node->info()->visited) return;
4472 node->info()->visited = true;
4473 node->Accept(this);
4474}
4475
4476
4477void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4478 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4479 Visit(on_failure);
4480}
4481
4482
4483class TableEntryBodyPrinter {
4484 public:
4485 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4486 : os_(os),
4487 choice_(choice) {}
4488 void Call(uc16 from, DispatchTable::Entry entry) {
4489 OutSet* out_set = entry.out_set();
4490 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4491 if (out_set->Get(i)) {
4492 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4493 << choice()->alternatives()->at(i).node() << ";\n";
4494 }
4495 }
4496 }
4497 private:
4498 ChoiceNode* choice() { return choice_; }
4499 std::ostream& os_;
4500 ChoiceNode* choice_;
4501};
4502
4503
4504class TableEntryHeaderPrinter {
4505 public:
4506 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4507 : first_(true),
4508 os_(os) {}
4509 void Call(uc16 from, DispatchTable::Entry entry) {
4510 if (first_) {
4511 first_ = false;
4512 } else {
4513 os_ << "|";
4514 }
4515 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4516 OutSet* out_set = entry.out_set();
4517 int priority = 0;
4518 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4519 if (out_set->Get(i)) {
4520 if (priority > 0) os_ << "|";
4521 os_ << "<s" << from << "o" << i << "> " << priority;
4522 priority++;
4523 }
4524 }
4525 os_ << "}}";
4526 }
4527
4528 private:
4529 bool first_;
4530 std::ostream& os_;
4531};
4532
4533
4534class AttributePrinter {
4535 public:
4536 explicit AttributePrinter(std::ostream& os) // NOLINT
4537 : os_(os),
4538 first_(true) {}
4539 void PrintSeparator() {
4540 if (first_) {
4541 first_ = false;
4542 } else {
4543 os_ << "|";
4544 }
4545 }
4546 void PrintBit(const char* name, bool value) {
4547 if (!value) return;
4548 PrintSeparator();
4549 os_ << "{" << name << "}";
4550 }
4551 void PrintPositive(const char* name, int value) {
4552 if (value < 0) return;
4553 PrintSeparator();
4554 os_ << "{" << name << "|" << value << "}";
4555 }
4556
4557 private:
4558 std::ostream& os_;
4559 bool first_;
4560};
4561
4562
4563void DotPrinter::PrintAttributes(RegExpNode* that) {
4564 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4565 << "margin=0.1, fontsize=10, label=\"{";
4566 AttributePrinter printer(os_);
4567 NodeInfo* info = that->info();
4568 printer.PrintBit("NI", info->follows_newline_interest);
4569 printer.PrintBit("WI", info->follows_word_interest);
4570 printer.PrintBit("SI", info->follows_start_interest);
4571 Label* label = that->label();
4572 if (label->is_bound())
4573 printer.PrintPositive("@", label->pos());
4574 os_ << "}\"];\n"
4575 << " a" << that << " -> n" << that
4576 << " [style=dashed, color=grey, arrowhead=none];\n";
4577}
4578
4579
4580static const bool kPrintDispatchTable = false;
4581void DotPrinter::VisitChoice(ChoiceNode* that) {
4582 if (kPrintDispatchTable) {
4583 os_ << " n" << that << " [shape=Mrecord, label=\"";
4584 TableEntryHeaderPrinter header_printer(os_);
4585 that->GetTable(ignore_case_)->ForEach(&header_printer);
4586 os_ << "\"]\n";
4587 PrintAttributes(that);
4588 TableEntryBodyPrinter body_printer(os_, that);
4589 that->GetTable(ignore_case_)->ForEach(&body_printer);
4590 } else {
4591 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4592 for (int i = 0; i < that->alternatives()->length(); i++) {
4593 GuardedAlternative alt = that->alternatives()->at(i);
4594 os_ << " n" << that << " -> n" << alt.node();
4595 }
4596 }
4597 for (int i = 0; i < that->alternatives()->length(); i++) {
4598 GuardedAlternative alt = that->alternatives()->at(i);
4599 alt.node()->Accept(this);
4600 }
4601}
4602
4603
4604void DotPrinter::VisitText(TextNode* that) {
4605 Zone* zone = that->zone();
4606 os_ << " n" << that << " [label=\"";
4607 for (int i = 0; i < that->elements()->length(); i++) {
4608 if (i > 0) os_ << " ";
4609 TextElement elm = that->elements()->at(i);
4610 switch (elm.text_type()) {
4611 case TextElement::ATOM: {
4612 Vector<const uc16> data = elm.atom()->data();
4613 for (int i = 0; i < data.length(); i++) {
4614 os_ << static_cast<char>(data[i]);
4615 }
4616 break;
4617 }
4618 case TextElement::CHAR_CLASS: {
4619 RegExpCharacterClass* node = elm.char_class();
4620 os_ << "[";
4621 if (node->is_negated()) os_ << "^";
4622 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4623 CharacterRange range = node->ranges(zone)->at(j);
4624 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4625 }
4626 os_ << "]";
4627 break;
4628 }
4629 default:
4630 UNREACHABLE();
4631 }
4632 }
4633 os_ << "\", shape=box, peripheries=2];\n";
4634 PrintAttributes(that);
4635 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4636 Visit(that->on_success());
4637}
4638
4639
4640void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4641 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4642 << that->end_register() << "\", shape=doubleoctagon];\n";
4643 PrintAttributes(that);
4644 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4645 Visit(that->on_success());
4646}
4647
4648
4649void DotPrinter::VisitEnd(EndNode* that) {
4650 os_ << " n" << that << " [style=bold, shape=point];\n";
4651 PrintAttributes(that);
4652}
4653
4654
4655void DotPrinter::VisitAssertion(AssertionNode* that) {
4656 os_ << " n" << that << " [";
4657 switch (that->assertion_type()) {
4658 case AssertionNode::AT_END:
4659 os_ << "label=\"$\", shape=septagon";
4660 break;
4661 case AssertionNode::AT_START:
4662 os_ << "label=\"^\", shape=septagon";
4663 break;
4664 case AssertionNode::AT_BOUNDARY:
4665 os_ << "label=\"\\b\", shape=septagon";
4666 break;
4667 case AssertionNode::AT_NON_BOUNDARY:
4668 os_ << "label=\"\\B\", shape=septagon";
4669 break;
4670 case AssertionNode::AFTER_NEWLINE:
4671 os_ << "label=\"(?<=\\n)\", shape=septagon";
4672 break;
4673 }
4674 os_ << "];\n";
4675 PrintAttributes(that);
4676 RegExpNode* successor = that->on_success();
4677 os_ << " n" << that << " -> n" << successor << ";\n";
4678 Visit(successor);
4679}
4680
4681
4682void DotPrinter::VisitAction(ActionNode* that) {
4683 os_ << " n" << that << " [";
4684 switch (that->action_type_) {
4685 case ActionNode::SET_REGISTER:
4686 os_ << "label=\"$" << that->data_.u_store_register.reg
4687 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4688 break;
4689 case ActionNode::INCREMENT_REGISTER:
4690 os_ << "label=\"$" << that->data_.u_increment_register.reg
4691 << "++\", shape=octagon";
4692 break;
4693 case ActionNode::STORE_POSITION:
4694 os_ << "label=\"$" << that->data_.u_position_register.reg
4695 << ":=$pos\", shape=octagon";
4696 break;
4697 case ActionNode::BEGIN_SUBMATCH:
4698 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4699 << ":=$pos,begin\", shape=septagon";
4700 break;
4701 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4702 os_ << "label=\"escape\", shape=septagon";
4703 break;
4704 case ActionNode::EMPTY_MATCH_CHECK:
4705 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4706 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4707 << "<" << that->data_.u_empty_match_check.repetition_limit
4708 << "?\", shape=septagon";
4709 break;
4710 case ActionNode::CLEAR_CAPTURES: {
4711 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4712 << " to $" << that->data_.u_clear_captures.range_to
4713 << "\", shape=septagon";
4714 break;
4715 }
4716 }
4717 os_ << "];\n";
4718 PrintAttributes(that);
4719 RegExpNode* successor = that->on_success();
4720 os_ << " n" << that << " -> n" << successor << ";\n";
4721 Visit(successor);
4722}
4723
4724
4725class DispatchTableDumper {
4726 public:
4727 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4728 void Call(uc16 key, DispatchTable::Entry entry);
4729 private:
4730 std::ostream& os_;
4731};
4732
4733
4734void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4735 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4736 OutSet* set = entry.out_set();
4737 bool first = true;
4738 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4739 if (set->Get(i)) {
4740 if (first) {
4741 first = false;
4742 } else {
4743 os_ << ", ";
4744 }
4745 os_ << i;
4746 }
4747 }
4748 os_ << "}\n";
4749}
4750
4751
4752void DispatchTable::Dump() {
4753 OFStream os(stderr);
4754 DispatchTableDumper dumper(os);
4755 tree()->ForEach(&dumper);
4756}
4757
4758
4759void RegExpEngine::DotPrint(const char* label,
4760 RegExpNode* node,
4761 bool ignore_case) {
4762 OFStream os(stdout);
4763 DotPrinter printer(os, ignore_case);
4764 printer.PrintNode(label, node);
4765}
4766
4767
4768#endif // DEBUG
4769
4770
4771// -------------------------------------------------------------------
4772// Tree to graph conversion
4773
4774RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4775 RegExpNode* on_success) {
4776 ZoneList<TextElement>* elms =
4777 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4778 elms->Add(TextElement::Atom(this), compiler->zone());
4779 return new (compiler->zone())
4780 TextNode(elms, compiler->read_backward(), on_success);
4781}
4782
4783
4784RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4785 RegExpNode* on_success) {
4786 return new (compiler->zone())
4787 TextNode(elements(), compiler->read_backward(), on_success);
4788}
4789
4790
4791static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4792 const int* special_class,
4793 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004794 length--; // Remove final marker.
4795 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004796 DCHECK(ranges->length() != 0);
4797 DCHECK(length != 0);
4798 DCHECK(special_class[0] != 0);
4799 if (ranges->length() != (length >> 1) + 1) {
4800 return false;
4801 }
4802 CharacterRange range = ranges->at(0);
4803 if (range.from() != 0) {
4804 return false;
4805 }
4806 for (int i = 0; i < length; i += 2) {
4807 if (special_class[i] != (range.to() + 1)) {
4808 return false;
4809 }
4810 range = ranges->at((i >> 1) + 1);
4811 if (special_class[i+1] != range.from()) {
4812 return false;
4813 }
4814 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01004815 if (range.to() != String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004816 return false;
4817 }
4818 return true;
4819}
4820
4821
4822static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4823 const int* special_class,
4824 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004825 length--; // Remove final marker.
4826 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004827 if (ranges->length() * 2 != length) {
4828 return false;
4829 }
4830 for (int i = 0; i < length; i += 2) {
4831 CharacterRange range = ranges->at(i >> 1);
4832 if (range.from() != special_class[i] ||
4833 range.to() != special_class[i + 1] - 1) {
4834 return false;
4835 }
4836 }
4837 return true;
4838}
4839
4840
4841bool RegExpCharacterClass::is_standard(Zone* zone) {
4842 // TODO(lrn): Remove need for this function, by not throwing away information
4843 // along the way.
4844 if (is_negated_) {
4845 return false;
4846 }
4847 if (set_.is_standard()) {
4848 return true;
4849 }
4850 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4851 set_.set_standard_set_type('s');
4852 return true;
4853 }
4854 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4855 set_.set_standard_set_type('S');
4856 return true;
4857 }
4858 if (CompareInverseRanges(set_.ranges(zone),
4859 kLineTerminatorRanges,
4860 kLineTerminatorRangeCount)) {
4861 set_.set_standard_set_type('.');
4862 return true;
4863 }
4864 if (CompareRanges(set_.ranges(zone),
4865 kLineTerminatorRanges,
4866 kLineTerminatorRangeCount)) {
4867 set_.set_standard_set_type('n');
4868 return true;
4869 }
4870 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4871 set_.set_standard_set_type('w');
4872 return true;
4873 }
4874 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4875 set_.set_standard_set_type('W');
4876 return true;
4877 }
4878 return false;
4879}
4880
4881
Ben Murdoch097c5b22016-05-18 11:27:45 +01004882UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4883 ZoneList<CharacterRange>* base)
4884 : zone_(zone),
4885 table_(zone),
4886 bmp_(nullptr),
4887 lead_surrogates_(nullptr),
4888 trail_surrogates_(nullptr),
4889 non_bmp_(nullptr) {
4890 // The unicode range splitter categorizes given character ranges into:
4891 // - Code points from the BMP representable by one code unit.
4892 // - Code points outside the BMP that need to be split into surrogate pairs.
4893 // - Lone lead surrogates.
4894 // - Lone trail surrogates.
4895 // Lone surrogates are valid code points, even though no actual characters.
4896 // They require special matching to make sure we do not split surrogate pairs.
4897 // We use the dispatch table to accomplish this. The base range is split up
4898 // by the table by the overlay ranges, and the Call callback is used to
4899 // filter and collect ranges for each category.
4900 for (int i = 0; i < base->length(); i++) {
4901 table_.AddRange(base->at(i), kBase, zone_);
4902 }
4903 // Add overlay ranges.
4904 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4905 kBmpCodePoints, zone_);
4906 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4907 kLeadSurrogates, zone_);
4908 table_.AddRange(
4909 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4910 kTrailSurrogates, zone_);
4911 table_.AddRange(
4912 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4913 kBmpCodePoints, zone_);
4914 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4915 kNonBmpCodePoints, zone_);
4916 table_.ForEach(this);
4917}
4918
4919
4920void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4921 OutSet* outset = entry.out_set();
4922 if (!outset->Get(kBase)) return;
4923 ZoneList<CharacterRange>** target = NULL;
4924 if (outset->Get(kBmpCodePoints)) {
4925 target = &bmp_;
4926 } else if (outset->Get(kLeadSurrogates)) {
4927 target = &lead_surrogates_;
4928 } else if (outset->Get(kTrailSurrogates)) {
4929 target = &trail_surrogates_;
4930 } else {
4931 DCHECK(outset->Get(kNonBmpCodePoints));
4932 target = &non_bmp_;
4933 }
4934 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4935 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4936}
4937
4938
4939void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4940 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4941 ZoneList<CharacterRange>* bmp = splitter->bmp();
4942 if (bmp == nullptr) return;
4943 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4944 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4945}
4946
4947
4948void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4949 RegExpNode* on_success,
4950 UnicodeRangeSplitter* splitter) {
4951 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4952 if (non_bmp == nullptr) return;
4953 DCHECK(compiler->unicode());
4954 DCHECK(!compiler->one_byte());
4955 Zone* zone = compiler->zone();
4956 CharacterRange::Canonicalize(non_bmp);
4957 for (int i = 0; i < non_bmp->length(); i++) {
4958 // Match surrogate pair.
4959 // E.g. [\u10005-\u11005] becomes
4960 // \ud800[\udc05-\udfff]|
4961 // [\ud801-\ud803][\udc00-\udfff]|
4962 // \ud804[\udc00-\udc05]
4963 uc32 from = non_bmp->at(i).from();
4964 uc32 to = non_bmp->at(i).to();
4965 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4966 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4967 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4968 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4969 if (from_l == to_l) {
4970 // The lead surrogate is the same.
4971 result->AddAlternative(
4972 GuardedAlternative(TextNode::CreateForSurrogatePair(
4973 zone, CharacterRange::Singleton(from_l),
4974 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4975 on_success)));
4976 } else {
4977 if (from_t != kTrailSurrogateStart) {
4978 // Add [from_l][from_t-\udfff]
4979 result->AddAlternative(
4980 GuardedAlternative(TextNode::CreateForSurrogatePair(
4981 zone, CharacterRange::Singleton(from_l),
4982 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4983 compiler->read_backward(), on_success)));
4984 from_l++;
4985 }
4986 if (to_t != kTrailSurrogateEnd) {
4987 // Add [to_l][\udc00-to_t]
4988 result->AddAlternative(
4989 GuardedAlternative(TextNode::CreateForSurrogatePair(
4990 zone, CharacterRange::Singleton(to_l),
4991 CharacterRange::Range(kTrailSurrogateStart, to_t),
4992 compiler->read_backward(), on_success)));
4993 to_l--;
4994 }
4995 if (from_l <= to_l) {
4996 // Add [from_l-to_l][\udc00-\udfff]
4997 result->AddAlternative(
4998 GuardedAlternative(TextNode::CreateForSurrogatePair(
4999 zone, CharacterRange::Range(from_l, to_l),
5000 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5001 compiler->read_backward(), on_success)));
5002 }
5003 }
5004 }
5005}
5006
5007
5008RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5009 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5010 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5011 bool read_backward) {
5012 Zone* zone = compiler->zone();
5013 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5014 zone, match, read_backward, on_success);
5015 int stack_register = compiler->UnicodeLookaroundStackRegister();
5016 int position_register = compiler->UnicodeLookaroundPositionRegister();
5017 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5018 position_register);
5019 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5020 zone, lookbehind, !read_backward, lookaround.on_match_success());
5021 return lookaround.ForMatch(negative_match);
5022}
5023
5024
5025RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5026 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5027 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5028 bool read_backward) {
5029 Zone* zone = compiler->zone();
5030 int stack_register = compiler->UnicodeLookaroundStackRegister();
5031 int position_register = compiler->UnicodeLookaroundPositionRegister();
5032 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5033 position_register);
5034 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5035 zone, lookahead, read_backward, lookaround.on_match_success());
5036 return TextNode::CreateForCharacterRanges(
5037 zone, match, read_backward, lookaround.ForMatch(negative_match));
5038}
5039
5040
5041void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5042 RegExpNode* on_success,
5043 UnicodeRangeSplitter* splitter) {
5044 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5045 if (lead_surrogates == nullptr) return;
5046 Zone* zone = compiler->zone();
5047 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5048 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5049 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5050
5051 RegExpNode* match;
5052 if (compiler->read_backward()) {
5053 // Reading backward. Assert that reading forward, there is no trail
5054 // surrogate, and then backward match the lead surrogate.
5055 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5056 compiler, trail_surrogates, lead_surrogates, on_success, true);
5057 } else {
5058 // Reading forward. Forward match the lead surrogate and assert that
5059 // no trail surrogate follows.
5060 match = MatchAndNegativeLookaroundInReadDirection(
5061 compiler, lead_surrogates, trail_surrogates, on_success, false);
5062 }
5063 result->AddAlternative(GuardedAlternative(match));
5064}
5065
5066
5067void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5068 RegExpNode* on_success,
5069 UnicodeRangeSplitter* splitter) {
5070 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5071 if (trail_surrogates == nullptr) return;
5072 Zone* zone = compiler->zone();
5073 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5074 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5075 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5076
5077 RegExpNode* match;
5078 if (compiler->read_backward()) {
5079 // Reading backward. Backward match the trail surrogate and assert that no
5080 // lead surrogate precedes it.
5081 match = MatchAndNegativeLookaroundInReadDirection(
5082 compiler, trail_surrogates, lead_surrogates, on_success, true);
5083 } else {
5084 // Reading forward. Assert that reading backward, there is no lead
5085 // surrogate, and then forward match the trail surrogate.
5086 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5087 compiler, lead_surrogates, trail_surrogates, on_success, false);
5088 }
5089 result->AddAlternative(GuardedAlternative(match));
5090}
5091
5092RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5093 RegExpNode* on_success) {
5094 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5095 DCHECK(!compiler->read_backward());
5096 Zone* zone = compiler->zone();
5097 // Advance any character. If the character happens to be a lead surrogate and
5098 // we advanced into the middle of a surrogate pair, it will work out, as
5099 // nothing will match from there. We will have to advance again, consuming
5100 // the associated trail surrogate.
5101 ZoneList<CharacterRange>* range = CharacterRange::List(
5102 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5103 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5104}
5105
5106
5107void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5108 ZoneList<CharacterRange>* ranges) {
5109#ifdef V8_I18N_SUPPORT
5110 // Use ICU to compute the case fold closure over the ranges.
5111 DCHECK(compiler->unicode());
5112 DCHECK(compiler->ignore_case());
5113 USet* set = uset_openEmpty();
5114 for (int i = 0; i < ranges->length(); i++) {
5115 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5116 }
5117 ranges->Clear();
5118 uset_closeOver(set, USET_CASE_INSENSITIVE);
5119 // Full case mapping map single characters to multiple characters.
5120 // Those are represented as strings in the set. Remove them so that
5121 // we end up with only simple and common case mappings.
5122 uset_removeAllStrings(set);
5123 int item_count = uset_getItemCount(set);
5124 int item_result = 0;
5125 UErrorCode ec = U_ZERO_ERROR;
5126 Zone* zone = compiler->zone();
5127 for (int i = 0; i < item_count; i++) {
5128 uc32 start = 0;
5129 uc32 end = 0;
5130 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5131 ranges->Add(CharacterRange::Range(start, end), zone);
5132 }
5133 // No errors and everything we collected have been ranges.
5134 DCHECK_EQ(U_ZERO_ERROR, ec);
5135 DCHECK_EQ(0, item_result);
5136 uset_close(set);
5137#else
5138 // Fallback if ICU is not included.
5139 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5140 ranges, compiler->one_byte());
5141#endif // V8_I18N_SUPPORT
5142 CharacterRange::Canonicalize(ranges);
5143}
5144
5145
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005146RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5147 RegExpNode* on_success) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005148 set_.Canonicalize();
5149 Zone* zone = compiler->zone();
5150 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5151 if (compiler->unicode() && compiler->ignore_case()) {
5152 AddUnicodeCaseEquivalents(compiler, ranges);
5153 }
5154 if (compiler->unicode() && !compiler->one_byte()) {
5155 if (is_negated()) {
5156 ZoneList<CharacterRange>* negated =
5157 new (zone) ZoneList<CharacterRange>(2, zone);
5158 CharacterRange::Negate(ranges, negated, zone);
5159 ranges = negated;
5160 }
5161 if (ranges->length() == 0) {
5162 // No matches possible.
5163 return new (zone) EndNode(EndNode::BACKTRACK, zone);
5164 }
5165 if (standard_type() == '*') {
5166 return UnanchoredAdvance(compiler, on_success);
5167 } else {
5168 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5169 UnicodeRangeSplitter splitter(zone, ranges);
5170 AddBmpCharacters(compiler, result, on_success, &splitter);
5171 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5172 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5173 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5174 return result;
5175 }
5176 } else {
5177 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5178 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005179}
5180
5181
5182int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5183 RegExpAtom* atom1 = (*a)->AsAtom();
5184 RegExpAtom* atom2 = (*b)->AsAtom();
5185 uc16 character1 = atom1->data().at(0);
5186 uc16 character2 = atom2->data().at(0);
5187 if (character1 < character2) return -1;
5188 if (character1 > character2) return 1;
5189 return 0;
5190}
5191
5192
5193static unibrow::uchar Canonical(
5194 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5195 unibrow::uchar c) {
5196 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5197 int length = canonicalize->get(c, '\0', chars);
5198 DCHECK_LE(length, 1);
5199 unibrow::uchar canonical = c;
5200 if (length == 1) canonical = chars[0];
5201 return canonical;
5202}
5203
5204
5205int CompareFirstCharCaseIndependent(
5206 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5207 RegExpTree* const* a, RegExpTree* const* b) {
5208 RegExpAtom* atom1 = (*a)->AsAtom();
5209 RegExpAtom* atom2 = (*b)->AsAtom();
5210 unibrow::uchar character1 = atom1->data().at(0);
5211 unibrow::uchar character2 = atom2->data().at(0);
5212 if (character1 == character2) return 0;
5213 if (character1 >= 'a' || character2 >= 'a') {
5214 character1 = Canonical(canonicalize, character1);
5215 character2 = Canonical(canonicalize, character2);
5216 }
5217 return static_cast<int>(character1) - static_cast<int>(character2);
5218}
5219
5220
5221// We can stable sort runs of atoms, since the order does not matter if they
5222// start with different characters.
5223// Returns true if any consecutive atoms were found.
5224bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5225 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5226 int length = alternatives->length();
5227 bool found_consecutive_atoms = false;
5228 for (int i = 0; i < length; i++) {
5229 while (i < length) {
5230 RegExpTree* alternative = alternatives->at(i);
5231 if (alternative->IsAtom()) break;
5232 i++;
5233 }
5234 // i is length or it is the index of an atom.
5235 if (i == length) break;
5236 int first_atom = i;
5237 i++;
5238 while (i < length) {
5239 RegExpTree* alternative = alternatives->at(i);
5240 if (!alternative->IsAtom()) break;
5241 i++;
5242 }
5243 // Sort atoms to get ones with common prefixes together.
5244 // This step is more tricky if we are in a case-independent regexp,
5245 // because it would change /is|I/ to /I|is/, and order matters when
5246 // the regexp parts don't match only disjoint starting points. To fix
5247 // this we have a version of CompareFirstChar that uses case-
5248 // independent character classes for comparison.
5249 DCHECK_LT(first_atom, alternatives->length());
5250 DCHECK_LE(i, alternatives->length());
5251 DCHECK_LE(first_atom, i);
5252 if (compiler->ignore_case()) {
5253 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5254 compiler->isolate()->regexp_macro_assembler_canonicalize();
5255 auto compare_closure =
5256 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5257 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5258 };
5259 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5260 } else {
5261 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5262 }
5263 if (i - first_atom > 1) found_consecutive_atoms = true;
5264 }
5265 return found_consecutive_atoms;
5266}
5267
5268
5269// Optimizes ab|ac|az to a(?:b|c|d).
5270void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5271 Zone* zone = compiler->zone();
5272 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5273 int length = alternatives->length();
5274
5275 int write_posn = 0;
5276 int i = 0;
5277 while (i < length) {
5278 RegExpTree* alternative = alternatives->at(i);
5279 if (!alternative->IsAtom()) {
5280 alternatives->at(write_posn++) = alternatives->at(i);
5281 i++;
5282 continue;
5283 }
5284 RegExpAtom* atom = alternative->AsAtom();
5285 unibrow::uchar common_prefix = atom->data().at(0);
5286 int first_with_prefix = i;
5287 int prefix_length = atom->length();
5288 i++;
5289 while (i < length) {
5290 alternative = alternatives->at(i);
5291 if (!alternative->IsAtom()) break;
5292 atom = alternative->AsAtom();
5293 unibrow::uchar new_prefix = atom->data().at(0);
5294 if (new_prefix != common_prefix) {
5295 if (!compiler->ignore_case()) break;
5296 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5297 compiler->isolate()->regexp_macro_assembler_canonicalize();
5298 new_prefix = Canonical(canonicalize, new_prefix);
5299 common_prefix = Canonical(canonicalize, common_prefix);
5300 if (new_prefix != common_prefix) break;
5301 }
5302 prefix_length = Min(prefix_length, atom->length());
5303 i++;
5304 }
5305 if (i > first_with_prefix + 2) {
5306 // Found worthwhile run of alternatives with common prefix of at least one
5307 // character. The sorting function above did not sort on more than one
5308 // character for reasons of correctness, but there may still be a longer
5309 // common prefix if the terms were similar or presorted in the input.
5310 // Find out how long the common prefix is.
5311 int run_length = i - first_with_prefix;
5312 atom = alternatives->at(first_with_prefix)->AsAtom();
5313 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5314 RegExpAtom* old_atom =
5315 alternatives->at(j + first_with_prefix)->AsAtom();
5316 for (int k = 1; k < prefix_length; k++) {
5317 if (atom->data().at(k) != old_atom->data().at(k)) {
5318 prefix_length = k;
5319 break;
5320 }
5321 }
5322 }
5323 RegExpAtom* prefix =
5324 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5325 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5326 pair->Add(prefix, zone);
5327 ZoneList<RegExpTree*>* suffixes =
5328 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5329 for (int j = 0; j < run_length; j++) {
5330 RegExpAtom* old_atom =
5331 alternatives->at(j + first_with_prefix)->AsAtom();
5332 int len = old_atom->length();
5333 if (len == prefix_length) {
5334 suffixes->Add(new (zone) RegExpEmpty(), zone);
5335 } else {
5336 RegExpTree* suffix = new (zone) RegExpAtom(
5337 old_atom->data().SubVector(prefix_length, old_atom->length()));
5338 suffixes->Add(suffix, zone);
5339 }
5340 }
5341 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5342 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5343 } else {
5344 // Just copy any non-worthwhile alternatives.
5345 for (int j = first_with_prefix; j < i; j++) {
5346 alternatives->at(write_posn++) = alternatives->at(j);
5347 }
5348 }
5349 }
5350 alternatives->Rewind(write_posn); // Trim end of array.
5351}
5352
5353
5354// Optimizes b|c|z to [bcz].
5355void RegExpDisjunction::FixSingleCharacterDisjunctions(
5356 RegExpCompiler* compiler) {
5357 Zone* zone = compiler->zone();
5358 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5359 int length = alternatives->length();
5360
5361 int write_posn = 0;
5362 int i = 0;
5363 while (i < length) {
5364 RegExpTree* alternative = alternatives->at(i);
5365 if (!alternative->IsAtom()) {
5366 alternatives->at(write_posn++) = alternatives->at(i);
5367 i++;
5368 continue;
5369 }
5370 RegExpAtom* atom = alternative->AsAtom();
5371 if (atom->length() != 1) {
5372 alternatives->at(write_posn++) = alternatives->at(i);
5373 i++;
5374 continue;
5375 }
5376 int first_in_run = i;
5377 i++;
5378 while (i < length) {
5379 alternative = alternatives->at(i);
5380 if (!alternative->IsAtom()) break;
5381 atom = alternative->AsAtom();
5382 if (atom->length() != 1) break;
5383 i++;
5384 }
5385 if (i > first_in_run + 1) {
5386 // Found non-trivial run of single-character alternatives.
5387 int run_length = i - first_in_run;
5388 ZoneList<CharacterRange>* ranges =
5389 new (zone) ZoneList<CharacterRange>(2, zone);
5390 for (int j = 0; j < run_length; j++) {
5391 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5392 DCHECK_EQ(old_atom->length(), 1);
5393 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5394 }
5395 alternatives->at(write_posn++) =
5396 new (zone) RegExpCharacterClass(ranges, false);
5397 } else {
5398 // Just copy any trivial alternatives.
5399 for (int j = first_in_run; j < i; j++) {
5400 alternatives->at(write_posn++) = alternatives->at(j);
5401 }
5402 }
5403 }
5404 alternatives->Rewind(write_posn); // Trim end of array.
5405}
5406
5407
5408RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5409 RegExpNode* on_success) {
5410 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5411
5412 if (alternatives->length() > 2) {
5413 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5414 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5415 FixSingleCharacterDisjunctions(compiler);
5416 if (alternatives->length() == 1) {
5417 return alternatives->at(0)->ToNode(compiler, on_success);
5418 }
5419 }
5420
5421 int length = alternatives->length();
5422
5423 ChoiceNode* result =
5424 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5425 for (int i = 0; i < length; i++) {
5426 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5427 on_success));
5428 result->AddAlternative(alternative);
5429 }
5430 return result;
5431}
5432
5433
5434RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5435 RegExpNode* on_success) {
5436 return ToNode(min(),
5437 max(),
5438 is_greedy(),
5439 body(),
5440 compiler,
5441 on_success);
5442}
5443
5444
5445// Scoped object to keep track of how much we unroll quantifier loops in the
5446// regexp graph generator.
5447class RegExpExpansionLimiter {
5448 public:
5449 static const int kMaxExpansionFactor = 6;
5450 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5451 : compiler_(compiler),
5452 saved_expansion_factor_(compiler->current_expansion_factor()),
5453 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5454 DCHECK(factor > 0);
5455 if (ok_to_expand_) {
5456 if (factor > kMaxExpansionFactor) {
5457 // Avoid integer overflow of the current expansion factor.
5458 ok_to_expand_ = false;
5459 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5460 } else {
5461 int new_factor = saved_expansion_factor_ * factor;
5462 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5463 compiler->set_current_expansion_factor(new_factor);
5464 }
5465 }
5466 }
5467
5468 ~RegExpExpansionLimiter() {
5469 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5470 }
5471
5472 bool ok_to_expand() { return ok_to_expand_; }
5473
5474 private:
5475 RegExpCompiler* compiler_;
5476 int saved_expansion_factor_;
5477 bool ok_to_expand_;
5478
5479 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5480};
5481
5482
5483RegExpNode* RegExpQuantifier::ToNode(int min,
5484 int max,
5485 bool is_greedy,
5486 RegExpTree* body,
5487 RegExpCompiler* compiler,
5488 RegExpNode* on_success,
5489 bool not_at_start) {
5490 // x{f, t} becomes this:
5491 //
5492 // (r++)<-.
5493 // | `
5494 // | (x)
5495 // v ^
5496 // (r=0)-->(?)---/ [if r < t]
5497 // |
5498 // [if r >= f] \----> ...
5499 //
5500
5501 // 15.10.2.5 RepeatMatcher algorithm.
5502 // The parser has already eliminated the case where max is 0. In the case
5503 // where max_match is zero the parser has removed the quantifier if min was
5504 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5505
5506 // If we know that we cannot match zero length then things are a little
5507 // simpler since we don't need to make the special zero length match check
5508 // from step 2.1. If the min and max are small we can unroll a little in
5509 // this case.
5510 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5511 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5512 if (max == 0) return on_success; // This can happen due to recursion.
5513 bool body_can_be_empty = (body->min_match() == 0);
5514 int body_start_reg = RegExpCompiler::kNoRegister;
5515 Interval capture_registers = body->CaptureRegisters();
5516 bool needs_capture_clearing = !capture_registers.is_empty();
5517 Zone* zone = compiler->zone();
5518
5519 if (body_can_be_empty) {
5520 body_start_reg = compiler->AllocateRegister();
5521 } else if (compiler->optimize() && !needs_capture_clearing) {
5522 // Only unroll if there are no captures and the body can't be
5523 // empty.
5524 {
5525 RegExpExpansionLimiter limiter(
5526 compiler, min + ((max != min) ? 1 : 0));
5527 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5528 int new_max = (max == kInfinity) ? max : max - min;
5529 // Recurse once to get the loop or optional matches after the fixed
5530 // ones.
5531 RegExpNode* answer = ToNode(
5532 0, new_max, is_greedy, body, compiler, on_success, true);
5533 // Unroll the forced matches from 0 to min. This can cause chains of
5534 // TextNodes (which the parser does not generate). These should be
5535 // combined if it turns out they hinder good code generation.
5536 for (int i = 0; i < min; i++) {
5537 answer = body->ToNode(compiler, answer);
5538 }
5539 return answer;
5540 }
5541 }
5542 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5543 DCHECK(max > 0); // Due to the 'if' above.
5544 RegExpExpansionLimiter limiter(compiler, max);
5545 if (limiter.ok_to_expand()) {
5546 // Unroll the optional matches up to max.
5547 RegExpNode* answer = on_success;
5548 for (int i = 0; i < max; i++) {
5549 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5550 if (is_greedy) {
5551 alternation->AddAlternative(
5552 GuardedAlternative(body->ToNode(compiler, answer)));
5553 alternation->AddAlternative(GuardedAlternative(on_success));
5554 } else {
5555 alternation->AddAlternative(GuardedAlternative(on_success));
5556 alternation->AddAlternative(
5557 GuardedAlternative(body->ToNode(compiler, answer)));
5558 }
5559 answer = alternation;
5560 if (not_at_start && !compiler->read_backward()) {
5561 alternation->set_not_at_start();
5562 }
5563 }
5564 return answer;
5565 }
5566 }
5567 }
5568 bool has_min = min > 0;
5569 bool has_max = max < RegExpTree::kInfinity;
5570 bool needs_counter = has_min || has_max;
5571 int reg_ctr = needs_counter
5572 ? compiler->AllocateRegister()
5573 : RegExpCompiler::kNoRegister;
5574 LoopChoiceNode* center = new (zone)
5575 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5576 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5577 RegExpNode* loop_return = needs_counter
5578 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5579 : static_cast<RegExpNode*>(center);
5580 if (body_can_be_empty) {
5581 // If the body can be empty we need to check if it was and then
5582 // backtrack.
5583 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5584 reg_ctr,
5585 min,
5586 loop_return);
5587 }
5588 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5589 if (body_can_be_empty) {
5590 // If the body can be empty we need to store the start position
5591 // so we can bail out if it was empty.
5592 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5593 }
5594 if (needs_capture_clearing) {
5595 // Before entering the body of this loop we need to clear captures.
5596 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5597 }
5598 GuardedAlternative body_alt(body_node);
5599 if (has_max) {
5600 Guard* body_guard =
5601 new(zone) Guard(reg_ctr, Guard::LT, max);
5602 body_alt.AddGuard(body_guard, zone);
5603 }
5604 GuardedAlternative rest_alt(on_success);
5605 if (has_min) {
5606 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5607 rest_alt.AddGuard(rest_guard, zone);
5608 }
5609 if (is_greedy) {
5610 center->AddLoopAlternative(body_alt);
5611 center->AddContinueAlternative(rest_alt);
5612 } else {
5613 center->AddContinueAlternative(rest_alt);
5614 center->AddLoopAlternative(body_alt);
5615 }
5616 if (needs_counter) {
5617 return ActionNode::SetRegister(reg_ctr, 0, center);
5618 } else {
5619 return center;
5620 }
5621}
5622
5623
5624RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5625 RegExpNode* on_success) {
5626 NodeInfo info;
5627 Zone* zone = compiler->zone();
5628
5629 switch (assertion_type()) {
5630 case START_OF_LINE:
5631 return AssertionNode::AfterNewline(on_success);
5632 case START_OF_INPUT:
5633 return AssertionNode::AtStart(on_success);
5634 case BOUNDARY:
5635 return AssertionNode::AtBoundary(on_success);
5636 case NON_BOUNDARY:
5637 return AssertionNode::AtNonBoundary(on_success);
5638 case END_OF_INPUT:
5639 return AssertionNode::AtEnd(on_success);
5640 case END_OF_LINE: {
5641 // Compile $ in multiline regexps as an alternation with a positive
5642 // lookahead in one side and an end-of-input on the other side.
5643 // We need two registers for the lookahead.
5644 int stack_pointer_register = compiler->AllocateRegister();
5645 int position_register = compiler->AllocateRegister();
5646 // The ChoiceNode to distinguish between a newline and end-of-input.
5647 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5648 // Create a newline atom.
5649 ZoneList<CharacterRange>* newline_ranges =
5650 new(zone) ZoneList<CharacterRange>(3, zone);
5651 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5652 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5653 TextNode* newline_matcher = new (zone) TextNode(
5654 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5655 stack_pointer_register, position_register,
5656 0, // No captures inside.
5657 -1, // Ignored if no captures.
5658 on_success));
5659 // Create an end-of-input matcher.
5660 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5661 stack_pointer_register,
5662 position_register,
5663 newline_matcher);
5664 // Add the two alternatives to the ChoiceNode.
5665 GuardedAlternative eol_alternative(end_of_line);
5666 result->AddAlternative(eol_alternative);
5667 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5668 result->AddAlternative(end_alternative);
5669 return result;
5670 }
5671 default:
5672 UNREACHABLE();
5673 }
5674 return on_success;
5675}
5676
5677
5678RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5679 RegExpNode* on_success) {
5680 return new (compiler->zone())
5681 BackReferenceNode(RegExpCapture::StartRegister(index()),
5682 RegExpCapture::EndRegister(index()),
5683 compiler->read_backward(), on_success);
5684}
5685
5686
5687RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5688 RegExpNode* on_success) {
5689 return on_success;
5690}
5691
5692
Ben Murdoch097c5b22016-05-18 11:27:45 +01005693RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5694 int stack_pointer_register,
5695 int position_register,
5696 int capture_register_count,
5697 int capture_register_start)
5698 : is_positive_(is_positive),
5699 on_success_(on_success),
5700 stack_pointer_register_(stack_pointer_register),
5701 position_register_(position_register) {
5702 if (is_positive_) {
5703 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5704 stack_pointer_register, position_register, capture_register_count,
5705 capture_register_start, on_success_);
5706 } else {
5707 Zone* zone = on_success_->zone();
5708 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5709 stack_pointer_register, position_register, capture_register_count,
5710 capture_register_start, zone);
5711 }
5712}
5713
5714
5715RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5716 if (is_positive_) {
5717 return ActionNode::BeginSubmatch(stack_pointer_register_,
5718 position_register_, match);
5719 } else {
5720 Zone* zone = on_success_->zone();
5721 // We use a ChoiceNode to represent the negative lookaround. The first
5722 // alternative is the negative match. On success, the end node backtracks.
5723 // On failure, the second alternative is tried and leads to success.
5724 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5725 // first exit when calculating quick checks.
5726 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5727 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5728 return ActionNode::BeginSubmatch(stack_pointer_register_,
5729 position_register_, choice_node);
5730 }
5731}
5732
5733
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005734RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5735 RegExpNode* on_success) {
5736 int stack_pointer_register = compiler->AllocateRegister();
5737 int position_register = compiler->AllocateRegister();
5738
5739 const int registers_per_capture = 2;
5740 const int register_of_first_capture = 2;
5741 int register_count = capture_count_ * registers_per_capture;
5742 int register_start =
5743 register_of_first_capture + capture_from_ * registers_per_capture;
5744
5745 RegExpNode* result;
5746 bool was_reading_backward = compiler->read_backward();
5747 compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005748 Builder builder(is_positive(), on_success, stack_pointer_register,
5749 position_register, register_count, register_start);
5750 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5751 result = builder.ForMatch(match);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005752 compiler->set_read_backward(was_reading_backward);
5753 return result;
5754}
5755
5756
5757RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5758 RegExpNode* on_success) {
5759 return ToNode(body(), index(), compiler, on_success);
5760}
5761
5762
5763RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5764 int index,
5765 RegExpCompiler* compiler,
5766 RegExpNode* on_success) {
5767 DCHECK_NOT_NULL(body);
5768 int start_reg = RegExpCapture::StartRegister(index);
5769 int end_reg = RegExpCapture::EndRegister(index);
5770 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5771 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5772 RegExpNode* body_node = body->ToNode(compiler, store_end);
5773 return ActionNode::StorePosition(start_reg, true, body_node);
5774}
5775
5776
5777RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5778 RegExpNode* on_success) {
5779 ZoneList<RegExpTree*>* children = nodes();
5780 RegExpNode* current = on_success;
5781 if (compiler->read_backward()) {
5782 for (int i = 0; i < children->length(); i++) {
5783 current = children->at(i)->ToNode(compiler, current);
5784 }
5785 } else {
5786 for (int i = children->length() - 1; i >= 0; i--) {
5787 current = children->at(i)->ToNode(compiler, current);
5788 }
5789 }
5790 return current;
5791}
5792
5793
5794static void AddClass(const int* elmv,
5795 int elmc,
5796 ZoneList<CharacterRange>* ranges,
5797 Zone* zone) {
5798 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005799 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005800 for (int i = 0; i < elmc; i += 2) {
5801 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005802 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005803 }
5804}
5805
5806
5807static void AddClassNegated(const int *elmv,
5808 int elmc,
5809 ZoneList<CharacterRange>* ranges,
5810 Zone* zone) {
5811 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005812 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005813 DCHECK(elmv[0] != 0x0000);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005814 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005815 uc16 last = 0x0000;
5816 for (int i = 0; i < elmc; i += 2) {
5817 DCHECK(last <= elmv[i] - 1);
5818 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005819 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005820 last = elmv[i + 1];
5821 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005822 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005823}
5824
5825
5826void CharacterRange::AddClassEscape(uc16 type,
5827 ZoneList<CharacterRange>* ranges,
5828 Zone* zone) {
5829 switch (type) {
5830 case 's':
5831 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5832 break;
5833 case 'S':
5834 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5835 break;
5836 case 'w':
5837 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5838 break;
5839 case 'W':
5840 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5841 break;
5842 case 'd':
5843 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5844 break;
5845 case 'D':
5846 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5847 break;
5848 case '.':
5849 AddClassNegated(kLineTerminatorRanges,
5850 kLineTerminatorRangeCount,
5851 ranges,
5852 zone);
5853 break;
5854 // This is not a character range as defined by the spec but a
5855 // convenient shorthand for a character class that matches any
5856 // character.
5857 case '*':
5858 ranges->Add(CharacterRange::Everything(), zone);
5859 break;
5860 // This is the set of characters matched by the $ and ^ symbols
5861 // in multiline mode.
5862 case 'n':
5863 AddClass(kLineTerminatorRanges,
5864 kLineTerminatorRangeCount,
5865 ranges,
5866 zone);
5867 break;
5868 default:
5869 UNREACHABLE();
5870 }
5871}
5872
5873
5874Vector<const int> CharacterRange::GetWordBounds() {
5875 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5876}
5877
5878
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005879void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5880 ZoneList<CharacterRange>* ranges,
5881 bool is_one_byte) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005882 int range_count = ranges->length();
5883 for (int i = 0; i < range_count; i++) {
5884 CharacterRange range = ranges->at(i);
5885 uc32 bottom = range.from();
5886 if (bottom > String::kMaxUtf16CodeUnit) return;
5887 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5888 // Nothing to be done for surrogates.
5889 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5890 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5891 if (bottom > String::kMaxOneByteCharCode) return;
5892 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005893 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005894 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5895 if (top == bottom) {
5896 // If this is a singleton we just expand the one character.
5897 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005898 for (int i = 0; i < length; i++) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005899 uc32 chr = chars[i];
5900 if (chr != bottom) {
5901 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005902 }
5903 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005904 } else {
5905 // If this is a range we expand the characters block by block, expanding
5906 // contiguous subranges (blocks) one at a time. The approach is as
5907 // follows. For a given start character we look up the remainder of the
5908 // block that contains it (represented by the end point), for instance we
5909 // find 'z' if the character is 'c'. A block is characterized by the
5910 // property that all characters uncanonicalize in the same way, except
5911 // that each entry in the result is incremented by the distance from the
5912 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5913 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5914 // we've found the end point we look up its uncanonicalization and
5915 // produce a range for each element. For instance for [c-f] we look up
5916 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5917 // it is not already contained in the input, so [c-f] will be skipped but
5918 // [C-F] will be added. If this range is not completely contained in a
5919 // block we do this for all the blocks covered by the range (handling
5920 // characters that is not in a block as a "singleton block").
5921 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5922 int pos = bottom;
5923 while (pos <= top) {
5924 int length =
5925 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5926 uc32 block_end;
5927 if (length == 0) {
5928 block_end = pos;
5929 } else {
5930 DCHECK_EQ(1, length);
5931 block_end = equivalents[0];
5932 }
5933 int end = (block_end > top) ? top : block_end;
5934 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5935 equivalents);
5936 for (int i = 0; i < length; i++) {
5937 uc32 c = equivalents[i];
5938 uc32 range_from = c - (block_end - pos);
5939 uc32 range_to = c - (block_end - end);
5940 if (!(bottom <= range_from && range_to <= top)) {
5941 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5942 }
5943 }
5944 pos = end + 1;
5945 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005946 }
5947 }
5948}
5949
5950
5951bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5952 DCHECK_NOT_NULL(ranges);
5953 int n = ranges->length();
5954 if (n <= 1) return true;
5955 int max = ranges->at(0).to();
5956 for (int i = 1; i < n; i++) {
5957 CharacterRange next_range = ranges->at(i);
5958 if (next_range.from() <= max + 1) return false;
5959 max = next_range.to();
5960 }
5961 return true;
5962}
5963
5964
5965ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5966 if (ranges_ == NULL) {
5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5969 }
5970 return ranges_;
5971}
5972
5973
5974// Move a number of elements in a zonelist to another position
5975// in the same list. Handles overlapping source and target areas.
5976static void MoveRanges(ZoneList<CharacterRange>* list,
5977 int from,
5978 int to,
5979 int count) {
5980 // Ranges are potentially overlapping.
5981 if (from < to) {
5982 for (int i = count - 1; i >= 0; i--) {
5983 list->at(to + i) = list->at(from + i);
5984 }
5985 } else {
5986 for (int i = 0; i < count; i++) {
5987 list->at(to + i) = list->at(from + i);
5988 }
5989 }
5990}
5991
5992
5993static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5994 int count,
5995 CharacterRange insert) {
5996 // Inserts a range into list[0..count[, which must be sorted
5997 // by from value and non-overlapping and non-adjacent, using at most
5998 // list[0..count] for the result. Returns the number of resulting
5999 // canonicalized ranges. Inserting a range may collapse existing ranges into
6000 // fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch097c5b22016-05-18 11:27:45 +01006001 uc32 from = insert.from();
6002 uc32 to = insert.to();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006003 int start_pos = 0;
6004 int end_pos = count;
6005 for (int i = count - 1; i >= 0; i--) {
6006 CharacterRange current = list->at(i);
6007 if (current.from() > to + 1) {
6008 end_pos = i;
6009 } else if (current.to() + 1 < from) {
6010 start_pos = i + 1;
6011 break;
6012 }
6013 }
6014
6015 // Inserted range overlaps, or is adjacent to, ranges at positions
6016 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6017 // not affected by the insertion.
6018 // If start_pos == end_pos, the range must be inserted before start_pos.
6019 // if start_pos < end_pos, the entire range from start_pos to end_pos
6020 // must be merged with the insert range.
6021
6022 if (start_pos == end_pos) {
6023 // Insert between existing ranges at position start_pos.
6024 if (start_pos < count) {
6025 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6026 }
6027 list->at(start_pos) = insert;
6028 return count + 1;
6029 }
6030 if (start_pos + 1 == end_pos) {
6031 // Replace single existing range at position start_pos.
6032 CharacterRange to_replace = list->at(start_pos);
6033 int new_from = Min(to_replace.from(), from);
6034 int new_to = Max(to_replace.to(), to);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006035 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006036 return count;
6037 }
6038 // Replace a number of existing ranges from start_pos to end_pos - 1.
6039 // Move the remaining ranges down.
6040
6041 int new_from = Min(list->at(start_pos).from(), from);
6042 int new_to = Max(list->at(end_pos - 1).to(), to);
6043 if (end_pos < count) {
6044 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6045 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006046 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006047 return count - (end_pos - start_pos) + 1;
6048}
6049
6050
6051void CharacterSet::Canonicalize() {
6052 // Special/default classes are always considered canonical. The result
6053 // of calling ranges() will be sorted.
6054 if (ranges_ == NULL) return;
6055 CharacterRange::Canonicalize(ranges_);
6056}
6057
6058
6059void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6060 if (character_ranges->length() <= 1) return;
6061 // Check whether ranges are already canonical (increasing, non-overlapping,
6062 // non-adjacent).
6063 int n = character_ranges->length();
6064 int max = character_ranges->at(0).to();
6065 int i = 1;
6066 while (i < n) {
6067 CharacterRange current = character_ranges->at(i);
6068 if (current.from() <= max + 1) {
6069 break;
6070 }
6071 max = current.to();
6072 i++;
6073 }
6074 // Canonical until the i'th range. If that's all of them, we are done.
6075 if (i == n) return;
6076
6077 // The ranges at index i and forward are not canonicalized. Make them so by
6078 // doing the equivalent of insertion sort (inserting each into the previous
6079 // list, in order).
6080 // Notice that inserting a range can reduce the number of ranges in the
6081 // result due to combining of adjacent and overlapping ranges.
6082 int read = i; // Range to insert.
6083 int num_canonical = i; // Length of canonicalized part of list.
6084 do {
6085 num_canonical = InsertRangeInCanonicalList(character_ranges,
6086 num_canonical,
6087 character_ranges->at(read));
6088 read++;
6089 } while (read < n);
6090 character_ranges->Rewind(num_canonical);
6091
6092 DCHECK(CharacterRange::IsCanonical(character_ranges));
6093}
6094
6095
6096void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6097 ZoneList<CharacterRange>* negated_ranges,
6098 Zone* zone) {
6099 DCHECK(CharacterRange::IsCanonical(ranges));
6100 DCHECK_EQ(0, negated_ranges->length());
6101 int range_count = ranges->length();
Ben Murdoch097c5b22016-05-18 11:27:45 +01006102 uc32 from = 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006103 int i = 0;
6104 if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006105 from = ranges->at(0).to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006106 i = 1;
6107 }
6108 while (i < range_count) {
6109 CharacterRange range = ranges->at(i);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006110 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6111 from = range.to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006112 i++;
6113 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006114 if (from < String::kMaxCodePoint) {
6115 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006116 zone);
6117 }
6118}
6119
6120
6121// -------------------------------------------------------------------
6122// Splay tree
6123
6124
6125OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6126 if (Get(value))
6127 return this;
6128 if (successors(zone) != NULL) {
6129 for (int i = 0; i < successors(zone)->length(); i++) {
6130 OutSet* successor = successors(zone)->at(i);
6131 if (successor->Get(value))
6132 return successor;
6133 }
6134 } else {
6135 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6136 }
6137 OutSet* result = new(zone) OutSet(first_, remaining_);
6138 result->Set(value, zone);
6139 successors(zone)->Add(result, zone);
6140 return result;
6141}
6142
6143
6144void OutSet::Set(unsigned value, Zone *zone) {
6145 if (value < kFirstLimit) {
6146 first_ |= (1 << value);
6147 } else {
6148 if (remaining_ == NULL)
6149 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6150 if (remaining_->is_empty() || !remaining_->Contains(value))
6151 remaining_->Add(value, zone);
6152 }
6153}
6154
6155
6156bool OutSet::Get(unsigned value) const {
6157 if (value < kFirstLimit) {
6158 return (first_ & (1 << value)) != 0;
6159 } else if (remaining_ == NULL) {
6160 return false;
6161 } else {
6162 return remaining_->Contains(value);
6163 }
6164}
6165
6166
Ben Murdoch097c5b22016-05-18 11:27:45 +01006167const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006168
6169
6170void DispatchTable::AddRange(CharacterRange full_range, int value,
6171 Zone* zone) {
6172 CharacterRange current = full_range;
6173 if (tree()->is_empty()) {
6174 // If this is the first range we just insert into the table.
6175 ZoneSplayTree<Config>::Locator loc;
6176 bool inserted = tree()->Insert(current.from(), &loc);
6177 DCHECK(inserted);
6178 USE(inserted);
6179 loc.set_value(Entry(current.from(), current.to(),
6180 empty()->Extend(value, zone)));
6181 return;
6182 }
6183 // First see if there is a range to the left of this one that
6184 // overlaps.
6185 ZoneSplayTree<Config>::Locator loc;
6186 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6187 Entry* entry = &loc.value();
6188 // If we've found a range that overlaps with this one, and it
6189 // starts strictly to the left of this one, we have to fix it
6190 // because the following code only handles ranges that start on
6191 // or after the start point of the range we're adding.
6192 if (entry->from() < current.from() && entry->to() >= current.from()) {
6193 // Snap the overlapping range in half around the start point of
6194 // the range we're adding.
Ben Murdoch097c5b22016-05-18 11:27:45 +01006195 CharacterRange left =
6196 CharacterRange::Range(entry->from(), current.from() - 1);
6197 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006198 // The left part of the overlapping range doesn't overlap.
6199 // Truncate the whole entry to be just the left part.
6200 entry->set_to(left.to());
6201 // The right part is the one that overlaps. We add this part
6202 // to the map and let the next step deal with merging it with
6203 // the range we're adding.
6204 ZoneSplayTree<Config>::Locator loc;
6205 bool inserted = tree()->Insert(right.from(), &loc);
6206 DCHECK(inserted);
6207 USE(inserted);
6208 loc.set_value(Entry(right.from(),
6209 right.to(),
6210 entry->out_set()));
6211 }
6212 }
6213 while (current.is_valid()) {
6214 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6215 (loc.value().from() <= current.to()) &&
6216 (loc.value().to() >= current.from())) {
6217 Entry* entry = &loc.value();
6218 // We have overlap. If there is space between the start point of
6219 // the range we're adding and where the overlapping range starts
6220 // then we have to add a range covering just that space.
6221 if (current.from() < entry->from()) {
6222 ZoneSplayTree<Config>::Locator ins;
6223 bool inserted = tree()->Insert(current.from(), &ins);
6224 DCHECK(inserted);
6225 USE(inserted);
6226 ins.set_value(Entry(current.from(),
6227 entry->from() - 1,
6228 empty()->Extend(value, zone)));
6229 current.set_from(entry->from());
6230 }
6231 DCHECK_EQ(current.from(), entry->from());
6232 // If the overlapping range extends beyond the one we want to add
6233 // we have to snap the right part off and add it separately.
6234 if (entry->to() > current.to()) {
6235 ZoneSplayTree<Config>::Locator ins;
6236 bool inserted = tree()->Insert(current.to() + 1, &ins);
6237 DCHECK(inserted);
6238 USE(inserted);
6239 ins.set_value(Entry(current.to() + 1,
6240 entry->to(),
6241 entry->out_set()));
6242 entry->set_to(current.to());
6243 }
6244 DCHECK(entry->to() <= current.to());
6245 // The overlapping range is now completely contained by the range
6246 // we're adding so we can just update it and move the start point
6247 // of the range we're adding just past it.
6248 entry->AddValue(value, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006249 DCHECK(entry->to() + 1 > current.from());
6250 current.set_from(entry->to() + 1);
6251 } else {
6252 // There is no overlap so we can just add the range
6253 ZoneSplayTree<Config>::Locator ins;
6254 bool inserted = tree()->Insert(current.from(), &ins);
6255 DCHECK(inserted);
6256 USE(inserted);
6257 ins.set_value(Entry(current.from(),
6258 current.to(),
6259 empty()->Extend(value, zone)));
6260 break;
6261 }
6262 }
6263}
6264
6265
Ben Murdoch097c5b22016-05-18 11:27:45 +01006266OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006267 ZoneSplayTree<Config>::Locator loc;
6268 if (!tree()->FindGreatestLessThan(value, &loc))
6269 return empty();
6270 Entry* entry = &loc.value();
6271 if (value <= entry->to())
6272 return entry->out_set();
6273 else
6274 return empty();
6275}
6276
6277
6278// -------------------------------------------------------------------
6279// Analysis
6280
6281
6282void Analysis::EnsureAnalyzed(RegExpNode* that) {
6283 StackLimitCheck check(isolate());
6284 if (check.HasOverflowed()) {
6285 fail("Stack overflow");
6286 return;
6287 }
6288 if (that->info()->been_analyzed || that->info()->being_analyzed)
6289 return;
6290 that->info()->being_analyzed = true;
6291 that->Accept(this);
6292 that->info()->being_analyzed = false;
6293 that->info()->been_analyzed = true;
6294}
6295
6296
6297void Analysis::VisitEnd(EndNode* that) {
6298 // nothing to do
6299}
6300
6301
6302void TextNode::CalculateOffsets() {
6303 int element_count = elements()->length();
6304 // Set up the offsets of the elements relative to the start. This is a fixed
6305 // quantity since a TextNode can only contain fixed-width things.
6306 int cp_offset = 0;
6307 for (int i = 0; i < element_count; i++) {
6308 TextElement& elm = elements()->at(i);
6309 elm.set_cp_offset(cp_offset);
6310 cp_offset += elm.length();
6311 }
6312}
6313
6314
6315void Analysis::VisitText(TextNode* that) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006316 if (ignore_case()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006317 that->MakeCaseIndependent(isolate(), is_one_byte_);
6318 }
6319 EnsureAnalyzed(that->on_success());
6320 if (!has_failed()) {
6321 that->CalculateOffsets();
6322 }
6323}
6324
6325
6326void Analysis::VisitAction(ActionNode* that) {
6327 RegExpNode* target = that->on_success();
6328 EnsureAnalyzed(target);
6329 if (!has_failed()) {
6330 // If the next node is interested in what it follows then this node
6331 // has to be interested too so it can pass the information on.
6332 that->info()->AddFromFollowing(target->info());
6333 }
6334}
6335
6336
6337void Analysis::VisitChoice(ChoiceNode* that) {
6338 NodeInfo* info = that->info();
6339 for (int i = 0; i < that->alternatives()->length(); i++) {
6340 RegExpNode* node = that->alternatives()->at(i).node();
6341 EnsureAnalyzed(node);
6342 if (has_failed()) return;
6343 // Anything the following nodes need to know has to be known by
6344 // this node also, so it can pass it on.
6345 info->AddFromFollowing(node->info());
6346 }
6347}
6348
6349
6350void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6351 NodeInfo* info = that->info();
6352 for (int i = 0; i < that->alternatives()->length(); i++) {
6353 RegExpNode* node = that->alternatives()->at(i).node();
6354 if (node != that->loop_node()) {
6355 EnsureAnalyzed(node);
6356 if (has_failed()) return;
6357 info->AddFromFollowing(node->info());
6358 }
6359 }
6360 // Check the loop last since it may need the value of this node
6361 // to get a correct result.
6362 EnsureAnalyzed(that->loop_node());
6363 if (!has_failed()) {
6364 info->AddFromFollowing(that->loop_node()->info());
6365 }
6366}
6367
6368
6369void Analysis::VisitBackReference(BackReferenceNode* that) {
6370 EnsureAnalyzed(that->on_success());
6371}
6372
6373
6374void Analysis::VisitAssertion(AssertionNode* that) {
6375 EnsureAnalyzed(that->on_success());
6376}
6377
6378
6379void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6380 BoyerMooreLookahead* bm,
6381 bool not_at_start) {
6382 // Working out the set of characters that a backreference can match is too
6383 // hard, so we just say that any character can match.
6384 bm->SetRest(offset);
6385 SaveBMInfo(bm, not_at_start, offset);
6386}
6387
6388
6389STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6390 RegExpMacroAssembler::kTableSize);
6391
6392
6393void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6394 BoyerMooreLookahead* bm, bool not_at_start) {
6395 ZoneList<GuardedAlternative>* alts = alternatives();
6396 budget = (budget - 1) / alts->length();
6397 for (int i = 0; i < alts->length(); i++) {
6398 GuardedAlternative& alt = alts->at(i);
6399 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6400 bm->SetRest(offset); // Give up trying to fill in info.
6401 SaveBMInfo(bm, not_at_start, offset);
6402 return;
6403 }
6404 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6405 }
6406 SaveBMInfo(bm, not_at_start, offset);
6407}
6408
6409
6410void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6411 BoyerMooreLookahead* bm, bool not_at_start) {
6412 if (initial_offset >= bm->length()) return;
6413 int offset = initial_offset;
6414 int max_char = bm->max_char();
6415 for (int i = 0; i < elements()->length(); i++) {
6416 if (offset >= bm->length()) {
6417 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6418 return;
6419 }
6420 TextElement text = elements()->at(i);
6421 if (text.text_type() == TextElement::ATOM) {
6422 RegExpAtom* atom = text.atom();
6423 for (int j = 0; j < atom->length(); j++, offset++) {
6424 if (offset >= bm->length()) {
6425 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6426 return;
6427 }
6428 uc16 character = atom->data()[j];
6429 if (bm->compiler()->ignore_case()) {
6430 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6431 int length = GetCaseIndependentLetters(
6432 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6433 chars);
6434 for (int j = 0; j < length; j++) {
6435 bm->Set(offset, chars[j]);
6436 }
6437 } else {
6438 if (character <= max_char) bm->Set(offset, character);
6439 }
6440 }
6441 } else {
6442 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6443 RegExpCharacterClass* char_class = text.char_class();
6444 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6445 if (char_class->is_negated()) {
6446 bm->SetAll(offset);
6447 } else {
6448 for (int k = 0; k < ranges->length(); k++) {
6449 CharacterRange& range = ranges->at(k);
6450 if (range.from() > max_char) continue;
6451 int to = Min(max_char, static_cast<int>(range.to()));
6452 bm->SetInterval(offset, Interval(range.from(), to));
6453 }
6454 }
6455 offset++;
6456 }
6457 }
6458 if (offset >= bm->length()) {
6459 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6460 return;
6461 }
6462 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6463 true); // Not at start after a text node.
6464 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6465}
6466
6467
6468// -------------------------------------------------------------------
6469// Dispatch table construction
6470
6471
6472void DispatchTableConstructor::VisitEnd(EndNode* that) {
6473 AddRange(CharacterRange::Everything());
6474}
6475
6476
6477void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6478 node->set_being_calculated(true);
6479 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6480 for (int i = 0; i < alternatives->length(); i++) {
6481 set_choice_index(i);
6482 alternatives->at(i).node()->Accept(this);
6483 }
6484 node->set_being_calculated(false);
6485}
6486
6487
6488class AddDispatchRange {
6489 public:
6490 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6491 : constructor_(constructor) { }
6492 void Call(uc32 from, DispatchTable::Entry entry);
6493 private:
6494 DispatchTableConstructor* constructor_;
6495};
6496
6497
6498void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006499 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006500}
6501
6502
6503void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6504 if (node->being_calculated())
6505 return;
6506 DispatchTable* table = node->GetTable(ignore_case_);
6507 AddDispatchRange adder(this);
6508 table->ForEach(&adder);
6509}
6510
6511
6512void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6513 // TODO(160): Find the node that we refer back to and propagate its start
6514 // set back to here. For now we just accept anything.
6515 AddRange(CharacterRange::Everything());
6516}
6517
6518
6519void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6520 RegExpNode* target = that->on_success();
6521 target->Accept(this);
6522}
6523
6524
6525static int CompareRangeByFrom(const CharacterRange* a,
6526 const CharacterRange* b) {
6527 return Compare<uc16>(a->from(), b->from());
6528}
6529
6530
6531void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6532 ranges->Sort(CompareRangeByFrom);
6533 uc16 last = 0;
6534 for (int i = 0; i < ranges->length(); i++) {
6535 CharacterRange range = ranges->at(i);
6536 if (last < range.from())
Ben Murdoch097c5b22016-05-18 11:27:45 +01006537 AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006538 if (range.to() >= last) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006539 if (range.to() == String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006540 return;
6541 } else {
6542 last = range.to() + 1;
6543 }
6544 }
6545 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006546 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006547}
6548
6549
6550void DispatchTableConstructor::VisitText(TextNode* that) {
6551 TextElement elm = that->elements()->at(0);
6552 switch (elm.text_type()) {
6553 case TextElement::ATOM: {
6554 uc16 c = elm.atom()->data()[0];
Ben Murdoch097c5b22016-05-18 11:27:45 +01006555 AddRange(CharacterRange::Range(c, c));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006556 break;
6557 }
6558 case TextElement::CHAR_CLASS: {
6559 RegExpCharacterClass* tree = elm.char_class();
6560 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6561 if (tree->is_negated()) {
6562 AddInverse(ranges);
6563 } else {
6564 for (int i = 0; i < ranges->length(); i++)
6565 AddRange(ranges->at(i));
6566 }
6567 break;
6568 }
6569 default: {
6570 UNIMPLEMENTED();
6571 }
6572 }
6573}
6574
6575
6576void DispatchTableConstructor::VisitAction(ActionNode* that) {
6577 RegExpNode* target = that->on_success();
6578 target->Accept(this);
6579}
6580
6581
Ben Murdoch097c5b22016-05-18 11:27:45 +01006582RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6583 RegExpNode* on_success) {
6584 // If the regexp matching starts within a surrogate pair, step back
6585 // to the lead surrogate and start matching from there.
6586 DCHECK(!compiler->read_backward());
6587 Zone* zone = compiler->zone();
6588 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6589 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6590 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6591 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6592
6593 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6594
6595 int stack_register = compiler->UnicodeLookaroundStackRegister();
6596 int position_register = compiler->UnicodeLookaroundPositionRegister();
6597 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6598 zone, lead_surrogates, true, on_success);
6599 RegExpLookaround::Builder builder(true, step_back, stack_register,
6600 position_register);
6601 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6602 zone, trail_surrogates, false, builder.on_match_success());
6603
6604 optional_step_back->AddAlternative(
6605 GuardedAlternative(builder.ForMatch(match_trail)));
6606 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6607
6608 return optional_step_back;
6609}
6610
6611
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006612RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch097c5b22016-05-18 11:27:45 +01006613 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6614 JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006615 Handle<String> sample_subject, bool is_one_byte) {
6616 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6617 return IrregexpRegExpTooBig(isolate);
6618 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006619 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6620 bool is_sticky = flags & JSRegExp::kSticky;
6621 bool is_global = flags & JSRegExp::kGlobal;
6622 bool is_unicode = flags & JSRegExp::kUnicode;
6623 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006624 is_one_byte);
6625
6626 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6627
6628 // Sample some characters from the middle of the string.
6629 static const int kSampleSize = 128;
6630
6631 sample_subject = String::Flatten(sample_subject);
6632 int chars_sampled = 0;
6633 int half_way = (sample_subject->length() - kSampleSize) / 2;
6634 for (int i = Max(0, half_way);
6635 i < sample_subject->length() && chars_sampled < kSampleSize;
6636 i++, chars_sampled++) {
6637 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6638 }
6639
6640 // Wrap the body of the regexp in capture #0.
6641 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6642 0,
6643 &compiler,
6644 compiler.accept());
6645 RegExpNode* node = captured_body;
6646 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6647 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6648 int max_length = data->tree->max_match();
6649 if (!is_start_anchored && !is_sticky) {
6650 // Add a .*? at the beginning, outside the body capture, unless
6651 // this expression is anchored at the beginning or sticky.
6652 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6653 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6654 &compiler, captured_body, data->contains_anchor);
6655
6656 if (data->contains_anchor) {
6657 // Unroll loop once, to take care of the case that might start
6658 // at the start of input.
6659 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6660 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6661 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6662 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6663 node = first_step_node;
6664 } else {
6665 node = loop_node;
6666 }
6667 }
6668 if (is_one_byte) {
6669 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6670 // Do it again to propagate the new nodes to places where they were not
6671 // put because they had not been calculated yet.
6672 if (node != NULL) {
6673 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6674 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006675 } else if (compiler.unicode() && (is_global || is_sticky)) {
6676 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006677 }
6678
6679 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6680 data->node = node;
Ben Murdoch097c5b22016-05-18 11:27:45 +01006681 Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006682 analysis.EnsureAnalyzed(node);
6683 if (analysis.has_failed()) {
6684 const char* error_message = analysis.error_message();
6685 return CompilationResult(isolate, error_message);
6686 }
6687
6688 // Create the correct assembler for the architecture.
6689#ifndef V8_INTERPRETED_REGEXP
6690 // Native regexp implementation.
6691
6692 NativeRegExpMacroAssembler::Mode mode =
6693 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6694 : NativeRegExpMacroAssembler::UC16;
6695
6696#if V8_TARGET_ARCH_IA32
6697 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6698 (data->capture_count + 1) * 2);
6699#elif V8_TARGET_ARCH_X64
6700 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6701 (data->capture_count + 1) * 2);
6702#elif V8_TARGET_ARCH_ARM
6703 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6704 (data->capture_count + 1) * 2);
6705#elif V8_TARGET_ARCH_ARM64
6706 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6707 (data->capture_count + 1) * 2);
Ben Murdochda12d292016-06-02 14:46:10 +01006708#elif V8_TARGET_ARCH_S390
6709 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6710 (data->capture_count + 1) * 2);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006711#elif V8_TARGET_ARCH_PPC
6712 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6713 (data->capture_count + 1) * 2);
6714#elif V8_TARGET_ARCH_MIPS
6715 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6716 (data->capture_count + 1) * 2);
6717#elif V8_TARGET_ARCH_MIPS64
6718 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6719 (data->capture_count + 1) * 2);
6720#elif V8_TARGET_ARCH_X87
6721 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6722 (data->capture_count + 1) * 2);
6723#else
6724#error "Unsupported architecture"
6725#endif
6726
6727#else // V8_INTERPRETED_REGEXP
6728 // Interpreted regexp implementation.
6729 EmbeddedVector<byte, 1024> codes;
6730 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6731#endif // V8_INTERPRETED_REGEXP
6732
6733 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6734
6735 // Inserted here, instead of in Assembler, because it depends on information
6736 // in the AST that isn't replicated in the Node structure.
6737 static const int kMaxBacksearchLimit = 1024;
6738 if (is_end_anchored &&
6739 !is_start_anchored &&
6740 max_length < kMaxBacksearchLimit) {
6741 macro_assembler.SetCurrentPositionFromEnd(max_length);
6742 }
6743
6744 if (is_global) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006745 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6746 if (data->tree->min_match() > 0) {
6747 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6748 } else if (is_unicode) {
6749 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6750 }
6751 macro_assembler.set_global_mode(mode);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006752 }
6753
6754 return compiler.Assemble(&macro_assembler,
6755 node,
6756 data->capture_count,
6757 pattern);
6758}
6759
6760
6761bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6762 Heap* heap = pattern->GetHeap();
6763 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6764 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6765 heap->isolate()->memory_allocator()->SizeExecutable() >
6766 RegExpImpl::kRegExpExecutableMemoryLimit) {
6767 too_much = true;
6768 }
6769 return too_much;
6770}
6771
6772
6773Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6774 Object* key_pattern,
6775 FixedArray** last_match_cache,
6776 ResultsCacheType type) {
6777 FixedArray* cache;
6778 if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6779 if (type == STRING_SPLIT_SUBSTRINGS) {
6780 DCHECK(key_pattern->IsString());
6781 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6782 cache = heap->string_split_cache();
6783 } else {
6784 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6785 DCHECK(key_pattern->IsFixedArray());
6786 cache = heap->regexp_multiple_cache();
6787 }
6788
6789 uint32_t hash = key_string->Hash();
6790 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6791 ~(kArrayEntriesPerCacheEntry - 1));
6792 if (cache->get(index + kStringOffset) != key_string ||
6793 cache->get(index + kPatternOffset) != key_pattern) {
6794 index =
6795 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6796 if (cache->get(index + kStringOffset) != key_string ||
6797 cache->get(index + kPatternOffset) != key_pattern) {
6798 return Smi::FromInt(0);
6799 }
6800 }
6801
6802 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6803 return cache->get(index + kArrayOffset);
6804}
6805
6806
6807void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6808 Handle<Object> key_pattern,
6809 Handle<FixedArray> value_array,
6810 Handle<FixedArray> last_match_cache,
6811 ResultsCacheType type) {
6812 Factory* factory = isolate->factory();
6813 Handle<FixedArray> cache;
6814 if (!key_string->IsInternalizedString()) return;
6815 if (type == STRING_SPLIT_SUBSTRINGS) {
6816 DCHECK(key_pattern->IsString());
6817 if (!key_pattern->IsInternalizedString()) return;
6818 cache = factory->string_split_cache();
6819 } else {
6820 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6821 DCHECK(key_pattern->IsFixedArray());
6822 cache = factory->regexp_multiple_cache();
6823 }
6824
6825 uint32_t hash = key_string->Hash();
6826 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6827 ~(kArrayEntriesPerCacheEntry - 1));
6828 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6829 cache->set(index + kStringOffset, *key_string);
6830 cache->set(index + kPatternOffset, *key_pattern);
6831 cache->set(index + kArrayOffset, *value_array);
6832 cache->set(index + kLastMatchOffset, *last_match_cache);
6833 } else {
6834 uint32_t index2 =
6835 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6836 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6837 cache->set(index2 + kStringOffset, *key_string);
6838 cache->set(index2 + kPatternOffset, *key_pattern);
6839 cache->set(index2 + kArrayOffset, *value_array);
6840 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6841 } else {
6842 cache->set(index2 + kStringOffset, Smi::FromInt(0));
6843 cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6844 cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6845 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6846 cache->set(index + kStringOffset, *key_string);
6847 cache->set(index + kPatternOffset, *key_pattern);
6848 cache->set(index + kArrayOffset, *value_array);
6849 cache->set(index + kLastMatchOffset, *last_match_cache);
6850 }
6851 }
6852 // If the array is a reasonably short list of substrings, convert it into a
6853 // list of internalized strings.
6854 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6855 for (int i = 0; i < value_array->length(); i++) {
6856 Handle<String> str(String::cast(value_array->get(i)), isolate);
6857 Handle<String> internalized_str = factory->InternalizeString(str);
6858 value_array->set(i, *internalized_str);
6859 }
6860 }
6861 // Convert backing store to a copy-on-write array.
6862 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6863}
6864
6865
6866void RegExpResultsCache::Clear(FixedArray* cache) {
6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6868 cache->set(i, Smi::FromInt(0));
6869 }
6870}
6871
6872} // namespace internal
6873} // namespace v8