blob: c3b670b51825cbfa3baf3b989525660a150b9920 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/jsregexp.h"
6
7#include "src/ast/ast.h"
8#include "src/base/platform/platform.h"
9#include "src/compilation-cache.h"
10#include "src/compiler.h"
11#include "src/execution.h"
12#include "src/factory.h"
13#include "src/isolate-inl.h"
14#include "src/messages.h"
15#include "src/ostreams.h"
16#include "src/regexp/interpreter-irregexp.h"
17#include "src/regexp/jsregexp-inl.h"
18#include "src/regexp/regexp-macro-assembler.h"
19#include "src/regexp/regexp-macro-assembler-irregexp.h"
20#include "src/regexp/regexp-macro-assembler-tracer.h"
21#include "src/regexp/regexp-parser.h"
22#include "src/regexp/regexp-stack.h"
23#include "src/runtime/runtime.h"
24#include "src/splay-tree-inl.h"
25#include "src/string-search.h"
26#include "src/unicode-decoder.h"
27
Ben Murdoch097c5b22016-05-18 11:27:45 +010028#ifdef V8_I18N_SUPPORT
29#include "unicode/uset.h"
30#include "unicode/utypes.h"
31#endif // V8_I18N_SUPPORT
32
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033#ifndef V8_INTERPRETED_REGEXP
34#if V8_TARGET_ARCH_IA32
35#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
36#elif V8_TARGET_ARCH_X64
37#include "src/regexp/x64/regexp-macro-assembler-x64.h"
38#elif V8_TARGET_ARCH_ARM64
39#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
40#elif V8_TARGET_ARCH_ARM
41#include "src/regexp/arm/regexp-macro-assembler-arm.h"
42#elif V8_TARGET_ARCH_PPC
43#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdochda12d292016-06-02 14:46:10 +010044#elif V8_TARGET_ARCH_S390
45#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000046#elif V8_TARGET_ARCH_MIPS
47#include "src/regexp/mips/regexp-macro-assembler-mips.h"
48#elif V8_TARGET_ARCH_MIPS64
49#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
50#elif V8_TARGET_ARCH_X87
51#include "src/regexp/x87/regexp-macro-assembler-x87.h"
52#else
53#error Unsupported target architecture.
54#endif
55#endif
56
57
58namespace v8 {
59namespace internal {
60
61MUST_USE_RESULT
62static inline MaybeHandle<Object> ThrowRegExpException(
63 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
64 Isolate* isolate = re->GetIsolate();
65 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
66 pattern, error_text),
67 Object);
68}
69
70
71inline void ThrowRegExpException(Handle<JSRegExp> re,
72 Handle<String> error_text) {
73 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
74}
75
76
77ContainedInLattice AddRange(ContainedInLattice containment,
78 const int* ranges,
79 int ranges_length,
80 Interval new_range) {
81 DCHECK((ranges_length & 1) == 1);
Ben Murdoch097c5b22016-05-18 11:27:45 +010082 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000083 if (containment == kLatticeUnknown) return containment;
84 bool inside = false;
85 int last = 0;
86 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
87 // Consider the range from last to ranges[i].
88 // We haven't got to the new range yet.
89 if (ranges[i] <= new_range.from()) continue;
90 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
91 // inclusive, but the values in ranges are not.
92 if (last <= new_range.from() && new_range.to() < ranges[i]) {
93 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
94 }
95 return kLatticeUnknown;
96 }
97 return containment;
98}
99
100
101// More makes code generation slower, less makes V8 benchmark score lower.
102const int kMaxLookaheadForBoyerMoore = 8;
103// In a 3-character pattern you can maximally step forwards 3 characters
104// at a time, which is not always enough to pay for the extra logic.
105const int kPatternTooShortForBoyerMoore = 2;
106
107
108// Identifies the sort of regexps where the regexp engine is faster
109// than the code used for atom matches.
110static bool HasFewDifferentCharacters(Handle<String> pattern) {
111 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112 if (length <= kPatternTooShortForBoyerMoore) return false;
113 const int kMod = 128;
114 bool character_found[kMod];
115 int different = 0;
116 memset(&character_found[0], 0, sizeof(character_found));
117 for (int i = 0; i < length; i++) {
118 int ch = (pattern->Get(i) & (kMod - 1));
119 if (!character_found[ch]) {
120 character_found[ch] = true;
121 different++;
122 // We declare a regexp low-alphabet if it has at least 3 times as many
123 // characters as it has different characters.
124 if (different * 3 > length) return false;
125 }
126 }
127 return true;
128}
129
130
131// Generic RegExp methods. Dispatches to implementation specific methods.
132
133
134MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
135 Handle<String> pattern,
136 JSRegExp::Flags flags) {
137 Isolate* isolate = re->GetIsolate();
Ben Murdochda12d292016-06-02 14:46:10 +0100138 Zone zone(isolate->allocator());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000139 CompilationCache* compilation_cache = isolate->compilation_cache();
140 MaybeHandle<FixedArray> maybe_cached =
141 compilation_cache->LookupRegExp(pattern, flags);
142 Handle<FixedArray> cached;
143 bool in_cache = maybe_cached.ToHandle(&cached);
144 LOG(isolate, RegExpCompileEvent(re, in_cache));
145
146 Handle<Object> result;
147 if (in_cache) {
148 re->set_data(*cached);
149 return re;
150 }
151 pattern = String::Flatten(pattern);
152 PostponeInterruptsScope postpone(isolate);
153 RegExpCompileData parse_result;
154 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100155 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
156 &parse_result)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000157 // Throw an exception if we fail to parse the pattern.
158 return ThrowRegExpException(re, pattern, parse_result.error);
159 }
160
161 bool has_been_compiled = false;
162
163 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
164 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
165 // Parse-tree is a single atom that is equal to the pattern.
166 AtomCompile(re, pattern, flags, pattern);
167 has_been_compiled = true;
168 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
169 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
170 RegExpAtom* atom = parse_result.tree->AsAtom();
171 Vector<const uc16> atom_pattern = atom->data();
172 Handle<String> atom_string;
173 ASSIGN_RETURN_ON_EXCEPTION(
174 isolate, atom_string,
175 isolate->factory()->NewStringFromTwoByte(atom_pattern),
176 Object);
177 if (!HasFewDifferentCharacters(atom_string)) {
178 AtomCompile(re, pattern, flags, atom_string);
179 has_been_compiled = true;
180 }
181 }
182 if (!has_been_compiled) {
183 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
184 }
185 DCHECK(re->data()->IsFixedArray());
186 // Compilation succeeded so the data is set on the regexp
187 // and we can store it in the cache.
188 Handle<FixedArray> data(FixedArray::cast(re->data()));
189 compilation_cache->PutRegExp(pattern, flags, data);
190
191 return re;
192}
193
194
195MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
196 Handle<String> subject,
197 int index,
198 Handle<JSArray> last_match_info) {
199 switch (regexp->TypeTag()) {
200 case JSRegExp::ATOM:
201 return AtomExec(regexp, subject, index, last_match_info);
202 case JSRegExp::IRREGEXP: {
203 return IrregexpExec(regexp, subject, index, last_match_info);
204 }
205 default:
206 UNREACHABLE();
207 return MaybeHandle<Object>();
208 }
209}
210
211
212// RegExp Atom implementation: Simple string search using indexOf.
213
214
215void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
216 Handle<String> pattern,
217 JSRegExp::Flags flags,
218 Handle<String> match_pattern) {
219 re->GetIsolate()->factory()->SetRegExpAtomData(re,
220 JSRegExp::ATOM,
221 pattern,
222 flags,
223 match_pattern);
224}
225
226
227static void SetAtomLastCapture(FixedArray* array,
228 String* subject,
229 int from,
230 int to) {
231 SealHandleScope shs(array->GetIsolate());
232 RegExpImpl::SetLastCaptureCount(array, 2);
233 RegExpImpl::SetLastSubject(array, subject);
234 RegExpImpl::SetLastInput(array, subject);
235 RegExpImpl::SetCapture(array, 0, from);
236 RegExpImpl::SetCapture(array, 1, to);
237}
238
239
240int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
241 Handle<String> subject,
242 int index,
243 int32_t* output,
244 int output_size) {
245 Isolate* isolate = regexp->GetIsolate();
246
247 DCHECK(0 <= index);
248 DCHECK(index <= subject->length());
249
250 subject = String::Flatten(subject);
251 DisallowHeapAllocation no_gc; // ensure vectors stay valid
252
253 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
254 int needle_len = needle->length();
255 DCHECK(needle->IsFlat());
256 DCHECK_LT(0, needle_len);
257
258 if (index + needle_len > subject->length()) {
259 return RegExpImpl::RE_FAILURE;
260 }
261
262 for (int i = 0; i < output_size; i += 2) {
263 String::FlatContent needle_content = needle->GetFlatContent();
264 String::FlatContent subject_content = subject->GetFlatContent();
265 DCHECK(needle_content.IsFlat());
266 DCHECK(subject_content.IsFlat());
267 // dispatch on type of strings
268 index =
269 (needle_content.IsOneByte()
270 ? (subject_content.IsOneByte()
271 ? SearchString(isolate, subject_content.ToOneByteVector(),
272 needle_content.ToOneByteVector(), index)
273 : SearchString(isolate, subject_content.ToUC16Vector(),
274 needle_content.ToOneByteVector(), index))
275 : (subject_content.IsOneByte()
276 ? SearchString(isolate, subject_content.ToOneByteVector(),
277 needle_content.ToUC16Vector(), index)
278 : SearchString(isolate, subject_content.ToUC16Vector(),
279 needle_content.ToUC16Vector(), index)));
280 if (index == -1) {
281 return i / 2; // Return number of matches.
282 } else {
283 output[i] = index;
284 output[i+1] = index + needle_len;
285 index += needle_len;
286 }
287 }
288 return output_size / 2;
289}
290
291
292Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
293 Handle<String> subject,
294 int index,
295 Handle<JSArray> last_match_info) {
296 Isolate* isolate = re->GetIsolate();
297
298 static const int kNumRegisters = 2;
299 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
300 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
301
302 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
303
304 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
305
306 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
307 SealHandleScope shs(isolate);
308 FixedArray* array = FixedArray::cast(last_match_info->elements());
309 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
310 return last_match_info;
311}
312
313
314// Irregexp implementation.
315
316// Ensures that the regexp object contains a compiled version of the
317// source for either one-byte or two-byte subject strings.
318// If the compiled version doesn't already exist, it is compiled
319// from the source pattern.
320// If compilation fails, an exception is thrown and this function
321// returns false.
322bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
323 Handle<String> sample_subject,
324 bool is_one_byte) {
325 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
326#ifdef V8_INTERPRETED_REGEXP
327 if (compiled_code->IsByteArray()) return true;
328#else // V8_INTERPRETED_REGEXP (RegExp native code)
329 if (compiled_code->IsCode()) return true;
330#endif
331 // We could potentially have marked this as flushable, but have kept
332 // a saved version if we did not flush it yet.
333 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
334 if (saved_code->IsCode()) {
335 // Reinstate the code in the original place.
336 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
337 DCHECK(compiled_code->IsSmi());
338 return true;
339 }
340 return CompileIrregexp(re, sample_subject, is_one_byte);
341}
342
343
344bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
345 Handle<String> sample_subject,
346 bool is_one_byte) {
347 // Compile the RegExp.
348 Isolate* isolate = re->GetIsolate();
Ben Murdochda12d292016-06-02 14:46:10 +0100349 Zone zone(isolate->allocator());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000350 PostponeInterruptsScope postpone(isolate);
351 // If we had a compilation error the last time this is saved at the
352 // saved code index.
353 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
354 // When arriving here entry can only be a smi, either representing an
355 // uncompiled regexp, a previous compilation error, or code that has
356 // been flushed.
357 DCHECK(entry->IsSmi());
358 int entry_value = Smi::cast(entry)->value();
359 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
360 entry_value == JSRegExp::kCompilationErrorValue ||
361 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
362
363 if (entry_value == JSRegExp::kCompilationErrorValue) {
364 // A previous compilation failed and threw an error which we store in
365 // the saved code index (we store the error message, not the actual
366 // error). Recreate the error object and throw it.
367 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
368 DCHECK(error_string->IsString());
369 Handle<String> error_message(String::cast(error_string));
370 ThrowRegExpException(re, error_message);
371 return false;
372 }
373
374 JSRegExp::Flags flags = re->GetFlags();
375
376 Handle<String> pattern(re->Pattern());
377 pattern = String::Flatten(pattern);
378 RegExpCompileData compile_data;
379 FlatStringReader reader(isolate, pattern);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100380 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
381 &compile_data)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000382 // Throw an exception if we fail to parse the pattern.
383 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
384 USE(ThrowRegExpException(re, pattern, compile_data.error));
385 return false;
386 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100387 RegExpEngine::CompilationResult result =
388 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
389 sample_subject, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000390 if (result.error_message != NULL) {
391 // Unable to compile regexp.
392 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
393 CStrVector(result.error_message)).ToHandleChecked();
394 ThrowRegExpException(re, error_message);
395 return false;
396 }
397
398 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
399 data->set(JSRegExp::code_index(is_one_byte), result.code);
Ben Murdoch61f157c2016-09-16 13:49:30 +0100400 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000401 int register_max = IrregexpMaxRegisterCount(*data);
402 if (result.num_registers > register_max) {
403 SetIrregexpMaxRegisterCount(*data, result.num_registers);
404 }
405
406 return true;
407}
408
409
410int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
411 return Smi::cast(
412 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
413}
414
415
416void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
417 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
418}
419
Ben Murdoch61f157c2016-09-16 13:49:30 +0100420void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
421 Handle<FixedArray> value) {
422 if (value.is_null()) {
423 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
424 } else {
425 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
426 }
427}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000428
429int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
430 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
431}
432
433
434int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
435 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
436}
437
438
439ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
440 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
441}
442
443
444Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
445 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
446}
447
448
449void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
450 Handle<String> pattern,
451 JSRegExp::Flags flags,
452 int capture_count) {
453 // Initialize compiled code entries to null.
454 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
455 JSRegExp::IRREGEXP,
456 pattern,
457 flags,
458 capture_count);
459}
460
461
462int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
463 Handle<String> subject) {
464 subject = String::Flatten(subject);
465
466 // Check representation of the underlying storage.
467 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
468 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
469
470#ifdef V8_INTERPRETED_REGEXP
471 // Byte-code regexp needs space allocated for all its registers.
472 // The result captures are copied to the start of the registers array
473 // if the match succeeds. This way those registers are not clobbered
474 // when we set the last match info from last successful match.
475 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
476 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
477#else // V8_INTERPRETED_REGEXP
478 // Native regexp only needs room to output captures. Registers are handled
479 // internally.
480 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
481#endif // V8_INTERPRETED_REGEXP
482}
483
484
485int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
486 Handle<String> subject,
487 int index,
488 int32_t* output,
489 int output_size) {
490 Isolate* isolate = regexp->GetIsolate();
491
492 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
493
494 DCHECK(index >= 0);
495 DCHECK(index <= subject->length());
496 DCHECK(subject->IsFlat());
497
498 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
499
500#ifndef V8_INTERPRETED_REGEXP
501 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
502 do {
503 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
504 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
505 // The stack is used to allocate registers for the compiled regexp code.
506 // This means that in case of failure, the output registers array is left
507 // untouched and contains the capture results from the previous successful
508 // match. We can use that to set the last match info lazily.
509 NativeRegExpMacroAssembler::Result res =
510 NativeRegExpMacroAssembler::Match(code,
511 subject,
512 output,
513 output_size,
514 index,
515 isolate);
516 if (res != NativeRegExpMacroAssembler::RETRY) {
517 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
518 isolate->has_pending_exception());
519 STATIC_ASSERT(
520 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
521 STATIC_ASSERT(
522 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
523 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
524 == RE_EXCEPTION);
525 return static_cast<IrregexpResult>(res);
526 }
527 // If result is RETRY, the string has changed representation, and we
528 // must restart from scratch.
529 // In this case, it means we must make sure we are prepared to handle
530 // the, potentially, different subject (the string can switch between
531 // being internal and external, and even between being Latin1 and UC16,
532 // but the characters are always the same).
533 IrregexpPrepare(regexp, subject);
534 is_one_byte = subject->IsOneByteRepresentationUnderneath();
535 } while (true);
536 UNREACHABLE();
537 return RE_EXCEPTION;
538#else // V8_INTERPRETED_REGEXP
539
540 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
541 // We must have done EnsureCompiledIrregexp, so we can get the number of
542 // registers.
543 int number_of_capture_registers =
544 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
545 int32_t* raw_output = &output[number_of_capture_registers];
546 // We do not touch the actual capture result registers until we know there
547 // has been a match so that we can use those capture results to set the
548 // last match info.
549 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
550 raw_output[i] = -1;
551 }
552 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
553 isolate);
554
555 IrregexpResult result = IrregexpInterpreter::Match(isolate,
556 byte_codes,
557 subject,
558 raw_output,
559 index);
560 if (result == RE_SUCCESS) {
561 // Copy capture results to the start of the registers array.
562 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
563 }
564 if (result == RE_EXCEPTION) {
565 DCHECK(!isolate->has_pending_exception());
566 isolate->StackOverflow();
567 }
568 return result;
569#endif // V8_INTERPRETED_REGEXP
570}
571
572
573MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
574 Handle<String> subject,
575 int previous_index,
576 Handle<JSArray> last_match_info) {
577 Isolate* isolate = regexp->GetIsolate();
578 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
579
580 // Prepare space for the return values.
581#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
582 if (FLAG_trace_regexp_bytecodes) {
583 String* pattern = regexp->Pattern();
584 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
585 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
586 }
587#endif
588 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
589 if (required_registers < 0) {
590 // Compiling failed with an exception.
591 DCHECK(isolate->has_pending_exception());
592 return MaybeHandle<Object>();
593 }
594
595 int32_t* output_registers = NULL;
596 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
597 output_registers = NewArray<int32_t>(required_registers);
598 }
599 base::SmartArrayPointer<int32_t> auto_release(output_registers);
600 if (output_registers == NULL) {
601 output_registers = isolate->jsregexp_static_offsets_vector();
602 }
603
604 int res = RegExpImpl::IrregexpExecRaw(
605 regexp, subject, previous_index, output_registers, required_registers);
606 if (res == RE_SUCCESS) {
607 int capture_count =
608 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
609 return SetLastMatchInfo(
610 last_match_info, subject, capture_count, output_registers);
611 }
612 if (res == RE_EXCEPTION) {
613 DCHECK(isolate->has_pending_exception());
614 return MaybeHandle<Object>();
615 }
616 DCHECK(res == RE_FAILURE);
617 return isolate->factory()->null_value();
618}
619
620
621static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
622 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
623 JSArray::SetLength(array, minimum_size);
624 }
625}
626
627
628Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
629 Handle<String> subject,
630 int capture_count,
631 int32_t* match) {
632 DCHECK(last_match_info->HasFastObjectElements());
633 int capture_register_count = (capture_count + 1) * 2;
634 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
635 DisallowHeapAllocation no_allocation;
636 FixedArray* array = FixedArray::cast(last_match_info->elements());
637 if (match != NULL) {
638 for (int i = 0; i < capture_register_count; i += 2) {
639 SetCapture(array, i, match[i]);
640 SetCapture(array, i + 1, match[i + 1]);
641 }
642 }
643 SetLastCaptureCount(array, capture_register_count);
644 SetLastSubject(array, *subject);
645 SetLastInput(array, *subject);
646 return last_match_info;
647}
648
649
650RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
651 Handle<String> subject,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000652 Isolate* isolate)
653 : register_array_(NULL),
654 register_array_size_(0),
655 regexp_(regexp),
656 subject_(subject) {
657#ifdef V8_INTERPRETED_REGEXP
658 bool interpreted = true;
659#else
660 bool interpreted = false;
661#endif // V8_INTERPRETED_REGEXP
662
663 if (regexp_->TypeTag() == JSRegExp::ATOM) {
664 static const int kAtomRegistersPerMatch = 2;
665 registers_per_match_ = kAtomRegistersPerMatch;
666 // There is no distinction between interpreted and native for atom regexps.
667 interpreted = false;
668 } else {
669 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
670 if (registers_per_match_ < 0) {
671 num_matches_ = -1; // Signal exception.
672 return;
673 }
674 }
675
Ben Murdoch097c5b22016-05-18 11:27:45 +0100676 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
677 if (!interpreted) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000678 register_array_size_ =
679 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
680 max_matches_ = register_array_size_ / registers_per_match_;
681 } else {
682 // Global loop in interpreted regexp is not implemented. We choose
683 // the size of the offsets vector so that it can only store one match.
684 register_array_size_ = registers_per_match_;
685 max_matches_ = 1;
686 }
687
688 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
689 register_array_ = NewArray<int32_t>(register_array_size_);
690 } else {
691 register_array_ = isolate->jsregexp_static_offsets_vector();
692 }
693
694 // Set state so that fetching the results the first time triggers a call
695 // to the compiled regexp.
696 current_match_index_ = max_matches_ - 1;
697 num_matches_ = max_matches_;
698 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
699 DCHECK_GE(register_array_size_, registers_per_match_);
700 int32_t* last_match =
701 &register_array_[current_match_index_ * registers_per_match_];
702 last_match[0] = -1;
703 last_match[1] = 0;
704}
705
Ben Murdoch097c5b22016-05-18 11:27:45 +0100706int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
707 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
708 last_index + 1 < subject_->length() &&
709 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
710 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
711 // Advance over the surrogate pair.
712 return last_index + 2;
713 }
714 return last_index + 1;
715}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000716
717// -------------------------------------------------------------------
718// Implementation of the Irregexp regular expression engine.
719//
720// The Irregexp regular expression engine is intended to be a complete
721// implementation of ECMAScript regular expressions. It generates either
722// bytecodes or native code.
723
724// The Irregexp regexp engine is structured in three steps.
725// 1) The parser generates an abstract syntax tree. See ast.cc.
726// 2) From the AST a node network is created. The nodes are all
727// subclasses of RegExpNode. The nodes represent states when
728// executing a regular expression. Several optimizations are
729// performed on the node network.
730// 3) From the nodes we generate either byte codes or native code
731// that can actually execute the regular expression (perform
732// the search). The code generation step is described in more
733// detail below.
734
735// Code generation.
736//
737// The nodes are divided into four main categories.
738// * Choice nodes
739// These represent places where the regular expression can
740// match in more than one way. For example on entry to an
741// alternation (foo|bar) or a repetition (*, +, ? or {}).
742// * Action nodes
743// These represent places where some action should be
744// performed. Examples include recording the current position
745// in the input string to a register (in order to implement
746// captures) or other actions on register for example in order
747// to implement the counters needed for {} repetitions.
748// * Matching nodes
749// These attempt to match some element part of the input string.
750// Examples of elements include character classes, plain strings
751// or back references.
752// * End nodes
753// These are used to implement the actions required on finding
754// a successful match or failing to find a match.
755//
756// The code generated (whether as byte codes or native code) maintains
757// some state as it runs. This consists of the following elements:
758//
759// * The capture registers. Used for string captures.
760// * Other registers. Used for counters etc.
761// * The current position.
762// * The stack of backtracking information. Used when a matching node
763// fails to find a match and needs to try an alternative.
764//
765// Conceptual regular expression execution model:
766//
767// There is a simple conceptual model of regular expression execution
768// which will be presented first. The actual code generated is a more
769// efficient simulation of the simple conceptual model:
770//
771// * Choice nodes are implemented as follows:
772// For each choice except the last {
773// push current position
774// push backtrack code location
775// <generate code to test for choice>
776// backtrack code location:
777// pop current position
778// }
779// <generate code to test for last choice>
780//
781// * Actions nodes are generated as follows
782// <push affected registers on backtrack stack>
783// <generate code to perform action>
784// push backtrack code location
785// <generate code to test for following nodes>
786// backtrack code location:
787// <pop affected registers to restore their state>
788// <pop backtrack location from stack and go to it>
789//
790// * Matching nodes are generated as follows:
791// if input string matches at current position
792// update current position
793// <generate code to test for following nodes>
794// else
795// <pop backtrack location from stack and go to it>
796//
797// Thus it can be seen that the current position is saved and restored
798// by the choice nodes, whereas the registers are saved and restored by
799// by the action nodes that manipulate them.
800//
801// The other interesting aspect of this model is that nodes are generated
802// at the point where they are needed by a recursive call to Emit(). If
803// the node has already been code generated then the Emit() call will
804// generate a jump to the previously generated code instead. In order to
805// limit recursion it is possible for the Emit() function to put the node
806// on a work list for later generation and instead generate a jump. The
807// destination of the jump is resolved later when the code is generated.
808//
809// Actual regular expression code generation.
810//
811// Code generation is actually more complicated than the above. In order
812// to improve the efficiency of the generated code some optimizations are
813// performed
814//
815// * Choice nodes have 1-character lookahead.
816// A choice node looks at the following character and eliminates some of
817// the choices immediately based on that character. This is not yet
818// implemented.
819// * Simple greedy loops store reduced backtracking information.
820// A quantifier like /.*foo/m will greedily match the whole input. It will
821// then need to backtrack to a point where it can match "foo". The naive
822// implementation of this would push each character position onto the
823// backtracking stack, then pop them off one by one. This would use space
824// proportional to the length of the input string. However since the "."
825// can only match in one way and always has a constant length (in this case
826// of 1) it suffices to store the current position on the top of the stack
827// once. Matching now becomes merely incrementing the current position and
828// backtracking becomes decrementing the current position and checking the
829// result against the stored current position. This is faster and saves
830// space.
831// * The current state is virtualized.
832// This is used to defer expensive operations until it is clear that they
833// are needed and to generate code for a node more than once, allowing
834// specialized an efficient versions of the code to be created. This is
835// explained in the section below.
836//
837// Execution state virtualization.
838//
839// Instead of emitting code, nodes that manipulate the state can record their
840// manipulation in an object called the Trace. The Trace object can record a
841// current position offset, an optional backtrack code location on the top of
842// the virtualized backtrack stack and some register changes. When a node is
843// to be emitted it can flush the Trace or update it. Flushing the Trace
844// will emit code to bring the actual state into line with the virtual state.
845// Avoiding flushing the state can postpone some work (e.g. updates of capture
846// registers). Postponing work can save time when executing the regular
847// expression since it may be found that the work never has to be done as a
848// failure to match can occur. In addition it is much faster to jump to a
849// known backtrack code location than it is to pop an unknown backtrack
850// location from the stack and jump there.
851//
852// The virtual state found in the Trace affects code generation. For example
853// the virtual state contains the difference between the actual current
854// position and the virtual current position, and matching code needs to use
855// this offset to attempt a match in the correct location of the input
856// string. Therefore code generated for a non-trivial trace is specialized
857// to that trace. The code generator therefore has the ability to generate
858// code for each node several times. In order to limit the size of the
859// generated code there is an arbitrary limit on how many specialized sets of
860// code may be generated for a given node. If the limit is reached, the
861// trace is flushed and a generic version of the code for a node is emitted.
862// This is subsequently used for that node. The code emitted for non-generic
863// trace is not recorded in the node and so it cannot currently be reused in
864// the event that code generation is requested for an identical trace.
865
866
867void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
868 UNREACHABLE();
869}
870
871
872void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
873 text->AddElement(TextElement::Atom(this), zone);
874}
875
876
877void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
878 text->AddElement(TextElement::CharClass(this), zone);
879}
880
881
882void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
883 for (int i = 0; i < elements()->length(); i++)
884 text->AddElement(elements()->at(i), zone);
885}
886
887
888TextElement TextElement::Atom(RegExpAtom* atom) {
889 return TextElement(ATOM, atom);
890}
891
892
893TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
894 return TextElement(CHAR_CLASS, char_class);
895}
896
897
898int TextElement::length() const {
899 switch (text_type()) {
900 case ATOM:
901 return atom()->length();
902
903 case CHAR_CLASS:
904 return 1;
905 }
906 UNREACHABLE();
907 return 0;
908}
909
910
911DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
912 if (table_ == NULL) {
913 table_ = new(zone()) DispatchTable(zone());
914 DispatchTableConstructor cons(table_, ignore_case, zone());
915 cons.BuildTable(this);
916 }
917 return table_;
918}
919
920
921class FrequencyCollator {
922 public:
923 FrequencyCollator() : total_samples_(0) {
924 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
925 frequencies_[i] = CharacterFrequency(i);
926 }
927 }
928
929 void CountCharacter(int character) {
930 int index = (character & RegExpMacroAssembler::kTableMask);
931 frequencies_[index].Increment();
932 total_samples_++;
933 }
934
935 // Does not measure in percent, but rather per-128 (the table size from the
936 // regexp macro assembler).
937 int Frequency(int in_character) {
938 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
939 if (total_samples_ < 1) return 1; // Division by zero.
940 int freq_in_per128 =
941 (frequencies_[in_character].counter() * 128) / total_samples_;
942 return freq_in_per128;
943 }
944
945 private:
946 class CharacterFrequency {
947 public:
948 CharacterFrequency() : counter_(0), character_(-1) { }
949 explicit CharacterFrequency(int character)
950 : counter_(0), character_(character) { }
951
952 void Increment() { counter_++; }
953 int counter() { return counter_; }
954 int character() { return character_; }
955
956 private:
957 int counter_;
958 int character_;
959 };
960
961
962 private:
963 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
964 int total_samples_;
965};
966
967
968class RegExpCompiler {
969 public:
970 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100971 JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000972
973 int AllocateRegister() {
974 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
975 reg_exp_too_big_ = true;
976 return next_register_;
977 }
978 return next_register_++;
979 }
980
Ben Murdoch097c5b22016-05-18 11:27:45 +0100981 // Lookarounds to match lone surrogates for unicode character class matches
982 // are never nested. We can therefore reuse registers.
983 int UnicodeLookaroundStackRegister() {
984 if (unicode_lookaround_stack_register_ == kNoRegister) {
985 unicode_lookaround_stack_register_ = AllocateRegister();
986 }
987 return unicode_lookaround_stack_register_;
988 }
989
990 int UnicodeLookaroundPositionRegister() {
991 if (unicode_lookaround_position_register_ == kNoRegister) {
992 unicode_lookaround_position_register_ = AllocateRegister();
993 }
994 return unicode_lookaround_position_register_;
995 }
996
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000997 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
998 RegExpNode* start,
999 int capture_count,
1000 Handle<String> pattern);
1001
1002 inline void AddWork(RegExpNode* node) {
1003 if (!node->on_work_list() && !node->label()->is_bound()) {
1004 node->set_on_work_list(true);
1005 work_list_->Add(node);
1006 }
1007 }
1008
1009 static const int kImplementationOffset = 0;
1010 static const int kNumberOfRegistersOffset = 0;
1011 static const int kCodeOffset = 1;
1012
1013 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1014 EndNode* accept() { return accept_; }
1015
1016 static const int kMaxRecursion = 100;
1017 inline int recursion_depth() { return recursion_depth_; }
1018 inline void IncrementRecursionDepth() { recursion_depth_++; }
1019 inline void DecrementRecursionDepth() { recursion_depth_--; }
1020
1021 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1022
Ben Murdoch097c5b22016-05-18 11:27:45 +01001023 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1024 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001025 inline bool one_byte() { return one_byte_; }
1026 inline bool optimize() { return optimize_; }
1027 inline void set_optimize(bool value) { optimize_ = value; }
1028 inline bool limiting_recursion() { return limiting_recursion_; }
1029 inline void set_limiting_recursion(bool value) {
1030 limiting_recursion_ = value;
1031 }
1032 bool read_backward() { return read_backward_; }
1033 void set_read_backward(bool value) { read_backward_ = value; }
1034 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1035
1036 int current_expansion_factor() { return current_expansion_factor_; }
1037 void set_current_expansion_factor(int value) {
1038 current_expansion_factor_ = value;
1039 }
1040
1041 Isolate* isolate() const { return isolate_; }
1042 Zone* zone() const { return zone_; }
1043
1044 static const int kNoRegister = -1;
1045
1046 private:
1047 EndNode* accept_;
1048 int next_register_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001049 int unicode_lookaround_stack_register_;
1050 int unicode_lookaround_position_register_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001051 List<RegExpNode*>* work_list_;
1052 int recursion_depth_;
1053 RegExpMacroAssembler* macro_assembler_;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001054 JSRegExp::Flags flags_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001055 bool one_byte_;
1056 bool reg_exp_too_big_;
1057 bool limiting_recursion_;
1058 bool optimize_;
1059 bool read_backward_;
1060 int current_expansion_factor_;
1061 FrequencyCollator frequency_collator_;
1062 Isolate* isolate_;
1063 Zone* zone_;
1064};
1065
1066
1067class RecursionCheck {
1068 public:
1069 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1070 compiler->IncrementRecursionDepth();
1071 }
1072 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1073 private:
1074 RegExpCompiler* compiler_;
1075};
1076
1077
1078static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1079 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1080}
1081
1082
1083// Attempts to compile the regexp using an Irregexp code generator. Returns
1084// a fixed array or a null handle depending on whether it succeeded.
1085RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001086 JSRegExp::Flags flags, bool one_byte)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001087 : next_register_(2 * (capture_count + 1)),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001088 unicode_lookaround_stack_register_(kNoRegister),
1089 unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001090 work_list_(NULL),
1091 recursion_depth_(0),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001092 flags_(flags),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001093 one_byte_(one_byte),
1094 reg_exp_too_big_(false),
1095 limiting_recursion_(false),
1096 optimize_(FLAG_regexp_optimization),
1097 read_backward_(false),
1098 current_expansion_factor_(1),
1099 frequency_collator_(),
1100 isolate_(isolate),
1101 zone_(zone) {
1102 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1103 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1104}
1105
1106
1107RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1108 RegExpMacroAssembler* macro_assembler,
1109 RegExpNode* start,
1110 int capture_count,
1111 Handle<String> pattern) {
1112 Heap* heap = pattern->GetHeap();
1113
1114#ifdef DEBUG
1115 if (FLAG_trace_regexp_assembler)
1116 macro_assembler_ =
1117 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1118 else
1119#endif
1120 macro_assembler_ = macro_assembler;
1121
1122 List <RegExpNode*> work_list(0);
1123 work_list_ = &work_list;
1124 Label fail;
1125 macro_assembler_->PushBacktrack(&fail);
1126 Trace new_trace;
1127 start->Emit(this, &new_trace);
1128 macro_assembler_->Bind(&fail);
1129 macro_assembler_->Fail();
1130 while (!work_list.is_empty()) {
1131 RegExpNode* node = work_list.RemoveLast();
1132 node->set_on_work_list(false);
1133 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1134 }
1135 if (reg_exp_too_big_) {
1136 macro_assembler_->AbortedCodeGeneration();
1137 return IrregexpRegExpTooBig(isolate_);
1138 }
1139
1140 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1141 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1142 work_list_ = NULL;
1143#ifdef ENABLE_DISASSEMBLER
1144 if (FLAG_print_code) {
1145 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1146 OFStream os(trace_scope.file());
1147 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1148 }
1149#endif
1150#ifdef DEBUG
1151 if (FLAG_trace_regexp_assembler) {
1152 delete macro_assembler_;
1153 }
1154#endif
1155 return RegExpEngine::CompilationResult(*code, next_register_);
1156}
1157
1158
1159bool Trace::DeferredAction::Mentions(int that) {
1160 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1161 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1162 return range.Contains(that);
1163 } else {
1164 return reg() == that;
1165 }
1166}
1167
1168
1169bool Trace::mentions_reg(int reg) {
1170 for (DeferredAction* action = actions_;
1171 action != NULL;
1172 action = action->next()) {
1173 if (action->Mentions(reg))
1174 return true;
1175 }
1176 return false;
1177}
1178
1179
1180bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1181 DCHECK_EQ(0, *cp_offset);
1182 for (DeferredAction* action = actions_;
1183 action != NULL;
1184 action = action->next()) {
1185 if (action->Mentions(reg)) {
1186 if (action->action_type() == ActionNode::STORE_POSITION) {
1187 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1188 return true;
1189 } else {
1190 return false;
1191 }
1192 }
1193 }
1194 return false;
1195}
1196
1197
1198int Trace::FindAffectedRegisters(OutSet* affected_registers,
1199 Zone* zone) {
1200 int max_register = RegExpCompiler::kNoRegister;
1201 for (DeferredAction* action = actions_;
1202 action != NULL;
1203 action = action->next()) {
1204 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1205 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1206 for (int i = range.from(); i <= range.to(); i++)
1207 affected_registers->Set(i, zone);
1208 if (range.to() > max_register) max_register = range.to();
1209 } else {
1210 affected_registers->Set(action->reg(), zone);
1211 if (action->reg() > max_register) max_register = action->reg();
1212 }
1213 }
1214 return max_register;
1215}
1216
1217
1218void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1219 int max_register,
1220 const OutSet& registers_to_pop,
1221 const OutSet& registers_to_clear) {
1222 for (int reg = max_register; reg >= 0; reg--) {
1223 if (registers_to_pop.Get(reg)) {
1224 assembler->PopRegister(reg);
1225 } else if (registers_to_clear.Get(reg)) {
1226 int clear_to = reg;
1227 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1228 reg--;
1229 }
1230 assembler->ClearRegisters(reg, clear_to);
1231 }
1232 }
1233}
1234
1235
1236void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1237 int max_register,
1238 const OutSet& affected_registers,
1239 OutSet* registers_to_pop,
1240 OutSet* registers_to_clear,
1241 Zone* zone) {
1242 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1243 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1244
1245 // Count pushes performed to force a stack limit check occasionally.
1246 int pushes = 0;
1247
1248 for (int reg = 0; reg <= max_register; reg++) {
1249 if (!affected_registers.Get(reg)) {
1250 continue;
1251 }
1252
1253 // The chronologically first deferred action in the trace
1254 // is used to infer the action needed to restore a register
1255 // to its previous state (or not, if it's safe to ignore it).
1256 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1257 DeferredActionUndoType undo_action = IGNORE;
1258
1259 int value = 0;
1260 bool absolute = false;
1261 bool clear = false;
1262 static const int kNoStore = kMinInt;
1263 int store_position = kNoStore;
1264 // This is a little tricky because we are scanning the actions in reverse
1265 // historical order (newest first).
1266 for (DeferredAction* action = actions_;
1267 action != NULL;
1268 action = action->next()) {
1269 if (action->Mentions(reg)) {
1270 switch (action->action_type()) {
1271 case ActionNode::SET_REGISTER: {
1272 Trace::DeferredSetRegister* psr =
1273 static_cast<Trace::DeferredSetRegister*>(action);
1274 if (!absolute) {
1275 value += psr->value();
1276 absolute = true;
1277 }
1278 // SET_REGISTER is currently only used for newly introduced loop
1279 // counters. They can have a significant previous value if they
1280 // occour in a loop. TODO(lrn): Propagate this information, so
1281 // we can set undo_action to IGNORE if we know there is no value to
1282 // restore.
1283 undo_action = RESTORE;
1284 DCHECK_EQ(store_position, kNoStore);
1285 DCHECK(!clear);
1286 break;
1287 }
1288 case ActionNode::INCREMENT_REGISTER:
1289 if (!absolute) {
1290 value++;
1291 }
1292 DCHECK_EQ(store_position, kNoStore);
1293 DCHECK(!clear);
1294 undo_action = RESTORE;
1295 break;
1296 case ActionNode::STORE_POSITION: {
1297 Trace::DeferredCapture* pc =
1298 static_cast<Trace::DeferredCapture*>(action);
1299 if (!clear && store_position == kNoStore) {
1300 store_position = pc->cp_offset();
1301 }
1302
1303 // For captures we know that stores and clears alternate.
1304 // Other register, are never cleared, and if the occur
1305 // inside a loop, they might be assigned more than once.
1306 if (reg <= 1) {
1307 // Registers zero and one, aka "capture zero", is
1308 // always set correctly if we succeed. There is no
1309 // need to undo a setting on backtrack, because we
1310 // will set it again or fail.
1311 undo_action = IGNORE;
1312 } else {
1313 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1314 }
1315 DCHECK(!absolute);
1316 DCHECK_EQ(value, 0);
1317 break;
1318 }
1319 case ActionNode::CLEAR_CAPTURES: {
1320 // Since we're scanning in reverse order, if we've already
1321 // set the position we have to ignore historically earlier
1322 // clearing operations.
1323 if (store_position == kNoStore) {
1324 clear = true;
1325 }
1326 undo_action = RESTORE;
1327 DCHECK(!absolute);
1328 DCHECK_EQ(value, 0);
1329 break;
1330 }
1331 default:
1332 UNREACHABLE();
1333 break;
1334 }
1335 }
1336 }
1337 // Prepare for the undo-action (e.g., push if it's going to be popped).
1338 if (undo_action == RESTORE) {
1339 pushes++;
1340 RegExpMacroAssembler::StackCheckFlag stack_check =
1341 RegExpMacroAssembler::kNoStackLimitCheck;
1342 if (pushes == push_limit) {
1343 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1344 pushes = 0;
1345 }
1346
1347 assembler->PushRegister(reg, stack_check);
1348 registers_to_pop->Set(reg, zone);
1349 } else if (undo_action == CLEAR) {
1350 registers_to_clear->Set(reg, zone);
1351 }
1352 // Perform the chronologically last action (or accumulated increment)
1353 // for the register.
1354 if (store_position != kNoStore) {
1355 assembler->WriteCurrentPositionToRegister(reg, store_position);
1356 } else if (clear) {
1357 assembler->ClearRegisters(reg, reg);
1358 } else if (absolute) {
1359 assembler->SetRegister(reg, value);
1360 } else if (value != 0) {
1361 assembler->AdvanceRegister(reg, value);
1362 }
1363 }
1364}
1365
1366
1367// This is called as we come into a loop choice node and some other tricky
1368// nodes. It normalizes the state of the code generator to ensure we can
1369// generate generic code.
1370void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1371 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1372
1373 DCHECK(!is_trivial());
1374
1375 if (actions_ == NULL && backtrack() == NULL) {
1376 // Here we just have some deferred cp advances to fix and we are back to
1377 // a normal situation. We may also have to forget some information gained
1378 // through a quick check that was already performed.
1379 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1380 // Create a new trivial state and generate the node with that.
1381 Trace new_state;
1382 successor->Emit(compiler, &new_state);
1383 return;
1384 }
1385
1386 // Generate deferred actions here along with code to undo them again.
1387 OutSet affected_registers;
1388
1389 if (backtrack() != NULL) {
1390 // Here we have a concrete backtrack location. These are set up by choice
1391 // nodes and so they indicate that we have a deferred save of the current
1392 // position which we may need to emit here.
1393 assembler->PushCurrentPosition();
1394 }
1395
1396 int max_register = FindAffectedRegisters(&affected_registers,
1397 compiler->zone());
1398 OutSet registers_to_pop;
1399 OutSet registers_to_clear;
1400 PerformDeferredActions(assembler,
1401 max_register,
1402 affected_registers,
1403 &registers_to_pop,
1404 &registers_to_clear,
1405 compiler->zone());
1406 if (cp_offset_ != 0) {
1407 assembler->AdvanceCurrentPosition(cp_offset_);
1408 }
1409
1410 // Create a new trivial state and generate the node with that.
1411 Label undo;
1412 assembler->PushBacktrack(&undo);
1413 if (successor->KeepRecursing(compiler)) {
1414 Trace new_state;
1415 successor->Emit(compiler, &new_state);
1416 } else {
1417 compiler->AddWork(successor);
1418 assembler->GoTo(successor->label());
1419 }
1420
1421 // On backtrack we need to restore state.
1422 assembler->Bind(&undo);
1423 RestoreAffectedRegisters(assembler,
1424 max_register,
1425 registers_to_pop,
1426 registers_to_clear);
1427 if (backtrack() == NULL) {
1428 assembler->Backtrack();
1429 } else {
1430 assembler->PopCurrentPosition();
1431 assembler->GoTo(backtrack());
1432 }
1433}
1434
1435
1436void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1437 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1438
1439 // Omit flushing the trace. We discard the entire stack frame anyway.
1440
1441 if (!label()->is_bound()) {
1442 // We are completely independent of the trace, since we ignore it,
1443 // so this code can be used as the generic version.
1444 assembler->Bind(label());
1445 }
1446
1447 // Throw away everything on the backtrack stack since the start
1448 // of the negative submatch and restore the character position.
1449 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1450 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1451 if (clear_capture_count_ > 0) {
1452 // Clear any captures that might have been performed during the success
1453 // of the body of the negative look-ahead.
1454 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1455 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1456 }
1457 // Now that we have unwound the stack we find at the top of the stack the
1458 // backtrack that the BeginSubmatch node got.
1459 assembler->Backtrack();
1460}
1461
1462
1463void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1464 if (!trace->is_trivial()) {
1465 trace->Flush(compiler, this);
1466 return;
1467 }
1468 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1469 if (!label()->is_bound()) {
1470 assembler->Bind(label());
1471 }
1472 switch (action_) {
1473 case ACCEPT:
1474 assembler->Succeed();
1475 return;
1476 case BACKTRACK:
1477 assembler->GoTo(trace->backtrack());
1478 return;
1479 case NEGATIVE_SUBMATCH_SUCCESS:
1480 // This case is handled in a different virtual method.
1481 UNREACHABLE();
1482 }
1483 UNIMPLEMENTED();
1484}
1485
1486
1487void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1488 if (guards_ == NULL)
1489 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1490 guards_->Add(guard, zone);
1491}
1492
1493
1494ActionNode* ActionNode::SetRegister(int reg,
1495 int val,
1496 RegExpNode* on_success) {
1497 ActionNode* result =
1498 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1499 result->data_.u_store_register.reg = reg;
1500 result->data_.u_store_register.value = val;
1501 return result;
1502}
1503
1504
1505ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1506 ActionNode* result =
1507 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1508 result->data_.u_increment_register.reg = reg;
1509 return result;
1510}
1511
1512
1513ActionNode* ActionNode::StorePosition(int reg,
1514 bool is_capture,
1515 RegExpNode* on_success) {
1516 ActionNode* result =
1517 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1518 result->data_.u_position_register.reg = reg;
1519 result->data_.u_position_register.is_capture = is_capture;
1520 return result;
1521}
1522
1523
1524ActionNode* ActionNode::ClearCaptures(Interval range,
1525 RegExpNode* on_success) {
1526 ActionNode* result =
1527 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1528 result->data_.u_clear_captures.range_from = range.from();
1529 result->data_.u_clear_captures.range_to = range.to();
1530 return result;
1531}
1532
1533
1534ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1535 int position_reg,
1536 RegExpNode* on_success) {
1537 ActionNode* result =
1538 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1539 result->data_.u_submatch.stack_pointer_register = stack_reg;
1540 result->data_.u_submatch.current_position_register = position_reg;
1541 return result;
1542}
1543
1544
1545ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1546 int position_reg,
1547 int clear_register_count,
1548 int clear_register_from,
1549 RegExpNode* on_success) {
1550 ActionNode* result =
1551 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1552 result->data_.u_submatch.stack_pointer_register = stack_reg;
1553 result->data_.u_submatch.current_position_register = position_reg;
1554 result->data_.u_submatch.clear_register_count = clear_register_count;
1555 result->data_.u_submatch.clear_register_from = clear_register_from;
1556 return result;
1557}
1558
1559
1560ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1561 int repetition_register,
1562 int repetition_limit,
1563 RegExpNode* on_success) {
1564 ActionNode* result =
1565 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1566 result->data_.u_empty_match_check.start_register = start_register;
1567 result->data_.u_empty_match_check.repetition_register = repetition_register;
1568 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1569 return result;
1570}
1571
1572
1573#define DEFINE_ACCEPT(Type) \
1574 void Type##Node::Accept(NodeVisitor* visitor) { \
1575 visitor->Visit##Type(this); \
1576 }
1577FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1578#undef DEFINE_ACCEPT
1579
1580
1581void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1582 visitor->VisitLoopChoice(this);
1583}
1584
1585
1586// -------------------------------------------------------------------
1587// Emit code.
1588
1589
1590void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1591 Guard* guard,
1592 Trace* trace) {
1593 switch (guard->op()) {
1594 case Guard::LT:
1595 DCHECK(!trace->mentions_reg(guard->reg()));
1596 macro_assembler->IfRegisterGE(guard->reg(),
1597 guard->value(),
1598 trace->backtrack());
1599 break;
1600 case Guard::GEQ:
1601 DCHECK(!trace->mentions_reg(guard->reg()));
1602 macro_assembler->IfRegisterLT(guard->reg(),
1603 guard->value(),
1604 trace->backtrack());
1605 break;
1606 }
1607}
1608
1609
1610// Returns the number of characters in the equivalence class, omitting those
1611// that cannot occur in the source string because it is Latin1.
1612static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1613 bool one_byte_subject,
1614 unibrow::uchar* letters) {
1615 int length =
1616 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1617 // Unibrow returns 0 or 1 for characters where case independence is
1618 // trivial.
1619 if (length == 0) {
1620 letters[0] = character;
1621 length = 1;
1622 }
1623
1624 if (one_byte_subject) {
1625 int new_length = 0;
1626 for (int i = 0; i < length; i++) {
1627 if (letters[i] <= String::kMaxOneByteCharCode) {
1628 letters[new_length++] = letters[i];
1629 }
1630 }
1631 length = new_length;
1632 }
1633
1634 return length;
1635}
1636
1637
1638static inline bool EmitSimpleCharacter(Isolate* isolate,
1639 RegExpCompiler* compiler,
1640 uc16 c,
1641 Label* on_failure,
1642 int cp_offset,
1643 bool check,
1644 bool preloaded) {
1645 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1646 bool bound_checked = false;
1647 if (!preloaded) {
1648 assembler->LoadCurrentCharacter(
1649 cp_offset,
1650 on_failure,
1651 check);
1652 bound_checked = true;
1653 }
1654 assembler->CheckNotCharacter(c, on_failure);
1655 return bound_checked;
1656}
1657
1658
1659// Only emits non-letters (things that don't have case). Only used for case
1660// independent matches.
1661static inline bool EmitAtomNonLetter(Isolate* isolate,
1662 RegExpCompiler* compiler,
1663 uc16 c,
1664 Label* on_failure,
1665 int cp_offset,
1666 bool check,
1667 bool preloaded) {
1668 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1669 bool one_byte = compiler->one_byte();
1670 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1671 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1672 if (length < 1) {
1673 // This can't match. Must be an one-byte subject and a non-one-byte
1674 // character. We do not need to do anything since the one-byte pass
1675 // already handled this.
1676 return false; // Bounds not checked.
1677 }
1678 bool checked = false;
1679 // We handle the length > 1 case in a later pass.
1680 if (length == 1) {
1681 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1682 // Can't match - see above.
1683 return false; // Bounds not checked.
1684 }
1685 if (!preloaded) {
1686 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1687 checked = check;
1688 }
1689 macro_assembler->CheckNotCharacter(c, on_failure);
1690 }
1691 return checked;
1692}
1693
1694
1695static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1696 bool one_byte, uc16 c1, uc16 c2,
1697 Label* on_failure) {
1698 uc16 char_mask;
1699 if (one_byte) {
1700 char_mask = String::kMaxOneByteCharCode;
1701 } else {
1702 char_mask = String::kMaxUtf16CodeUnit;
1703 }
1704 uc16 exor = c1 ^ c2;
1705 // Check whether exor has only one bit set.
1706 if (((exor - 1) & exor) == 0) {
1707 // If c1 and c2 differ only by one bit.
1708 // Ecma262UnCanonicalize always gives the highest number last.
1709 DCHECK(c2 > c1);
1710 uc16 mask = char_mask ^ exor;
1711 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1712 return true;
1713 }
1714 DCHECK(c2 > c1);
1715 uc16 diff = c2 - c1;
1716 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1717 // If the characters differ by 2^n but don't differ by one bit then
1718 // subtract the difference from the found character, then do the or
1719 // trick. We avoid the theoretical case where negative numbers are
1720 // involved in order to simplify code generation.
1721 uc16 mask = char_mask ^ diff;
1722 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1723 diff,
1724 mask,
1725 on_failure);
1726 return true;
1727 }
1728 return false;
1729}
1730
1731
1732typedef bool EmitCharacterFunction(Isolate* isolate,
1733 RegExpCompiler* compiler,
1734 uc16 c,
1735 Label* on_failure,
1736 int cp_offset,
1737 bool check,
1738 bool preloaded);
1739
1740// Only emits letters (things that have case). Only used for case independent
1741// matches.
1742static inline bool EmitAtomLetter(Isolate* isolate,
1743 RegExpCompiler* compiler,
1744 uc16 c,
1745 Label* on_failure,
1746 int cp_offset,
1747 bool check,
1748 bool preloaded) {
1749 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1750 bool one_byte = compiler->one_byte();
1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1752 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1753 if (length <= 1) return false;
1754 // We may not need to check against the end of the input string
1755 // if this character lies before a character that matched.
1756 if (!preloaded) {
1757 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1758 }
1759 Label ok;
1760 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1761 switch (length) {
1762 case 2: {
1763 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1764 chars[1], on_failure)) {
1765 } else {
1766 macro_assembler->CheckCharacter(chars[0], &ok);
1767 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1768 macro_assembler->Bind(&ok);
1769 }
1770 break;
1771 }
1772 case 4:
1773 macro_assembler->CheckCharacter(chars[3], &ok);
1774 // Fall through!
1775 case 3:
1776 macro_assembler->CheckCharacter(chars[0], &ok);
1777 macro_assembler->CheckCharacter(chars[1], &ok);
1778 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1779 macro_assembler->Bind(&ok);
1780 break;
1781 default:
1782 UNREACHABLE();
1783 break;
1784 }
1785 return true;
1786}
1787
1788
1789static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1790 int border,
1791 Label* fall_through,
1792 Label* above_or_equal,
1793 Label* below) {
1794 if (below != fall_through) {
1795 masm->CheckCharacterLT(border, below);
1796 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1797 } else {
1798 masm->CheckCharacterGT(border - 1, above_or_equal);
1799 }
1800}
1801
1802
1803static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1804 int first,
1805 int last,
1806 Label* fall_through,
1807 Label* in_range,
1808 Label* out_of_range) {
1809 if (in_range == fall_through) {
1810 if (first == last) {
1811 masm->CheckNotCharacter(first, out_of_range);
1812 } else {
1813 masm->CheckCharacterNotInRange(first, last, out_of_range);
1814 }
1815 } else {
1816 if (first == last) {
1817 masm->CheckCharacter(first, in_range);
1818 } else {
1819 masm->CheckCharacterInRange(first, last, in_range);
1820 }
1821 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1822 }
1823}
1824
1825
1826// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1827// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1828static void EmitUseLookupTable(
1829 RegExpMacroAssembler* masm,
1830 ZoneList<int>* ranges,
1831 int start_index,
1832 int end_index,
1833 int min_char,
1834 Label* fall_through,
1835 Label* even_label,
1836 Label* odd_label) {
1837 static const int kSize = RegExpMacroAssembler::kTableSize;
1838 static const int kMask = RegExpMacroAssembler::kTableMask;
1839
1840 int base = (min_char & ~kMask);
1841 USE(base);
1842
1843 // Assert that everything is on one kTableSize page.
1844 for (int i = start_index; i <= end_index; i++) {
1845 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1846 }
1847 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1848
1849 char templ[kSize];
1850 Label* on_bit_set;
1851 Label* on_bit_clear;
1852 int bit;
1853 if (even_label == fall_through) {
1854 on_bit_set = odd_label;
1855 on_bit_clear = even_label;
1856 bit = 1;
1857 } else {
1858 on_bit_set = even_label;
1859 on_bit_clear = odd_label;
1860 bit = 0;
1861 }
1862 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1863 templ[i] = bit;
1864 }
1865 int j = 0;
1866 bit ^= 1;
1867 for (int i = start_index; i < end_index; i++) {
1868 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1869 templ[j] = bit;
1870 }
1871 bit ^= 1;
1872 }
1873 for (int i = j; i < kSize; i++) {
1874 templ[i] = bit;
1875 }
1876 Factory* factory = masm->isolate()->factory();
1877 // TODO(erikcorry): Cache these.
1878 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1879 for (int i = 0; i < kSize; i++) {
1880 ba->set(i, templ[i]);
1881 }
1882 masm->CheckBitInTable(ba, on_bit_set);
1883 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1884}
1885
1886
1887static void CutOutRange(RegExpMacroAssembler* masm,
1888 ZoneList<int>* ranges,
1889 int start_index,
1890 int end_index,
1891 int cut_index,
1892 Label* even_label,
1893 Label* odd_label) {
1894 bool odd = (((cut_index - start_index) & 1) == 1);
1895 Label* in_range_label = odd ? odd_label : even_label;
1896 Label dummy;
1897 EmitDoubleBoundaryTest(masm,
1898 ranges->at(cut_index),
1899 ranges->at(cut_index + 1) - 1,
1900 &dummy,
1901 in_range_label,
1902 &dummy);
1903 DCHECK(!dummy.is_linked());
1904 // Cut out the single range by rewriting the array. This creates a new
1905 // range that is a merger of the two ranges on either side of the one we
1906 // are cutting out. The oddity of the labels is preserved.
1907 for (int j = cut_index; j > start_index; j--) {
1908 ranges->at(j) = ranges->at(j - 1);
1909 }
1910 for (int j = cut_index + 1; j < end_index; j++) {
1911 ranges->at(j) = ranges->at(j + 1);
1912 }
1913}
1914
1915
1916// Unicode case. Split the search space into kSize spaces that are handled
1917// with recursion.
1918static void SplitSearchSpace(ZoneList<int>* ranges,
1919 int start_index,
1920 int end_index,
1921 int* new_start_index,
1922 int* new_end_index,
1923 int* border) {
1924 static const int kSize = RegExpMacroAssembler::kTableSize;
1925 static const int kMask = RegExpMacroAssembler::kTableMask;
1926
1927 int first = ranges->at(start_index);
1928 int last = ranges->at(end_index) - 1;
1929
1930 *new_start_index = start_index;
1931 *border = (ranges->at(start_index) & ~kMask) + kSize;
1932 while (*new_start_index < end_index) {
1933 if (ranges->at(*new_start_index) > *border) break;
1934 (*new_start_index)++;
1935 }
1936 // new_start_index is the index of the first edge that is beyond the
1937 // current kSize space.
1938
1939 // For very large search spaces we do a binary chop search of the non-Latin1
1940 // space instead of just going to the end of the current kSize space. The
1941 // heuristics are complicated a little by the fact that any 128-character
1942 // encoding space can be quickly tested with a table lookup, so we don't
1943 // wish to do binary chop search at a smaller granularity than that. A
1944 // 128-character space can take up a lot of space in the ranges array if,
1945 // for example, we only want to match every second character (eg. the lower
1946 // case characters on some Unicode pages).
1947 int binary_chop_index = (end_index + start_index) / 2;
1948 // The first test ensures that we get to the code that handles the Latin1
1949 // range with a single not-taken branch, speeding up this important
1950 // character range (even non-Latin1 charset-based text has spaces and
1951 // punctuation).
1952 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1953 end_index - start_index > (*new_start_index - start_index) * 2 &&
1954 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1955 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1956 int scan_forward_for_section_border = binary_chop_index;;
1957 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1958
1959 while (scan_forward_for_section_border < end_index) {
1960 if (ranges->at(scan_forward_for_section_border) > new_border) {
1961 *new_start_index = scan_forward_for_section_border;
1962 *border = new_border;
1963 break;
1964 }
1965 scan_forward_for_section_border++;
1966 }
1967 }
1968
1969 DCHECK(*new_start_index > start_index);
1970 *new_end_index = *new_start_index - 1;
1971 if (ranges->at(*new_end_index) == *border) {
1972 (*new_end_index)--;
1973 }
1974 if (*border >= ranges->at(end_index)) {
1975 *border = ranges->at(end_index);
1976 *new_start_index = end_index; // Won't be used.
1977 *new_end_index = end_index - 1;
1978 }
1979}
1980
1981
1982// Gets a series of segment boundaries representing a character class. If the
1983// character is in the range between an even and an odd boundary (counting from
1984// start_index) then go to even_label, otherwise go to odd_label. We already
1985// know that the character is in the range of min_char to max_char inclusive.
1986// Either label can be NULL indicating backtracking. Either label can also be
1987// equal to the fall_through label.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001988static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1989 int start_index, int end_index, uc32 min_char,
1990 uc32 max_char, Label* fall_through,
1991 Label* even_label, Label* odd_label) {
1992 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1993 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1994
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001995 int first = ranges->at(start_index);
1996 int last = ranges->at(end_index) - 1;
1997
1998 DCHECK_LT(min_char, first);
1999
2000 // Just need to test if the character is before or on-or-after
2001 // a particular character.
2002 if (start_index == end_index) {
2003 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2004 return;
2005 }
2006
2007 // Another almost trivial case: There is one interval in the middle that is
2008 // different from the end intervals.
2009 if (start_index + 1 == end_index) {
2010 EmitDoubleBoundaryTest(
2011 masm, first, last, fall_through, even_label, odd_label);
2012 return;
2013 }
2014
2015 // It's not worth using table lookup if there are very few intervals in the
2016 // character class.
2017 if (end_index - start_index <= 6) {
2018 // It is faster to test for individual characters, so we look for those
2019 // first, then try arbitrary ranges in the second round.
2020 static int kNoCutIndex = -1;
2021 int cut = kNoCutIndex;
2022 for (int i = start_index; i < end_index; i++) {
2023 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2024 cut = i;
2025 break;
2026 }
2027 }
2028 if (cut == kNoCutIndex) cut = start_index;
2029 CutOutRange(
2030 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2031 DCHECK_GE(end_index - start_index, 2);
2032 GenerateBranches(masm,
2033 ranges,
2034 start_index + 1,
2035 end_index - 1,
2036 min_char,
2037 max_char,
2038 fall_through,
2039 even_label,
2040 odd_label);
2041 return;
2042 }
2043
2044 // If there are a lot of intervals in the regexp, then we will use tables to
2045 // determine whether the character is inside or outside the character class.
2046 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2047
2048 if ((max_char >> kBits) == (min_char >> kBits)) {
2049 EmitUseLookupTable(masm,
2050 ranges,
2051 start_index,
2052 end_index,
2053 min_char,
2054 fall_through,
2055 even_label,
2056 odd_label);
2057 return;
2058 }
2059
2060 if ((min_char >> kBits) != (first >> kBits)) {
2061 masm->CheckCharacterLT(first, odd_label);
2062 GenerateBranches(masm,
2063 ranges,
2064 start_index + 1,
2065 end_index,
2066 first,
2067 max_char,
2068 fall_through,
2069 odd_label,
2070 even_label);
2071 return;
2072 }
2073
2074 int new_start_index = 0;
2075 int new_end_index = 0;
2076 int border = 0;
2077
2078 SplitSearchSpace(ranges,
2079 start_index,
2080 end_index,
2081 &new_start_index,
2082 &new_end_index,
2083 &border);
2084
2085 Label handle_rest;
2086 Label* above = &handle_rest;
2087 if (border == last + 1) {
2088 // We didn't find any section that started after the limit, so everything
2089 // above the border is one of the terminal labels.
2090 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2091 DCHECK(new_end_index == end_index - 1);
2092 }
2093
2094 DCHECK_LE(start_index, new_end_index);
2095 DCHECK_LE(new_start_index, end_index);
2096 DCHECK_LT(start_index, new_start_index);
2097 DCHECK_LT(new_end_index, end_index);
2098 DCHECK(new_end_index + 1 == new_start_index ||
2099 (new_end_index + 2 == new_start_index &&
2100 border == ranges->at(new_end_index + 1)));
2101 DCHECK_LT(min_char, border - 1);
2102 DCHECK_LT(border, max_char);
2103 DCHECK_LT(ranges->at(new_end_index), border);
2104 DCHECK(border < ranges->at(new_start_index) ||
2105 (border == ranges->at(new_start_index) &&
2106 new_start_index == end_index &&
2107 new_end_index == end_index - 1 &&
2108 border == last + 1));
2109 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2110
2111 masm->CheckCharacterGT(border - 1, above);
2112 Label dummy;
2113 GenerateBranches(masm,
2114 ranges,
2115 start_index,
2116 new_end_index,
2117 min_char,
2118 border - 1,
2119 &dummy,
2120 even_label,
2121 odd_label);
2122 if (handle_rest.is_linked()) {
2123 masm->Bind(&handle_rest);
2124 bool flip = (new_start_index & 1) != (start_index & 1);
2125 GenerateBranches(masm,
2126 ranges,
2127 new_start_index,
2128 end_index,
2129 border,
2130 max_char,
2131 &dummy,
2132 flip ? odd_label : even_label,
2133 flip ? even_label : odd_label);
2134 }
2135}
2136
2137
2138static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2139 RegExpCharacterClass* cc, bool one_byte,
2140 Label* on_failure, int cp_offset, bool check_offset,
2141 bool preloaded, Zone* zone) {
2142 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch097c5b22016-05-18 11:27:45 +01002143 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002144
2145 int max_char;
2146 if (one_byte) {
2147 max_char = String::kMaxOneByteCharCode;
2148 } else {
2149 max_char = String::kMaxUtf16CodeUnit;
2150 }
2151
2152 int range_count = ranges->length();
2153
2154 int last_valid_range = range_count - 1;
2155 while (last_valid_range >= 0) {
2156 CharacterRange& range = ranges->at(last_valid_range);
2157 if (range.from() <= max_char) {
2158 break;
2159 }
2160 last_valid_range--;
2161 }
2162
2163 if (last_valid_range < 0) {
2164 if (!cc->is_negated()) {
2165 macro_assembler->GoTo(on_failure);
2166 }
2167 if (check_offset) {
2168 macro_assembler->CheckPosition(cp_offset, on_failure);
2169 }
2170 return;
2171 }
2172
2173 if (last_valid_range == 0 &&
2174 ranges->at(0).IsEverything(max_char)) {
2175 if (cc->is_negated()) {
2176 macro_assembler->GoTo(on_failure);
2177 } else {
2178 // This is a common case hit by non-anchored expressions.
2179 if (check_offset) {
2180 macro_assembler->CheckPosition(cp_offset, on_failure);
2181 }
2182 }
2183 return;
2184 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002185
2186 if (!preloaded) {
2187 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2188 }
2189
2190 if (cc->is_standard(zone) &&
Ben Murdoch097c5b22016-05-18 11:27:45 +01002191 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2192 on_failure)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002193 return;
2194 }
2195
2196
2197 // A new list with ascending entries. Each entry is a code unit
2198 // where there is a boundary between code units that are part of
2199 // the class and code units that are not. Normally we insert an
2200 // entry at zero which goes to the failure label, but if there
2201 // was already one there we fall through for success on that entry.
2202 // Subsequent entries have alternating meaning (success/failure).
2203 ZoneList<int>* range_boundaries =
2204 new(zone) ZoneList<int>(last_valid_range, zone);
2205
2206 bool zeroth_entry_is_failure = !cc->is_negated();
2207
2208 for (int i = 0; i <= last_valid_range; i++) {
2209 CharacterRange& range = ranges->at(i);
2210 if (range.from() == 0) {
2211 DCHECK_EQ(i, 0);
2212 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2213 } else {
2214 range_boundaries->Add(range.from(), zone);
2215 }
2216 range_boundaries->Add(range.to() + 1, zone);
2217 }
2218 int end_index = range_boundaries->length() - 1;
2219 if (range_boundaries->at(end_index) > max_char) {
2220 end_index--;
2221 }
2222
2223 Label fall_through;
2224 GenerateBranches(macro_assembler,
2225 range_boundaries,
2226 0, // start_index.
2227 end_index,
2228 0, // min_char.
2229 max_char,
2230 &fall_through,
2231 zeroth_entry_is_failure ? &fall_through : on_failure,
2232 zeroth_entry_is_failure ? on_failure : &fall_through);
2233 macro_assembler->Bind(&fall_through);
2234}
2235
2236
2237RegExpNode::~RegExpNode() {
2238}
2239
2240
2241RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2242 Trace* trace) {
2243 // If we are generating a greedy loop then don't stop and don't reuse code.
2244 if (trace->stop_node() != NULL) {
2245 return CONTINUE;
2246 }
2247
2248 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2249 if (trace->is_trivial()) {
2250 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2251 // If a generic version is already scheduled to be generated or we have
2252 // recursed too deeply then just generate a jump to that code.
2253 macro_assembler->GoTo(&label_);
2254 // This will queue it up for generation of a generic version if it hasn't
2255 // already been queued.
2256 compiler->AddWork(this);
2257 return DONE;
2258 }
2259 // Generate generic version of the node and bind the label for later use.
2260 macro_assembler->Bind(&label_);
2261 return CONTINUE;
2262 }
2263
2264 // We are being asked to make a non-generic version. Keep track of how many
2265 // non-generic versions we generate so as not to overdo it.
2266 trace_count_++;
2267 if (KeepRecursing(compiler) && compiler->optimize() &&
2268 trace_count_ < kMaxCopiesCodeGenerated) {
2269 return CONTINUE;
2270 }
2271
2272 // If we get here code has been generated for this node too many times or
2273 // recursion is too deep. Time to switch to a generic version. The code for
2274 // generic versions above can handle deep recursion properly.
2275 bool was_limiting = compiler->limiting_recursion();
2276 compiler->set_limiting_recursion(true);
2277 trace->Flush(compiler, this);
2278 compiler->set_limiting_recursion(was_limiting);
2279 return DONE;
2280}
2281
2282
2283bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2284 return !compiler->limiting_recursion() &&
2285 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2286}
2287
2288
2289int ActionNode::EatsAtLeast(int still_to_find,
2290 int budget,
2291 bool not_at_start) {
2292 if (budget <= 0) return 0;
2293 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2294 return on_success()->EatsAtLeast(still_to_find,
2295 budget - 1,
2296 not_at_start);
2297}
2298
2299
2300void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2301 BoyerMooreLookahead* bm, bool not_at_start) {
2302 if (action_type_ == BEGIN_SUBMATCH) {
2303 bm->SetRest(offset);
2304 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2305 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2306 }
2307 SaveBMInfo(bm, not_at_start, offset);
2308}
2309
2310
2311int AssertionNode::EatsAtLeast(int still_to_find,
2312 int budget,
2313 bool not_at_start) {
2314 if (budget <= 0) return 0;
2315 // If we know we are not at the start and we are asked "how many characters
2316 // will you match if you succeed?" then we can answer anything since false
2317 // implies false. So lets just return the max answer (still_to_find) since
2318 // that won't prevent us from preloading a lot of characters for the other
2319 // branches in the node graph.
2320 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2321 return on_success()->EatsAtLeast(still_to_find,
2322 budget - 1,
2323 not_at_start);
2324}
2325
2326
2327void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2328 BoyerMooreLookahead* bm, bool not_at_start) {
2329 // Match the behaviour of EatsAtLeast on this node.
2330 if (assertion_type() == AT_START && not_at_start) return;
2331 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2332 SaveBMInfo(bm, not_at_start, offset);
2333}
2334
2335
2336int BackReferenceNode::EatsAtLeast(int still_to_find,
2337 int budget,
2338 bool not_at_start) {
2339 if (read_backward()) return 0;
2340 if (budget <= 0) return 0;
2341 return on_success()->EatsAtLeast(still_to_find,
2342 budget - 1,
2343 not_at_start);
2344}
2345
2346
2347int TextNode::EatsAtLeast(int still_to_find,
2348 int budget,
2349 bool not_at_start) {
2350 if (read_backward()) return 0;
2351 int answer = Length();
2352 if (answer >= still_to_find) return answer;
2353 if (budget <= 0) return answer;
2354 // We are not at start after this node so we set the last argument to 'true'.
2355 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2356 budget - 1,
2357 true);
2358}
2359
2360
2361int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2362 bool not_at_start) {
2363 if (budget <= 0) return 0;
2364 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2365 // afterwards.
2366 RegExpNode* node = alternatives_->at(1).node();
2367 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2368}
2369
2370
2371void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2372 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2373 bool not_at_start) {
2374 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2375 // afterwards.
2376 RegExpNode* node = alternatives_->at(1).node();
2377 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2378}
2379
2380
2381int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2382 int budget,
2383 RegExpNode* ignore_this_node,
2384 bool not_at_start) {
2385 if (budget <= 0) return 0;
2386 int min = 100;
2387 int choice_count = alternatives_->length();
2388 budget = (budget - 1) / choice_count;
2389 for (int i = 0; i < choice_count; i++) {
2390 RegExpNode* node = alternatives_->at(i).node();
2391 if (node == ignore_this_node) continue;
2392 int node_eats_at_least =
2393 node->EatsAtLeast(still_to_find, budget, not_at_start);
2394 if (node_eats_at_least < min) min = node_eats_at_least;
2395 if (min == 0) return 0;
2396 }
2397 return min;
2398}
2399
2400
2401int LoopChoiceNode::EatsAtLeast(int still_to_find,
2402 int budget,
2403 bool not_at_start) {
2404 return EatsAtLeastHelper(still_to_find,
2405 budget - 1,
2406 loop_node_,
2407 not_at_start);
2408}
2409
2410
2411int ChoiceNode::EatsAtLeast(int still_to_find,
2412 int budget,
2413 bool not_at_start) {
2414 return EatsAtLeastHelper(still_to_find,
2415 budget,
2416 NULL,
2417 not_at_start);
2418}
2419
2420
2421// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2422static inline uint32_t SmearBitsRight(uint32_t v) {
2423 v |= v >> 1;
2424 v |= v >> 2;
2425 v |= v >> 4;
2426 v |= v >> 8;
2427 v |= v >> 16;
2428 return v;
2429}
2430
2431
2432bool QuickCheckDetails::Rationalize(bool asc) {
2433 bool found_useful_op = false;
2434 uint32_t char_mask;
2435 if (asc) {
2436 char_mask = String::kMaxOneByteCharCode;
2437 } else {
2438 char_mask = String::kMaxUtf16CodeUnit;
2439 }
2440 mask_ = 0;
2441 value_ = 0;
2442 int char_shift = 0;
2443 for (int i = 0; i < characters_; i++) {
2444 Position* pos = &positions_[i];
2445 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2446 found_useful_op = true;
2447 }
2448 mask_ |= (pos->mask & char_mask) << char_shift;
2449 value_ |= (pos->value & char_mask) << char_shift;
2450 char_shift += asc ? 8 : 16;
2451 }
2452 return found_useful_op;
2453}
2454
2455
2456bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2457 Trace* bounds_check_trace,
2458 Trace* trace,
2459 bool preload_has_checked_bounds,
2460 Label* on_possible_success,
2461 QuickCheckDetails* details,
2462 bool fall_through_on_failure) {
2463 if (details->characters() == 0) return false;
2464 GetQuickCheckDetails(
2465 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2466 if (details->cannot_match()) return false;
2467 if (!details->Rationalize(compiler->one_byte())) return false;
2468 DCHECK(details->characters() == 1 ||
2469 compiler->macro_assembler()->CanReadUnaligned());
2470 uint32_t mask = details->mask();
2471 uint32_t value = details->value();
2472
2473 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2474
2475 if (trace->characters_preloaded() != details->characters()) {
2476 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2477 // We are attempting to preload the minimum number of characters
2478 // any choice would eat, so if the bounds check fails, then none of the
2479 // choices can succeed, so we can just immediately backtrack, rather
2480 // than go to the next choice.
2481 assembler->LoadCurrentCharacter(trace->cp_offset(),
2482 bounds_check_trace->backtrack(),
2483 !preload_has_checked_bounds,
2484 details->characters());
2485 }
2486
2487
2488 bool need_mask = true;
2489
2490 if (details->characters() == 1) {
2491 // If number of characters preloaded is 1 then we used a byte or 16 bit
2492 // load so the value is already masked down.
2493 uint32_t char_mask;
2494 if (compiler->one_byte()) {
2495 char_mask = String::kMaxOneByteCharCode;
2496 } else {
2497 char_mask = String::kMaxUtf16CodeUnit;
2498 }
2499 if ((mask & char_mask) == char_mask) need_mask = false;
2500 mask &= char_mask;
2501 } else {
2502 // For 2-character preloads in one-byte mode or 1-character preloads in
2503 // two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch097c5b22016-05-18 11:27:45 +01002504 static const uint32_t kTwoByteMask = 0xffff;
2505 static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002506 if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002507 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002508 } else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002509 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002510 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +01002511 if (mask == kFourByteMask) need_mask = false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002512 }
2513 }
2514
2515 if (fall_through_on_failure) {
2516 if (need_mask) {
2517 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2518 } else {
2519 assembler->CheckCharacter(value, on_possible_success);
2520 }
2521 } else {
2522 if (need_mask) {
2523 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2524 } else {
2525 assembler->CheckNotCharacter(value, trace->backtrack());
2526 }
2527 }
2528 return true;
2529}
2530
2531
2532// Here is the meat of GetQuickCheckDetails (see also the comment on the
2533// super-class in the .h file).
2534//
2535// We iterate along the text object, building up for each character a
2536// mask and value that can be used to test for a quick failure to match.
2537// The masks and values for the positions will be combined into a single
2538// machine word for the current character width in order to be used in
2539// generating a quick check.
2540void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2541 RegExpCompiler* compiler,
2542 int characters_filled_in,
2543 bool not_at_start) {
2544 // Do not collect any quick check details if the text node reads backward,
2545 // since it reads in the opposite direction than we use for quick checks.
2546 if (read_backward()) return;
2547 Isolate* isolate = compiler->macro_assembler()->isolate();
2548 DCHECK(characters_filled_in < details->characters());
2549 int characters = details->characters();
2550 int char_mask;
2551 if (compiler->one_byte()) {
2552 char_mask = String::kMaxOneByteCharCode;
2553 } else {
2554 char_mask = String::kMaxUtf16CodeUnit;
2555 }
2556 for (int k = 0; k < elements()->length(); k++) {
2557 TextElement elm = elements()->at(k);
2558 if (elm.text_type() == TextElement::ATOM) {
2559 Vector<const uc16> quarks = elm.atom()->data();
2560 for (int i = 0; i < characters && i < quarks.length(); i++) {
2561 QuickCheckDetails::Position* pos =
2562 details->positions(characters_filled_in);
2563 uc16 c = quarks[i];
2564 if (compiler->ignore_case()) {
2565 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2566 int length = GetCaseIndependentLetters(isolate, c,
2567 compiler->one_byte(), chars);
2568 if (length == 0) {
2569 // This can happen because all case variants are non-Latin1, but we
2570 // know the input is Latin1.
2571 details->set_cannot_match();
2572 pos->determines_perfectly = false;
2573 return;
2574 }
2575 if (length == 1) {
2576 // This letter has no case equivalents, so it's nice and simple
2577 // and the mask-compare will determine definitely whether we have
2578 // a match at this character position.
2579 pos->mask = char_mask;
2580 pos->value = c;
2581 pos->determines_perfectly = true;
2582 } else {
2583 uint32_t common_bits = char_mask;
2584 uint32_t bits = chars[0];
2585 for (int j = 1; j < length; j++) {
2586 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2587 common_bits ^= differing_bits;
2588 bits &= common_bits;
2589 }
2590 // If length is 2 and common bits has only one zero in it then
2591 // our mask and compare instruction will determine definitely
2592 // whether we have a match at this character position. Otherwise
2593 // it can only be an approximate check.
2594 uint32_t one_zero = (common_bits | ~char_mask);
2595 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2596 pos->determines_perfectly = true;
2597 }
2598 pos->mask = common_bits;
2599 pos->value = bits;
2600 }
2601 } else {
2602 // Don't ignore case. Nice simple case where the mask-compare will
2603 // determine definitely whether we have a match at this character
2604 // position.
2605 if (c > char_mask) {
2606 details->set_cannot_match();
2607 pos->determines_perfectly = false;
2608 return;
2609 }
2610 pos->mask = char_mask;
2611 pos->value = c;
2612 pos->determines_perfectly = true;
2613 }
2614 characters_filled_in++;
2615 DCHECK(characters_filled_in <= details->characters());
2616 if (characters_filled_in == details->characters()) {
2617 return;
2618 }
2619 }
2620 } else {
2621 QuickCheckDetails::Position* pos =
2622 details->positions(characters_filled_in);
2623 RegExpCharacterClass* tree = elm.char_class();
2624 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2625 if (tree->is_negated()) {
2626 // A quick check uses multi-character mask and compare. There is no
2627 // useful way to incorporate a negative char class into this scheme
2628 // so we just conservatively create a mask and value that will always
2629 // succeed.
2630 pos->mask = 0;
2631 pos->value = 0;
2632 } else {
2633 int first_range = 0;
2634 while (ranges->at(first_range).from() > char_mask) {
2635 first_range++;
2636 if (first_range == ranges->length()) {
2637 details->set_cannot_match();
2638 pos->determines_perfectly = false;
2639 return;
2640 }
2641 }
2642 CharacterRange range = ranges->at(first_range);
2643 uc16 from = range.from();
2644 uc16 to = range.to();
2645 if (to > char_mask) {
2646 to = char_mask;
2647 }
2648 uint32_t differing_bits = (from ^ to);
2649 // A mask and compare is only perfect if the differing bits form a
2650 // number like 00011111 with one single block of trailing 1s.
2651 if ((differing_bits & (differing_bits + 1)) == 0 &&
2652 from + differing_bits == to) {
2653 pos->determines_perfectly = true;
2654 }
2655 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2656 uint32_t bits = (from & common_bits);
2657 for (int i = first_range + 1; i < ranges->length(); i++) {
2658 CharacterRange range = ranges->at(i);
2659 uc16 from = range.from();
2660 uc16 to = range.to();
2661 if (from > char_mask) continue;
2662 if (to > char_mask) to = char_mask;
2663 // Here we are combining more ranges into the mask and compare
2664 // value. With each new range the mask becomes more sparse and
2665 // so the chances of a false positive rise. A character class
2666 // with multiple ranges is assumed never to be equivalent to a
2667 // mask and compare operation.
2668 pos->determines_perfectly = false;
2669 uint32_t new_common_bits = (from ^ to);
2670 new_common_bits = ~SmearBitsRight(new_common_bits);
2671 common_bits &= new_common_bits;
2672 bits &= new_common_bits;
2673 uint32_t differing_bits = (from & common_bits) ^ bits;
2674 common_bits ^= differing_bits;
2675 bits &= common_bits;
2676 }
2677 pos->mask = common_bits;
2678 pos->value = bits;
2679 }
2680 characters_filled_in++;
2681 DCHECK(characters_filled_in <= details->characters());
2682 if (characters_filled_in == details->characters()) {
2683 return;
2684 }
2685 }
2686 }
2687 DCHECK(characters_filled_in != details->characters());
2688 if (!details->cannot_match()) {
2689 on_success()-> GetQuickCheckDetails(details,
2690 compiler,
2691 characters_filled_in,
2692 true);
2693 }
2694}
2695
2696
2697void QuickCheckDetails::Clear() {
2698 for (int i = 0; i < characters_; i++) {
2699 positions_[i].mask = 0;
2700 positions_[i].value = 0;
2701 positions_[i].determines_perfectly = false;
2702 }
2703 characters_ = 0;
2704}
2705
2706
2707void QuickCheckDetails::Advance(int by, bool one_byte) {
2708 if (by >= characters_ || by < 0) {
2709 DCHECK_IMPLIES(by < 0, characters_ == 0);
2710 Clear();
2711 return;
2712 }
2713 DCHECK_LE(characters_ - by, 4);
2714 DCHECK_LE(characters_, 4);
2715 for (int i = 0; i < characters_ - by; i++) {
2716 positions_[i] = positions_[by + i];
2717 }
2718 for (int i = characters_ - by; i < characters_; i++) {
2719 positions_[i].mask = 0;
2720 positions_[i].value = 0;
2721 positions_[i].determines_perfectly = false;
2722 }
2723 characters_ -= by;
2724 // We could change mask_ and value_ here but we would never advance unless
2725 // they had already been used in a check and they won't be used again because
2726 // it would gain us nothing. So there's no point.
2727}
2728
2729
2730void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2731 DCHECK(characters_ == other->characters_);
2732 if (other->cannot_match_) {
2733 return;
2734 }
2735 if (cannot_match_) {
2736 *this = *other;
2737 return;
2738 }
2739 for (int i = from_index; i < characters_; i++) {
2740 QuickCheckDetails::Position* pos = positions(i);
2741 QuickCheckDetails::Position* other_pos = other->positions(i);
2742 if (pos->mask != other_pos->mask ||
2743 pos->value != other_pos->value ||
2744 !other_pos->determines_perfectly) {
2745 // Our mask-compare operation will be approximate unless we have the
2746 // exact same operation on both sides of the alternation.
2747 pos->determines_perfectly = false;
2748 }
2749 pos->mask &= other_pos->mask;
2750 pos->value &= pos->mask;
2751 other_pos->value &= pos->mask;
2752 uc16 differing_bits = (pos->value ^ other_pos->value);
2753 pos->mask &= ~differing_bits;
2754 pos->value &= pos->mask;
2755 }
2756}
2757
2758
2759class VisitMarker {
2760 public:
2761 explicit VisitMarker(NodeInfo* info) : info_(info) {
2762 DCHECK(!info->visited);
2763 info->visited = true;
2764 }
2765 ~VisitMarker() {
2766 info_->visited = false;
2767 }
2768 private:
2769 NodeInfo* info_;
2770};
2771
2772
2773RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2774 if (info()->replacement_calculated) return replacement();
2775 if (depth < 0) return this;
2776 DCHECK(!info()->visited);
2777 VisitMarker marker(info());
2778 return FilterSuccessor(depth - 1, ignore_case);
2779}
2780
2781
2782RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2783 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2784 if (next == NULL) return set_replacement(NULL);
2785 on_success_ = next;
2786 return set_replacement(this);
2787}
2788
2789
2790// We need to check for the following characters: 0x39c 0x3bc 0x178.
2791static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2792 // TODO(dcarney): this could be a lot more efficient.
2793 return range.Contains(0x39c) ||
2794 range.Contains(0x3bc) || range.Contains(0x178);
2795}
2796
2797
2798static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2799 for (int i = 0; i < ranges->length(); i++) {
2800 // TODO(dcarney): this could be a lot more efficient.
2801 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2802 }
2803 return false;
2804}
2805
2806
2807RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2808 if (info()->replacement_calculated) return replacement();
2809 if (depth < 0) return this;
2810 DCHECK(!info()->visited);
2811 VisitMarker marker(info());
2812 int element_count = elements()->length();
2813 for (int i = 0; i < element_count; i++) {
2814 TextElement elm = elements()->at(i);
2815 if (elm.text_type() == TextElement::ATOM) {
2816 Vector<const uc16> quarks = elm.atom()->data();
2817 for (int j = 0; j < quarks.length(); j++) {
2818 uint16_t c = quarks[j];
2819 if (c <= String::kMaxOneByteCharCode) continue;
2820 if (!ignore_case) return set_replacement(NULL);
2821 // Here, we need to check for characters whose upper and lower cases
2822 // are outside the Latin-1 range.
2823 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2824 // Character is outside Latin-1 completely
2825 if (converted == 0) return set_replacement(NULL);
2826 // Convert quark to Latin-1 in place.
2827 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2828 copy[j] = converted;
2829 }
2830 } else {
2831 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2832 RegExpCharacterClass* cc = elm.char_class();
2833 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01002834 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00002835 // Now they are in order so we only need to look at the first.
2836 int range_count = ranges->length();
2837 if (cc->is_negated()) {
2838 if (range_count != 0 &&
2839 ranges->at(0).from() == 0 &&
2840 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2841 // This will be handled in a later filter.
2842 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2843 return set_replacement(NULL);
2844 }
2845 } else {
2846 if (range_count == 0 ||
2847 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2848 // This will be handled in a later filter.
2849 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2850 return set_replacement(NULL);
2851 }
2852 }
2853 }
2854 }
2855 return FilterSuccessor(depth - 1, ignore_case);
2856}
2857
2858
2859RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2860 if (info()->replacement_calculated) return replacement();
2861 if (depth < 0) return this;
2862 if (info()->visited) return this;
2863 {
2864 VisitMarker marker(info());
2865
2866 RegExpNode* continue_replacement =
2867 continue_node_->FilterOneByte(depth - 1, ignore_case);
2868 // If we can't continue after the loop then there is no sense in doing the
2869 // loop.
2870 if (continue_replacement == NULL) return set_replacement(NULL);
2871 }
2872
2873 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2874}
2875
2876
2877RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2878 if (info()->replacement_calculated) return replacement();
2879 if (depth < 0) return this;
2880 if (info()->visited) return this;
2881 VisitMarker marker(info());
2882 int choice_count = alternatives_->length();
2883
2884 for (int i = 0; i < choice_count; i++) {
2885 GuardedAlternative alternative = alternatives_->at(i);
2886 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2887 set_replacement(this);
2888 return this;
2889 }
2890 }
2891
2892 int surviving = 0;
2893 RegExpNode* survivor = NULL;
2894 for (int i = 0; i < choice_count; i++) {
2895 GuardedAlternative alternative = alternatives_->at(i);
2896 RegExpNode* replacement =
2897 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2898 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2899 if (replacement != NULL) {
2900 alternatives_->at(i).set_node(replacement);
2901 surviving++;
2902 survivor = replacement;
2903 }
2904 }
2905 if (surviving < 2) return set_replacement(survivor);
2906
2907 set_replacement(this);
2908 if (surviving == choice_count) {
2909 return this;
2910 }
2911 // Only some of the nodes survived the filtering. We need to rebuild the
2912 // alternatives list.
2913 ZoneList<GuardedAlternative>* new_alternatives =
2914 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2915 for (int i = 0; i < choice_count; i++) {
2916 RegExpNode* replacement =
2917 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2918 if (replacement != NULL) {
2919 alternatives_->at(i).set_node(replacement);
2920 new_alternatives->Add(alternatives_->at(i), zone());
2921 }
2922 }
2923 alternatives_ = new_alternatives;
2924 return this;
2925}
2926
2927
2928RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2929 bool ignore_case) {
2930 if (info()->replacement_calculated) return replacement();
2931 if (depth < 0) return this;
2932 if (info()->visited) return this;
2933 VisitMarker marker(info());
2934 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2935 // afterwards.
2936 RegExpNode* node = alternatives_->at(1).node();
2937 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2938 if (replacement == NULL) return set_replacement(NULL);
2939 alternatives_->at(1).set_node(replacement);
2940
2941 RegExpNode* neg_node = alternatives_->at(0).node();
2942 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2943 // If the negative lookahead is always going to fail then
2944 // we don't need to check it.
2945 if (neg_replacement == NULL) return set_replacement(replacement);
2946 alternatives_->at(0).set_node(neg_replacement);
2947 return set_replacement(this);
2948}
2949
2950
2951void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2952 RegExpCompiler* compiler,
2953 int characters_filled_in,
2954 bool not_at_start) {
2955 if (body_can_be_zero_length_ || info()->visited) return;
2956 VisitMarker marker(info());
2957 return ChoiceNode::GetQuickCheckDetails(details,
2958 compiler,
2959 characters_filled_in,
2960 not_at_start);
2961}
2962
2963
2964void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2965 BoyerMooreLookahead* bm, bool not_at_start) {
2966 if (body_can_be_zero_length_ || budget <= 0) {
2967 bm->SetRest(offset);
2968 SaveBMInfo(bm, not_at_start, offset);
2969 return;
2970 }
2971 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2972 SaveBMInfo(bm, not_at_start, offset);
2973}
2974
2975
2976void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2977 RegExpCompiler* compiler,
2978 int characters_filled_in,
2979 bool not_at_start) {
2980 not_at_start = (not_at_start || not_at_start_);
2981 int choice_count = alternatives_->length();
2982 DCHECK(choice_count > 0);
2983 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2984 compiler,
2985 characters_filled_in,
2986 not_at_start);
2987 for (int i = 1; i < choice_count; i++) {
2988 QuickCheckDetails new_details(details->characters());
2989 RegExpNode* node = alternatives_->at(i).node();
2990 node->GetQuickCheckDetails(&new_details, compiler,
2991 characters_filled_in,
2992 not_at_start);
2993 // Here we merge the quick match details of the two branches.
2994 details->Merge(&new_details, characters_filled_in);
2995 }
2996}
2997
2998
2999// Check for [0-9A-Z_a-z].
3000static void EmitWordCheck(RegExpMacroAssembler* assembler,
3001 Label* word,
3002 Label* non_word,
3003 bool fall_through_on_word) {
3004 if (assembler->CheckSpecialCharacterClass(
3005 fall_through_on_word ? 'w' : 'W',
3006 fall_through_on_word ? non_word : word)) {
3007 // Optimized implementation available.
3008 return;
3009 }
3010 assembler->CheckCharacterGT('z', non_word);
3011 assembler->CheckCharacterLT('0', non_word);
3012 assembler->CheckCharacterGT('a' - 1, word);
3013 assembler->CheckCharacterLT('9' + 1, word);
3014 assembler->CheckCharacterLT('A', non_word);
3015 assembler->CheckCharacterLT('Z' + 1, word);
3016 if (fall_through_on_word) {
3017 assembler->CheckNotCharacter('_', non_word);
3018 } else {
3019 assembler->CheckCharacter('_', word);
3020 }
3021}
3022
3023
3024// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3025// that matches newline or the start of input).
3026static void EmitHat(RegExpCompiler* compiler,
3027 RegExpNode* on_success,
3028 Trace* trace) {
3029 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3030 // We will be loading the previous character into the current character
3031 // register.
3032 Trace new_trace(*trace);
3033 new_trace.InvalidateCurrentCharacter();
3034
3035 Label ok;
3036 if (new_trace.cp_offset() == 0) {
3037 // The start of input counts as a newline in this context, so skip to
3038 // ok if we are at the start.
3039 assembler->CheckAtStart(&ok);
3040 }
3041 // We already checked that we are not at the start of input so it must be
3042 // OK to load the previous character.
3043 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3044 new_trace.backtrack(),
3045 false);
3046 if (!assembler->CheckSpecialCharacterClass('n',
3047 new_trace.backtrack())) {
3048 // Newline means \n, \r, 0x2028 or 0x2029.
3049 if (!compiler->one_byte()) {
3050 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3051 }
3052 assembler->CheckCharacter('\n', &ok);
3053 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3054 }
3055 assembler->Bind(&ok);
3056 on_success->Emit(compiler, &new_trace);
3057}
3058
3059
3060// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3061void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3062 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3063 Isolate* isolate = assembler->isolate();
3064 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3065 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3066 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3067 if (lookahead == NULL) {
3068 int eats_at_least =
3069 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3070 kRecursionBudget,
3071 not_at_start));
3072 if (eats_at_least >= 1) {
3073 BoyerMooreLookahead* bm =
3074 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3075 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3076 if (bm->at(0)->is_non_word())
3077 next_is_word_character = Trace::FALSE_VALUE;
3078 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3079 }
3080 } else {
3081 if (lookahead->at(0)->is_non_word())
3082 next_is_word_character = Trace::FALSE_VALUE;
3083 if (lookahead->at(0)->is_word())
3084 next_is_word_character = Trace::TRUE_VALUE;
3085 }
3086 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3087 if (next_is_word_character == Trace::UNKNOWN) {
3088 Label before_non_word;
3089 Label before_word;
3090 if (trace->characters_preloaded() != 1) {
3091 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3092 }
3093 // Fall through on non-word.
3094 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3095 // Next character is not a word character.
3096 assembler->Bind(&before_non_word);
3097 Label ok;
3098 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3099 assembler->GoTo(&ok);
3100
3101 assembler->Bind(&before_word);
3102 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3103 assembler->Bind(&ok);
3104 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3105 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3106 } else {
3107 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3108 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3109 }
3110}
3111
3112
3113void AssertionNode::BacktrackIfPrevious(
3114 RegExpCompiler* compiler,
3115 Trace* trace,
3116 AssertionNode::IfPrevious backtrack_if_previous) {
3117 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3118 Trace new_trace(*trace);
3119 new_trace.InvalidateCurrentCharacter();
3120
3121 Label fall_through, dummy;
3122
3123 Label* non_word = backtrack_if_previous == kIsNonWord ?
3124 new_trace.backtrack() :
3125 &fall_through;
3126 Label* word = backtrack_if_previous == kIsNonWord ?
3127 &fall_through :
3128 new_trace.backtrack();
3129
3130 if (new_trace.cp_offset() == 0) {
3131 // The start of input counts as a non-word character, so the question is
3132 // decided if we are at the start.
3133 assembler->CheckAtStart(non_word);
3134 }
3135 // We already checked that we are not at the start of input so it must be
3136 // OK to load the previous character.
3137 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3138 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3139
3140 assembler->Bind(&fall_through);
3141 on_success()->Emit(compiler, &new_trace);
3142}
3143
3144
3145void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3146 RegExpCompiler* compiler,
3147 int filled_in,
3148 bool not_at_start) {
3149 if (assertion_type_ == AT_START && not_at_start) {
3150 details->set_cannot_match();
3151 return;
3152 }
3153 return on_success()->GetQuickCheckDetails(details,
3154 compiler,
3155 filled_in,
3156 not_at_start);
3157}
3158
3159
3160void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3161 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3162 switch (assertion_type_) {
3163 case AT_END: {
3164 Label ok;
3165 assembler->CheckPosition(trace->cp_offset(), &ok);
3166 assembler->GoTo(trace->backtrack());
3167 assembler->Bind(&ok);
3168 break;
3169 }
3170 case AT_START: {
3171 if (trace->at_start() == Trace::FALSE_VALUE) {
3172 assembler->GoTo(trace->backtrack());
3173 return;
3174 }
3175 if (trace->at_start() == Trace::UNKNOWN) {
3176 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3177 Trace at_start_trace = *trace;
3178 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3179 on_success()->Emit(compiler, &at_start_trace);
3180 return;
3181 }
3182 }
3183 break;
3184 case AFTER_NEWLINE:
3185 EmitHat(compiler, on_success(), trace);
3186 return;
3187 case AT_BOUNDARY:
3188 case AT_NON_BOUNDARY: {
3189 EmitBoundaryCheck(compiler, trace);
3190 return;
3191 }
3192 }
3193 on_success()->Emit(compiler, trace);
3194}
3195
3196
3197static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3198 if (quick_check == NULL) return false;
3199 if (offset >= quick_check->characters()) return false;
3200 return quick_check->positions(offset)->determines_perfectly;
3201}
3202
3203
3204static void UpdateBoundsCheck(int index, int* checked_up_to) {
3205 if (index > *checked_up_to) {
3206 *checked_up_to = index;
3207 }
3208}
3209
3210
3211// We call this repeatedly to generate code for each pass over the text node.
3212// The passes are in increasing order of difficulty because we hope one
3213// of the first passes will fail in which case we are saved the work of the
3214// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3215// we will check the '%' in the first pass, the case independent 'a' in the
3216// second pass and the character class in the last pass.
3217//
3218// The passes are done from right to left, so for example to test for /bar/
3219// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3220// and then a 'b' with offset 0. This means we can avoid the end-of-input
3221// bounds check most of the time. In the example we only need to check for
3222// end-of-input when loading the putative 'r'.
3223//
3224// A slight complication involves the fact that the first character may already
3225// be fetched into a register by the previous node. In this case we want to
3226// do the test for that character first. We do this in separate passes. The
3227// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3228// pass has been performed then subsequent passes will have true in
3229// first_element_checked to indicate that that character does not need to be
3230// checked again.
3231//
3232// In addition to all this we are passed a Trace, which can
3233// contain an AlternativeGeneration object. In this AlternativeGeneration
3234// object we can see details of any quick check that was already passed in
3235// order to get to the code we are now generating. The quick check can involve
3236// loading characters, which means we do not need to recheck the bounds
3237// up to the limit the quick check already checked. In addition the quick
3238// check can have involved a mask and compare operation which may simplify
3239// or obviate the need for further checks at some character positions.
3240void TextNode::TextEmitPass(RegExpCompiler* compiler,
3241 TextEmitPassType pass,
3242 bool preloaded,
3243 Trace* trace,
3244 bool first_element_checked,
3245 int* checked_up_to) {
3246 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3247 Isolate* isolate = assembler->isolate();
3248 bool one_byte = compiler->one_byte();
3249 Label* backtrack = trace->backtrack();
3250 QuickCheckDetails* quick_check = trace->quick_check_performed();
3251 int element_count = elements()->length();
3252 int backward_offset = read_backward() ? -Length() : 0;
3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3254 TextElement elm = elements()->at(i);
3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3256 if (elm.text_type() == TextElement::ATOM) {
3257 Vector<const uc16> quarks = elm.atom()->data();
3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3259 if (first_element_checked && i == 0 && j == 0) continue;
3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3261 EmitCharacterFunction* emit_function = NULL;
3262 switch (pass) {
3263 case NON_LATIN1_MATCH:
3264 DCHECK(one_byte);
3265 if (quarks[j] > String::kMaxOneByteCharCode) {
3266 assembler->GoTo(backtrack);
3267 return;
3268 }
3269 break;
3270 case NON_LETTER_CHARACTER_MATCH:
3271 emit_function = &EmitAtomNonLetter;
3272 break;
3273 case SIMPLE_CHARACTER_MATCH:
3274 emit_function = &EmitSimpleCharacter;
3275 break;
3276 case CASE_CHARACTER_MATCH:
3277 emit_function = &EmitAtomLetter;
3278 break;
3279 default:
3280 break;
3281 }
3282 if (emit_function != NULL) {
3283 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3284 bool bound_checked =
3285 emit_function(isolate, compiler, quarks[j], backtrack,
3286 cp_offset + j, bounds_check, preloaded);
3287 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3288 }
3289 }
3290 } else {
3291 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3292 if (pass == CHARACTER_CLASS_MATCH) {
3293 if (first_element_checked && i == 0) continue;
3294 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3295 RegExpCharacterClass* cc = elm.char_class();
3296 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3297 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3298 bounds_check, preloaded, zone());
3299 UpdateBoundsCheck(cp_offset, checked_up_to);
3300 }
3301 }
3302 }
3303}
3304
3305
3306int TextNode::Length() {
3307 TextElement elm = elements()->last();
3308 DCHECK(elm.cp_offset() >= 0);
3309 return elm.cp_offset() + elm.length();
3310}
3311
3312
3313bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3314 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3315 if (ignore_case) {
3316 return pass == SIMPLE_CHARACTER_MATCH;
3317 } else {
3318 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3319 }
3320}
3321
3322
Ben Murdoch097c5b22016-05-18 11:27:45 +01003323TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3324 ZoneList<CharacterRange>* ranges,
3325 bool read_backward,
3326 RegExpNode* on_success) {
3327 DCHECK_NOT_NULL(ranges);
3328 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3329 elms->Add(
3330 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3331 zone);
3332 return new (zone) TextNode(elms, read_backward, on_success);
3333}
3334
3335
3336TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3337 CharacterRange trail,
3338 bool read_backward,
3339 RegExpNode* on_success) {
3340 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3341 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3342 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3343 elms->Add(TextElement::CharClass(
3344 new (zone) RegExpCharacterClass(lead_ranges, false)),
3345 zone);
3346 elms->Add(TextElement::CharClass(
3347 new (zone) RegExpCharacterClass(trail_ranges, false)),
3348 zone);
3349 return new (zone) TextNode(elms, read_backward, on_success);
3350}
3351
3352
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003353// This generates the code to match a text node. A text node can contain
3354// straight character sequences (possibly to be matched in a case-independent
3355// way) and character classes. For efficiency we do not do this in a single
3356// pass from left to right. Instead we pass over the text node several times,
3357// emitting code for some character positions every time. See the comment on
3358// TextEmitPass for details.
3359void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3360 LimitResult limit_result = LimitVersions(compiler, trace);
3361 if (limit_result == DONE) return;
3362 DCHECK(limit_result == CONTINUE);
3363
3364 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3365 compiler->SetRegExpTooBig();
3366 return;
3367 }
3368
3369 if (compiler->one_byte()) {
3370 int dummy = 0;
3371 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3372 }
3373
3374 bool first_elt_done = false;
3375 int bound_checked_to = trace->cp_offset() - 1;
3376 bound_checked_to += trace->bound_checked_up_to();
3377
3378 // If a character is preloaded into the current character register then
3379 // check that now.
3380 if (trace->characters_preloaded() == 1) {
3381 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3382 if (!SkipPass(pass, compiler->ignore_case())) {
3383 TextEmitPass(compiler,
3384 static_cast<TextEmitPassType>(pass),
3385 true,
3386 trace,
3387 false,
3388 &bound_checked_to);
3389 }
3390 }
3391 first_elt_done = true;
3392 }
3393
3394 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3395 if (!SkipPass(pass, compiler->ignore_case())) {
3396 TextEmitPass(compiler,
3397 static_cast<TextEmitPassType>(pass),
3398 false,
3399 trace,
3400 first_elt_done,
3401 &bound_checked_to);
3402 }
3403 }
3404
3405 Trace successor_trace(*trace);
3406 // If we advance backward, we may end up at the start.
3407 successor_trace.AdvanceCurrentPositionInTrace(
3408 read_backward() ? -Length() : Length(), compiler);
3409 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3410 : Trace::FALSE_VALUE);
3411 RecursionCheck rc(compiler);
3412 on_success()->Emit(compiler, &successor_trace);
3413}
3414
3415
3416void Trace::InvalidateCurrentCharacter() {
3417 characters_preloaded_ = 0;
3418}
3419
3420
3421void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3422 // We don't have an instruction for shifting the current character register
3423 // down or for using a shifted value for anything so lets just forget that
3424 // we preloaded any characters into it.
3425 characters_preloaded_ = 0;
3426 // Adjust the offsets of the quick check performed information. This
3427 // information is used to find out what we already determined about the
3428 // characters by means of mask and compare.
3429 quick_check_performed_.Advance(by, compiler->one_byte());
3430 cp_offset_ += by;
3431 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3432 compiler->SetRegExpTooBig();
3433 cp_offset_ = 0;
3434 }
3435 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3436}
3437
3438
3439void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3440 int element_count = elements()->length();
3441 for (int i = 0; i < element_count; i++) {
3442 TextElement elm = elements()->at(i);
3443 if (elm.text_type() == TextElement::CHAR_CLASS) {
3444 RegExpCharacterClass* cc = elm.char_class();
3445 // None of the standard character classes is different in the case
3446 // independent case and it slows us down if we don't know that.
3447 if (cc->is_standard(zone())) continue;
3448 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003449 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003450 }
3451 }
3452}
3453
3454
3455int TextNode::GreedyLoopTextLength() { return Length(); }
3456
3457
3458RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3459 RegExpCompiler* compiler) {
3460 if (read_backward()) return NULL;
3461 if (elements()->length() != 1) return NULL;
3462 TextElement elm = elements()->at(0);
3463 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3464 RegExpCharacterClass* node = elm.char_class();
3465 ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01003466 CharacterRange::Canonicalize(ranges);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003467 if (node->is_negated()) {
3468 return ranges->length() == 0 ? on_success() : NULL;
3469 }
3470 if (ranges->length() != 1) return NULL;
3471 uint32_t max_char;
3472 if (compiler->one_byte()) {
3473 max_char = String::kMaxOneByteCharCode;
3474 } else {
3475 max_char = String::kMaxUtf16CodeUnit;
3476 }
3477 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3478}
3479
3480
3481// Finds the fixed match length of a sequence of nodes that goes from
3482// this alternative and back to this choice node. If there are variable
3483// length nodes or other complications in the way then return a sentinel
3484// value indicating that a greedy loop cannot be constructed.
3485int ChoiceNode::GreedyLoopTextLengthForAlternative(
3486 GuardedAlternative* alternative) {
3487 int length = 0;
3488 RegExpNode* node = alternative->node();
3489 // Later we will generate code for all these text nodes using recursion
3490 // so we have to limit the max number.
3491 int recursion_depth = 0;
3492 while (node != this) {
3493 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3494 return kNodeIsTooComplexForGreedyLoops;
3495 }
3496 int node_length = node->GreedyLoopTextLength();
3497 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3498 return kNodeIsTooComplexForGreedyLoops;
3499 }
3500 length += node_length;
3501 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3502 node = seq_node->on_success();
3503 }
3504 return read_backward() ? -length : length;
3505}
3506
3507
3508void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3509 DCHECK_NULL(loop_node_);
3510 AddAlternative(alt);
3511 loop_node_ = alt.node();
3512}
3513
3514
3515void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3516 DCHECK_NULL(continue_node_);
3517 AddAlternative(alt);
3518 continue_node_ = alt.node();
3519}
3520
3521
3522void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3523 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3524 if (trace->stop_node() == this) {
3525 // Back edge of greedy optimized loop node graph.
3526 int text_length =
3527 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3528 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3529 // Update the counter-based backtracking info on the stack. This is an
3530 // optimization for greedy loops (see below).
3531 DCHECK(trace->cp_offset() == text_length);
3532 macro_assembler->AdvanceCurrentPosition(text_length);
3533 macro_assembler->GoTo(trace->loop_label());
3534 return;
3535 }
3536 DCHECK_NULL(trace->stop_node());
3537 if (!trace->is_trivial()) {
3538 trace->Flush(compiler, this);
3539 return;
3540 }
3541 ChoiceNode::Emit(compiler, trace);
3542}
3543
3544
3545int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3546 int eats_at_least) {
3547 int preload_characters = Min(4, eats_at_least);
3548 if (compiler->macro_assembler()->CanReadUnaligned()) {
3549 bool one_byte = compiler->one_byte();
3550 if (one_byte) {
3551 if (preload_characters > 4) preload_characters = 4;
3552 // We can't preload 3 characters because there is no machine instruction
3553 // to do that. We can't just load 4 because we could be reading
3554 // beyond the end of the string, which could cause a memory fault.
3555 if (preload_characters == 3) preload_characters = 2;
3556 } else {
3557 if (preload_characters > 2) preload_characters = 2;
3558 }
3559 } else {
3560 if (preload_characters > 1) preload_characters = 1;
3561 }
3562 return preload_characters;
3563}
3564
3565
3566// This class is used when generating the alternatives in a choice node. It
3567// records the way the alternative is being code generated.
3568class AlternativeGeneration: public Malloced {
3569 public:
3570 AlternativeGeneration()
3571 : possible_success(),
3572 expects_preload(false),
3573 after(),
3574 quick_check_details() { }
3575 Label possible_success;
3576 bool expects_preload;
3577 Label after;
3578 QuickCheckDetails quick_check_details;
3579};
3580
3581
3582// Creates a list of AlternativeGenerations. If the list has a reasonable
3583// size then it is on the stack, otherwise the excess is on the heap.
3584class AlternativeGenerationList {
3585 public:
3586 AlternativeGenerationList(int count, Zone* zone)
3587 : alt_gens_(count, zone) {
3588 for (int i = 0; i < count && i < kAFew; i++) {
3589 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3590 }
3591 for (int i = kAFew; i < count; i++) {
3592 alt_gens_.Add(new AlternativeGeneration(), zone);
3593 }
3594 }
3595 ~AlternativeGenerationList() {
3596 for (int i = kAFew; i < alt_gens_.length(); i++) {
3597 delete alt_gens_[i];
3598 alt_gens_[i] = NULL;
3599 }
3600 }
3601
3602 AlternativeGeneration* at(int i) {
3603 return alt_gens_[i];
3604 }
3605
3606 private:
3607 static const int kAFew = 10;
3608 ZoneList<AlternativeGeneration*> alt_gens_;
3609 AlternativeGeneration a_few_alt_gens_[kAFew];
3610};
3611
3612
Ben Murdoch097c5b22016-05-18 11:27:45 +01003613static const uc32 kRangeEndMarker = 0x110000;
3614
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003615// The '2' variant is has inclusive from and exclusive to.
3616// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3617// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch097c5b22016-05-18 11:27:45 +01003618static const int kSpaceRanges[] = {
3619 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3620 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3621 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003622static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3623
3624static const int kWordRanges[] = {
Ben Murdoch097c5b22016-05-18 11:27:45 +01003625 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003626static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003627static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003628static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003629static const int kSurrogateRanges[] = {
3630 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003631static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01003632static const int kLineTerminatorRanges[] = {
3633 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003634static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3635
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003636void BoyerMoorePositionInfo::Set(int character) {
3637 SetInterval(Interval(character, character));
3638}
3639
3640
3641void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3642 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3643 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3644 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3645 surrogate_ =
3646 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3647 if (interval.to() - interval.from() >= kMapSize - 1) {
3648 if (map_count_ != kMapSize) {
3649 map_count_ = kMapSize;
3650 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3651 }
3652 return;
3653 }
3654 for (int i = interval.from(); i <= interval.to(); i++) {
3655 int mod_character = (i & kMask);
3656 if (!map_->at(mod_character)) {
3657 map_count_++;
3658 map_->at(mod_character) = true;
3659 }
3660 if (map_count_ == kMapSize) return;
3661 }
3662}
3663
3664
3665void BoyerMoorePositionInfo::SetAll() {
3666 s_ = w_ = d_ = kLatticeUnknown;
3667 if (map_count_ != kMapSize) {
3668 map_count_ = kMapSize;
3669 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3670 }
3671}
3672
3673
3674BoyerMooreLookahead::BoyerMooreLookahead(
3675 int length, RegExpCompiler* compiler, Zone* zone)
3676 : length_(length),
3677 compiler_(compiler) {
3678 if (compiler->one_byte()) {
3679 max_char_ = String::kMaxOneByteCharCode;
3680 } else {
3681 max_char_ = String::kMaxUtf16CodeUnit;
3682 }
3683 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3684 for (int i = 0; i < length; i++) {
3685 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3686 }
3687}
3688
3689
3690// Find the longest range of lookahead that has the fewest number of different
3691// characters that can occur at a given position. Since we are optimizing two
3692// different parameters at once this is a tradeoff.
3693bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3694 int biggest_points = 0;
3695 // If more than 32 characters out of 128 can occur it is unlikely that we can
3696 // be lucky enough to step forwards much of the time.
3697 const int kMaxMax = 32;
3698 for (int max_number_of_chars = 4;
3699 max_number_of_chars < kMaxMax;
3700 max_number_of_chars *= 2) {
3701 biggest_points =
3702 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3703 }
3704 if (biggest_points == 0) return false;
3705 return true;
3706}
3707
3708
3709// Find the highest-points range between 0 and length_ where the character
3710// information is not too vague. 'Too vague' means that there are more than
3711// max_number_of_chars that can occur at this position. Calculates the number
3712// of points as the product of width-of-the-range and
3713// probability-of-finding-one-of-the-characters, where the probability is
3714// calculated using the frequency distribution of the sample subject string.
3715int BoyerMooreLookahead::FindBestInterval(
3716 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3717 int biggest_points = old_biggest_points;
3718 static const int kSize = RegExpMacroAssembler::kTableSize;
3719 for (int i = 0; i < length_; ) {
3720 while (i < length_ && Count(i) > max_number_of_chars) i++;
3721 if (i == length_) break;
3722 int remembered_from = i;
3723 bool union_map[kSize];
3724 for (int j = 0; j < kSize; j++) union_map[j] = false;
3725 while (i < length_ && Count(i) <= max_number_of_chars) {
3726 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3727 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3728 i++;
3729 }
3730 int frequency = 0;
3731 for (int j = 0; j < kSize; j++) {
3732 if (union_map[j]) {
3733 // Add 1 to the frequency to give a small per-character boost for
3734 // the cases where our sampling is not good enough and many
3735 // characters have a frequency of zero. This means the frequency
3736 // can theoretically be up to 2*kSize though we treat it mostly as
3737 // a fraction of kSize.
3738 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3739 }
3740 }
3741 // We use the probability of skipping times the distance we are skipping to
3742 // judge the effectiveness of this. Actually we have a cut-off: By
3743 // dividing by 2 we switch off the skipping if the probability of skipping
3744 // is less than 50%. This is because the multibyte mask-and-compare
3745 // skipping in quickcheck is more likely to do well on this case.
3746 bool in_quickcheck_range =
3747 ((i - remembered_from < 4) ||
3748 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3749 // Called 'probability' but it is only a rough estimate and can actually
3750 // be outside the 0-kSize range.
3751 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3752 int points = (i - remembered_from) * probability;
3753 if (points > biggest_points) {
3754 *from = remembered_from;
3755 *to = i - 1;
3756 biggest_points = points;
3757 }
3758 }
3759 return biggest_points;
3760}
3761
3762
3763// Take all the characters that will not prevent a successful match if they
3764// occur in the subject string in the range between min_lookahead and
3765// max_lookahead (inclusive) measured from the current position. If the
3766// character at max_lookahead offset is not one of these characters, then we
3767// can safely skip forwards by the number of characters in the range.
3768int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3769 int max_lookahead,
3770 Handle<ByteArray> boolean_skip_table) {
3771 const int kSize = RegExpMacroAssembler::kTableSize;
3772
3773 const int kSkipArrayEntry = 0;
3774 const int kDontSkipArrayEntry = 1;
3775
3776 for (int i = 0; i < kSize; i++) {
3777 boolean_skip_table->set(i, kSkipArrayEntry);
3778 }
3779 int skip = max_lookahead + 1 - min_lookahead;
3780
3781 for (int i = max_lookahead; i >= min_lookahead; i--) {
3782 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3783 for (int j = 0; j < kSize; j++) {
3784 if (map->at(j)) {
3785 boolean_skip_table->set(j, kDontSkipArrayEntry);
3786 }
3787 }
3788 }
3789
3790 return skip;
3791}
3792
3793
3794// See comment above on the implementation of GetSkipTable.
3795void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3796 const int kSize = RegExpMacroAssembler::kTableSize;
3797
3798 int min_lookahead = 0;
3799 int max_lookahead = 0;
3800
3801 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3802
3803 bool found_single_character = false;
3804 int single_character = 0;
3805 for (int i = max_lookahead; i >= min_lookahead; i--) {
3806 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3807 if (map->map_count() > 1 ||
3808 (found_single_character && map->map_count() != 0)) {
3809 found_single_character = false;
3810 break;
3811 }
3812 for (int j = 0; j < kSize; j++) {
3813 if (map->at(j)) {
3814 found_single_character = true;
3815 single_character = j;
3816 break;
3817 }
3818 }
3819 }
3820
3821 int lookahead_width = max_lookahead + 1 - min_lookahead;
3822
3823 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3824 // The mask-compare can probably handle this better.
3825 return;
3826 }
3827
3828 if (found_single_character) {
3829 Label cont, again;
3830 masm->Bind(&again);
3831 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3832 if (max_char_ > kSize) {
3833 masm->CheckCharacterAfterAnd(single_character,
3834 RegExpMacroAssembler::kTableMask,
3835 &cont);
3836 } else {
3837 masm->CheckCharacter(single_character, &cont);
3838 }
3839 masm->AdvanceCurrentPosition(lookahead_width);
3840 masm->GoTo(&again);
3841 masm->Bind(&cont);
3842 return;
3843 }
3844
3845 Factory* factory = masm->isolate()->factory();
3846 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3847 int skip_distance = GetSkipTable(
3848 min_lookahead, max_lookahead, boolean_skip_table);
3849 DCHECK(skip_distance != 0);
3850
3851 Label cont, again;
3852 masm->Bind(&again);
3853 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3854 masm->CheckBitInTable(boolean_skip_table, &cont);
3855 masm->AdvanceCurrentPosition(skip_distance);
3856 masm->GoTo(&again);
3857 masm->Bind(&cont);
3858}
3859
3860
3861/* Code generation for choice nodes.
3862 *
3863 * We generate quick checks that do a mask and compare to eliminate a
3864 * choice. If the quick check succeeds then it jumps to the continuation to
3865 * do slow checks and check subsequent nodes. If it fails (the common case)
3866 * it falls through to the next choice.
3867 *
3868 * Here is the desired flow graph. Nodes directly below each other imply
3869 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3870 * 3 doesn't have a quick check so we have to call the slow check.
3871 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3872 * regexp continuation is generated directly after the Sn node, up to the
3873 * next GoTo if we decide to reuse some already generated code. Some
3874 * nodes expect preload_characters to be preloaded into the current
3875 * character register. R nodes do this preloading. Vertices are marked
3876 * F for failures and S for success (possible success in the case of quick
3877 * nodes). L, V, < and > are used as arrow heads.
3878 *
3879 * ----------> R
3880 * |
3881 * V
3882 * Q1 -----> S1
3883 * | S /
3884 * F| /
3885 * | F/
3886 * | /
3887 * | R
3888 * | /
3889 * V L
3890 * Q2 -----> S2
3891 * | S /
3892 * F| /
3893 * | F/
3894 * | /
3895 * | R
3896 * | /
3897 * V L
3898 * S3
3899 * |
3900 * F|
3901 * |
3902 * R
3903 * |
3904 * backtrack V
3905 * <----------Q4
3906 * \ F |
3907 * \ |S
3908 * \ F V
3909 * \-----S4
3910 *
3911 * For greedy loops we push the current position, then generate the code that
3912 * eats the input specially in EmitGreedyLoop. The other choice (the
3913 * continuation) is generated by the normal code in EmitChoices, and steps back
3914 * in the input to the starting position when it fails to match. The loop code
3915 * looks like this (U is the unwind code that steps back in the greedy loop).
3916 *
3917 * _____
3918 * / \
3919 * V |
3920 * ----------> S1 |
3921 * /| |
3922 * / |S |
3923 * F/ \_____/
3924 * /
3925 * |<-----
3926 * | \
3927 * V |S
3928 * Q2 ---> U----->backtrack
3929 * | F /
3930 * S| /
3931 * V F /
3932 * S2--/
3933 */
3934
3935GreedyLoopState::GreedyLoopState(bool not_at_start) {
3936 counter_backtrack_trace_.set_backtrack(&label_);
3937 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3938}
3939
3940
3941void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3942#ifdef DEBUG
3943 int choice_count = alternatives_->length();
3944 for (int i = 0; i < choice_count - 1; i++) {
3945 GuardedAlternative alternative = alternatives_->at(i);
3946 ZoneList<Guard*>* guards = alternative.guards();
3947 int guard_count = (guards == NULL) ? 0 : guards->length();
3948 for (int j = 0; j < guard_count; j++) {
3949 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3950 }
3951 }
3952#endif
3953}
3954
3955
3956void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3957 Trace* current_trace,
3958 PreloadState* state) {
3959 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3960 // Save some time by looking at most one machine word ahead.
3961 state->eats_at_least_ =
3962 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3963 current_trace->at_start() == Trace::FALSE_VALUE);
3964 }
3965 state->preload_characters_ =
3966 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3967
3968 state->preload_is_current_ =
3969 (current_trace->characters_preloaded() == state->preload_characters_);
3970 state->preload_has_checked_bounds_ = state->preload_is_current_;
3971}
3972
3973
3974void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3975 int choice_count = alternatives_->length();
3976
Ben Murdoch097c5b22016-05-18 11:27:45 +01003977 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3978 alternatives_->at(0).node()->Emit(compiler, trace);
3979 return;
3980 }
3981
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00003982 AssertGuardsMentionRegisters(trace);
3983
3984 LimitResult limit_result = LimitVersions(compiler, trace);
3985 if (limit_result == DONE) return;
3986 DCHECK(limit_result == CONTINUE);
3987
3988 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3989 // other choice nodes we only flush if we are out of code size budget.
3990 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3991 trace->Flush(compiler, this);
3992 return;
3993 }
3994
3995 RecursionCheck rc(compiler);
3996
3997 PreloadState preload;
3998 preload.init();
3999 GreedyLoopState greedy_loop_state(not_at_start());
4000
4001 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
4002 AlternativeGenerationList alt_gens(choice_count, zone());
4003
4004 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4005 trace = EmitGreedyLoop(compiler,
4006 trace,
4007 &alt_gens,
4008 &preload,
4009 &greedy_loop_state,
4010 text_length);
4011 } else {
4012 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4013 // match the traces produced pre-cleanup.
4014 Label second_choice;
4015 compiler->macro_assembler()->Bind(&second_choice);
4016
4017 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4018
4019 EmitChoices(compiler,
4020 &alt_gens,
4021 0,
4022 trace,
4023 &preload);
4024 }
4025
4026 // At this point we need to generate slow checks for the alternatives where
4027 // the quick check was inlined. We can recognize these because the associated
4028 // label was bound.
4029 int new_flush_budget = trace->flush_budget() / choice_count;
4030 for (int i = 0; i < choice_count; i++) {
4031 AlternativeGeneration* alt_gen = alt_gens.at(i);
4032 Trace new_trace(*trace);
4033 // If there are actions to be flushed we have to limit how many times
4034 // they are flushed. Take the budget of the parent trace and distribute
4035 // it fairly amongst the children.
4036 if (new_trace.actions() != NULL) {
4037 new_trace.set_flush_budget(new_flush_budget);
4038 }
4039 bool next_expects_preload =
4040 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4041 EmitOutOfLineContinuation(compiler,
4042 &new_trace,
4043 alternatives_->at(i),
4044 alt_gen,
4045 preload.preload_characters_,
4046 next_expects_preload);
4047 }
4048}
4049
4050
4051Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4052 Trace* trace,
4053 AlternativeGenerationList* alt_gens,
4054 PreloadState* preload,
4055 GreedyLoopState* greedy_loop_state,
4056 int text_length) {
4057 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4058 // Here we have special handling for greedy loops containing only text nodes
4059 // and other simple nodes. These are handled by pushing the current
4060 // position on the stack and then incrementing the current position each
4061 // time around the switch. On backtrack we decrement the current position
4062 // and check it against the pushed value. This avoids pushing backtrack
4063 // information for each iteration of the loop, which could take up a lot of
4064 // space.
4065 DCHECK(trace->stop_node() == NULL);
4066 macro_assembler->PushCurrentPosition();
4067 Label greedy_match_failed;
4068 Trace greedy_match_trace;
4069 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4070 greedy_match_trace.set_backtrack(&greedy_match_failed);
4071 Label loop_label;
4072 macro_assembler->Bind(&loop_label);
4073 greedy_match_trace.set_stop_node(this);
4074 greedy_match_trace.set_loop_label(&loop_label);
4075 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4076 macro_assembler->Bind(&greedy_match_failed);
4077
4078 Label second_choice; // For use in greedy matches.
4079 macro_assembler->Bind(&second_choice);
4080
4081 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4082
4083 EmitChoices(compiler,
4084 alt_gens,
4085 1,
4086 new_trace,
4087 preload);
4088
4089 macro_assembler->Bind(greedy_loop_state->label());
4090 // If we have unwound to the bottom then backtrack.
4091 macro_assembler->CheckGreedyLoop(trace->backtrack());
4092 // Otherwise try the second priority at an earlier position.
4093 macro_assembler->AdvanceCurrentPosition(-text_length);
4094 macro_assembler->GoTo(&second_choice);
4095 return new_trace;
4096}
4097
4098int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4099 Trace* trace) {
4100 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4101 if (alternatives_->length() != 2) return eats_at_least;
4102
4103 GuardedAlternative alt1 = alternatives_->at(1);
4104 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4105 return eats_at_least;
4106 }
4107 RegExpNode* eats_anything_node = alt1.node();
4108 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4109 return eats_at_least;
4110 }
4111
4112 // Really we should be creating a new trace when we execute this function,
4113 // but there is no need, because the code it generates cannot backtrack, and
4114 // we always arrive here with a trivial trace (since it's the entry to a
4115 // loop. That also implies that there are no preloaded characters, which is
4116 // good, because it means we won't be violating any assumptions by
4117 // overwriting those characters with new load instructions.
4118 DCHECK(trace->is_trivial());
4119
4120 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4121 Isolate* isolate = macro_assembler->isolate();
4122 // At this point we know that we are at a non-greedy loop that will eat
4123 // any character one at a time. Any non-anchored regexp has such a
4124 // loop prepended to it in order to find where it starts. We look for
4125 // a pattern of the form ...abc... where we can look 6 characters ahead
4126 // and step forwards 3 if the character is not one of abc. Abc need
4127 // not be atoms, they can be any reasonably limited character class or
4128 // small alternation.
4129 BoyerMooreLookahead* bm = bm_info(false);
4130 if (bm == NULL) {
4131 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4132 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4133 kRecursionBudget,
4134 false));
4135 if (eats_at_least >= 1) {
4136 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4137 compiler,
4138 zone());
4139 GuardedAlternative alt0 = alternatives_->at(0);
4140 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4141 }
4142 }
4143 if (bm != NULL) {
4144 bm->EmitSkipInstructions(macro_assembler);
4145 }
4146 return eats_at_least;
4147}
4148
4149
4150void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4151 AlternativeGenerationList* alt_gens,
4152 int first_choice,
4153 Trace* trace,
4154 PreloadState* preload) {
4155 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4156 SetUpPreLoad(compiler, trace, preload);
4157
4158 // For now we just call all choices one after the other. The idea ultimately
4159 // is to use the Dispatch table to try only the relevant ones.
4160 int choice_count = alternatives_->length();
4161
4162 int new_flush_budget = trace->flush_budget() / choice_count;
4163
4164 for (int i = first_choice; i < choice_count; i++) {
4165 bool is_last = i == choice_count - 1;
4166 bool fall_through_on_failure = !is_last;
4167 GuardedAlternative alternative = alternatives_->at(i);
4168 AlternativeGeneration* alt_gen = alt_gens->at(i);
4169 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4170 ZoneList<Guard*>* guards = alternative.guards();
4171 int guard_count = (guards == NULL) ? 0 : guards->length();
4172 Trace new_trace(*trace);
4173 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4174 preload->preload_characters_ :
4175 0);
4176 if (preload->preload_has_checked_bounds_) {
4177 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4178 }
4179 new_trace.quick_check_performed()->Clear();
4180 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4181 if (!is_last) {
4182 new_trace.set_backtrack(&alt_gen->after);
4183 }
4184 alt_gen->expects_preload = preload->preload_is_current_;
4185 bool generate_full_check_inline = false;
4186 if (compiler->optimize() &&
4187 try_to_emit_quick_check_for_alternative(i == 0) &&
4188 alternative.node()->EmitQuickCheck(
4189 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4190 &alt_gen->possible_success, &alt_gen->quick_check_details,
4191 fall_through_on_failure)) {
4192 // Quick check was generated for this choice.
4193 preload->preload_is_current_ = true;
4194 preload->preload_has_checked_bounds_ = true;
4195 // If we generated the quick check to fall through on possible success,
4196 // we now need to generate the full check inline.
4197 if (!fall_through_on_failure) {
4198 macro_assembler->Bind(&alt_gen->possible_success);
4199 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4200 new_trace.set_characters_preloaded(preload->preload_characters_);
4201 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4202 generate_full_check_inline = true;
4203 }
4204 } else if (alt_gen->quick_check_details.cannot_match()) {
4205 if (!fall_through_on_failure) {
4206 macro_assembler->GoTo(trace->backtrack());
4207 }
4208 continue;
4209 } else {
4210 // No quick check was generated. Put the full code here.
4211 // If this is not the first choice then there could be slow checks from
4212 // previous cases that go here when they fail. There's no reason to
4213 // insist that they preload characters since the slow check we are about
4214 // to generate probably can't use it.
4215 if (i != first_choice) {
4216 alt_gen->expects_preload = false;
4217 new_trace.InvalidateCurrentCharacter();
4218 }
4219 generate_full_check_inline = true;
4220 }
4221 if (generate_full_check_inline) {
4222 if (new_trace.actions() != NULL) {
4223 new_trace.set_flush_budget(new_flush_budget);
4224 }
4225 for (int j = 0; j < guard_count; j++) {
4226 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4227 }
4228 alternative.node()->Emit(compiler, &new_trace);
4229 preload->preload_is_current_ = false;
4230 }
4231 macro_assembler->Bind(&alt_gen->after);
4232 }
4233}
4234
4235
4236void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4237 Trace* trace,
4238 GuardedAlternative alternative,
4239 AlternativeGeneration* alt_gen,
4240 int preload_characters,
4241 bool next_expects_preload) {
4242 if (!alt_gen->possible_success.is_linked()) return;
4243
4244 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4245 macro_assembler->Bind(&alt_gen->possible_success);
4246 Trace out_of_line_trace(*trace);
4247 out_of_line_trace.set_characters_preloaded(preload_characters);
4248 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4249 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4250 ZoneList<Guard*>* guards = alternative.guards();
4251 int guard_count = (guards == NULL) ? 0 : guards->length();
4252 if (next_expects_preload) {
4253 Label reload_current_char;
4254 out_of_line_trace.set_backtrack(&reload_current_char);
4255 for (int j = 0; j < guard_count; j++) {
4256 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4257 }
4258 alternative.node()->Emit(compiler, &out_of_line_trace);
4259 macro_assembler->Bind(&reload_current_char);
4260 // Reload the current character, since the next quick check expects that.
4261 // We don't need to check bounds here because we only get into this
4262 // code through a quick check which already did the checked load.
4263 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4264 NULL,
4265 false,
4266 preload_characters);
4267 macro_assembler->GoTo(&(alt_gen->after));
4268 } else {
4269 out_of_line_trace.set_backtrack(&(alt_gen->after));
4270 for (int j = 0; j < guard_count; j++) {
4271 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4272 }
4273 alternative.node()->Emit(compiler, &out_of_line_trace);
4274 }
4275}
4276
4277
4278void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4279 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4280 LimitResult limit_result = LimitVersions(compiler, trace);
4281 if (limit_result == DONE) return;
4282 DCHECK(limit_result == CONTINUE);
4283
4284 RecursionCheck rc(compiler);
4285
4286 switch (action_type_) {
4287 case STORE_POSITION: {
4288 Trace::DeferredCapture
4289 new_capture(data_.u_position_register.reg,
4290 data_.u_position_register.is_capture,
4291 trace);
4292 Trace new_trace = *trace;
4293 new_trace.add_action(&new_capture);
4294 on_success()->Emit(compiler, &new_trace);
4295 break;
4296 }
4297 case INCREMENT_REGISTER: {
4298 Trace::DeferredIncrementRegister
4299 new_increment(data_.u_increment_register.reg);
4300 Trace new_trace = *trace;
4301 new_trace.add_action(&new_increment);
4302 on_success()->Emit(compiler, &new_trace);
4303 break;
4304 }
4305 case SET_REGISTER: {
4306 Trace::DeferredSetRegister
4307 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4308 Trace new_trace = *trace;
4309 new_trace.add_action(&new_set);
4310 on_success()->Emit(compiler, &new_trace);
4311 break;
4312 }
4313 case CLEAR_CAPTURES: {
4314 Trace::DeferredClearCaptures
4315 new_capture(Interval(data_.u_clear_captures.range_from,
4316 data_.u_clear_captures.range_to));
4317 Trace new_trace = *trace;
4318 new_trace.add_action(&new_capture);
4319 on_success()->Emit(compiler, &new_trace);
4320 break;
4321 }
4322 case BEGIN_SUBMATCH:
4323 if (!trace->is_trivial()) {
4324 trace->Flush(compiler, this);
4325 } else {
4326 assembler->WriteCurrentPositionToRegister(
4327 data_.u_submatch.current_position_register, 0);
4328 assembler->WriteStackPointerToRegister(
4329 data_.u_submatch.stack_pointer_register);
4330 on_success()->Emit(compiler, trace);
4331 }
4332 break;
4333 case EMPTY_MATCH_CHECK: {
4334 int start_pos_reg = data_.u_empty_match_check.start_register;
4335 int stored_pos = 0;
4336 int rep_reg = data_.u_empty_match_check.repetition_register;
4337 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4338 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4339 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4340 // If we know we haven't advanced and there is no minimum we
4341 // can just backtrack immediately.
4342 assembler->GoTo(trace->backtrack());
4343 } else if (know_dist && stored_pos < trace->cp_offset()) {
4344 // If we know we've advanced we can generate the continuation
4345 // immediately.
4346 on_success()->Emit(compiler, trace);
4347 } else if (!trace->is_trivial()) {
4348 trace->Flush(compiler, this);
4349 } else {
4350 Label skip_empty_check;
4351 // If we have a minimum number of repetitions we check the current
4352 // number first and skip the empty check if it's not enough.
4353 if (has_minimum) {
4354 int limit = data_.u_empty_match_check.repetition_limit;
4355 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4356 }
4357 // If the match is empty we bail out, otherwise we fall through
4358 // to the on-success continuation.
4359 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4360 trace->backtrack());
4361 assembler->Bind(&skip_empty_check);
4362 on_success()->Emit(compiler, trace);
4363 }
4364 break;
4365 }
4366 case POSITIVE_SUBMATCH_SUCCESS: {
4367 if (!trace->is_trivial()) {
4368 trace->Flush(compiler, this);
4369 return;
4370 }
4371 assembler->ReadCurrentPositionFromRegister(
4372 data_.u_submatch.current_position_register);
4373 assembler->ReadStackPointerFromRegister(
4374 data_.u_submatch.stack_pointer_register);
4375 int clear_register_count = data_.u_submatch.clear_register_count;
4376 if (clear_register_count == 0) {
4377 on_success()->Emit(compiler, trace);
4378 return;
4379 }
4380 int clear_registers_from = data_.u_submatch.clear_register_from;
4381 Label clear_registers_backtrack;
4382 Trace new_trace = *trace;
4383 new_trace.set_backtrack(&clear_registers_backtrack);
4384 on_success()->Emit(compiler, &new_trace);
4385
4386 assembler->Bind(&clear_registers_backtrack);
4387 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4388 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4389
4390 DCHECK(trace->backtrack() == NULL);
4391 assembler->Backtrack();
4392 return;
4393 }
4394 default:
4395 UNREACHABLE();
4396 }
4397}
4398
4399
4400void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4401 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4402 if (!trace->is_trivial()) {
4403 trace->Flush(compiler, this);
4404 return;
4405 }
4406
4407 LimitResult limit_result = LimitVersions(compiler, trace);
4408 if (limit_result == DONE) return;
4409 DCHECK(limit_result == CONTINUE);
4410
4411 RecursionCheck rc(compiler);
4412
4413 DCHECK_EQ(start_reg_ + 1, end_reg_);
4414 if (compiler->ignore_case()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004415 assembler->CheckNotBackReferenceIgnoreCase(
4416 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004417 } else {
4418 assembler->CheckNotBackReference(start_reg_, read_backward(),
4419 trace->backtrack());
4420 }
4421 // We are going to advance backward, so we may end up at the start.
4422 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch097c5b22016-05-18 11:27:45 +01004423
4424 // Check that the back reference does not end inside a surrogate pair.
4425 if (compiler->unicode() && !compiler->one_byte()) {
4426 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4427 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004428 on_success()->Emit(compiler, trace);
4429}
4430
4431
4432// -------------------------------------------------------------------
4433// Dot/dotty output
4434
4435
4436#ifdef DEBUG
4437
4438
4439class DotPrinter: public NodeVisitor {
4440 public:
4441 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4442 : os_(os),
4443 ignore_case_(ignore_case) {}
4444 void PrintNode(const char* label, RegExpNode* node);
4445 void Visit(RegExpNode* node);
4446 void PrintAttributes(RegExpNode* from);
4447 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4448#define DECLARE_VISIT(Type) \
4449 virtual void Visit##Type(Type##Node* that);
4450FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4451#undef DECLARE_VISIT
4452 private:
4453 std::ostream& os_;
4454 bool ignore_case_;
4455};
4456
4457
4458void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4459 os_ << "digraph G {\n graph [label=\"";
4460 for (int i = 0; label[i]; i++) {
4461 switch (label[i]) {
4462 case '\\':
4463 os_ << "\\\\";
4464 break;
4465 case '"':
4466 os_ << "\"";
4467 break;
4468 default:
4469 os_ << label[i];
4470 break;
4471 }
4472 }
4473 os_ << "\"];\n";
4474 Visit(node);
4475 os_ << "}" << std::endl;
4476}
4477
4478
4479void DotPrinter::Visit(RegExpNode* node) {
4480 if (node->info()->visited) return;
4481 node->info()->visited = true;
4482 node->Accept(this);
4483}
4484
4485
4486void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4487 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4488 Visit(on_failure);
4489}
4490
4491
4492class TableEntryBodyPrinter {
4493 public:
4494 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4495 : os_(os),
4496 choice_(choice) {}
4497 void Call(uc16 from, DispatchTable::Entry entry) {
4498 OutSet* out_set = entry.out_set();
4499 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4500 if (out_set->Get(i)) {
4501 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4502 << choice()->alternatives()->at(i).node() << ";\n";
4503 }
4504 }
4505 }
4506 private:
4507 ChoiceNode* choice() { return choice_; }
4508 std::ostream& os_;
4509 ChoiceNode* choice_;
4510};
4511
4512
4513class TableEntryHeaderPrinter {
4514 public:
4515 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4516 : first_(true),
4517 os_(os) {}
4518 void Call(uc16 from, DispatchTable::Entry entry) {
4519 if (first_) {
4520 first_ = false;
4521 } else {
4522 os_ << "|";
4523 }
4524 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4525 OutSet* out_set = entry.out_set();
4526 int priority = 0;
4527 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4528 if (out_set->Get(i)) {
4529 if (priority > 0) os_ << "|";
4530 os_ << "<s" << from << "o" << i << "> " << priority;
4531 priority++;
4532 }
4533 }
4534 os_ << "}}";
4535 }
4536
4537 private:
4538 bool first_;
4539 std::ostream& os_;
4540};
4541
4542
4543class AttributePrinter {
4544 public:
4545 explicit AttributePrinter(std::ostream& os) // NOLINT
4546 : os_(os),
4547 first_(true) {}
4548 void PrintSeparator() {
4549 if (first_) {
4550 first_ = false;
4551 } else {
4552 os_ << "|";
4553 }
4554 }
4555 void PrintBit(const char* name, bool value) {
4556 if (!value) return;
4557 PrintSeparator();
4558 os_ << "{" << name << "}";
4559 }
4560 void PrintPositive(const char* name, int value) {
4561 if (value < 0) return;
4562 PrintSeparator();
4563 os_ << "{" << name << "|" << value << "}";
4564 }
4565
4566 private:
4567 std::ostream& os_;
4568 bool first_;
4569};
4570
4571
4572void DotPrinter::PrintAttributes(RegExpNode* that) {
4573 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4574 << "margin=0.1, fontsize=10, label=\"{";
4575 AttributePrinter printer(os_);
4576 NodeInfo* info = that->info();
4577 printer.PrintBit("NI", info->follows_newline_interest);
4578 printer.PrintBit("WI", info->follows_word_interest);
4579 printer.PrintBit("SI", info->follows_start_interest);
4580 Label* label = that->label();
4581 if (label->is_bound())
4582 printer.PrintPositive("@", label->pos());
4583 os_ << "}\"];\n"
4584 << " a" << that << " -> n" << that
4585 << " [style=dashed, color=grey, arrowhead=none];\n";
4586}
4587
4588
4589static const bool kPrintDispatchTable = false;
4590void DotPrinter::VisitChoice(ChoiceNode* that) {
4591 if (kPrintDispatchTable) {
4592 os_ << " n" << that << " [shape=Mrecord, label=\"";
4593 TableEntryHeaderPrinter header_printer(os_);
4594 that->GetTable(ignore_case_)->ForEach(&header_printer);
4595 os_ << "\"]\n";
4596 PrintAttributes(that);
4597 TableEntryBodyPrinter body_printer(os_, that);
4598 that->GetTable(ignore_case_)->ForEach(&body_printer);
4599 } else {
4600 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4601 for (int i = 0; i < that->alternatives()->length(); i++) {
4602 GuardedAlternative alt = that->alternatives()->at(i);
4603 os_ << " n" << that << " -> n" << alt.node();
4604 }
4605 }
4606 for (int i = 0; i < that->alternatives()->length(); i++) {
4607 GuardedAlternative alt = that->alternatives()->at(i);
4608 alt.node()->Accept(this);
4609 }
4610}
4611
4612
4613void DotPrinter::VisitText(TextNode* that) {
4614 Zone* zone = that->zone();
4615 os_ << " n" << that << " [label=\"";
4616 for (int i = 0; i < that->elements()->length(); i++) {
4617 if (i > 0) os_ << " ";
4618 TextElement elm = that->elements()->at(i);
4619 switch (elm.text_type()) {
4620 case TextElement::ATOM: {
4621 Vector<const uc16> data = elm.atom()->data();
4622 for (int i = 0; i < data.length(); i++) {
4623 os_ << static_cast<char>(data[i]);
4624 }
4625 break;
4626 }
4627 case TextElement::CHAR_CLASS: {
4628 RegExpCharacterClass* node = elm.char_class();
4629 os_ << "[";
4630 if (node->is_negated()) os_ << "^";
4631 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4632 CharacterRange range = node->ranges(zone)->at(j);
4633 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4634 }
4635 os_ << "]";
4636 break;
4637 }
4638 default:
4639 UNREACHABLE();
4640 }
4641 }
4642 os_ << "\", shape=box, peripheries=2];\n";
4643 PrintAttributes(that);
4644 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4645 Visit(that->on_success());
4646}
4647
4648
4649void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4650 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4651 << that->end_register() << "\", shape=doubleoctagon];\n";
4652 PrintAttributes(that);
4653 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4654 Visit(that->on_success());
4655}
4656
4657
4658void DotPrinter::VisitEnd(EndNode* that) {
4659 os_ << " n" << that << " [style=bold, shape=point];\n";
4660 PrintAttributes(that);
4661}
4662
4663
4664void DotPrinter::VisitAssertion(AssertionNode* that) {
4665 os_ << " n" << that << " [";
4666 switch (that->assertion_type()) {
4667 case AssertionNode::AT_END:
4668 os_ << "label=\"$\", shape=septagon";
4669 break;
4670 case AssertionNode::AT_START:
4671 os_ << "label=\"^\", shape=septagon";
4672 break;
4673 case AssertionNode::AT_BOUNDARY:
4674 os_ << "label=\"\\b\", shape=septagon";
4675 break;
4676 case AssertionNode::AT_NON_BOUNDARY:
4677 os_ << "label=\"\\B\", shape=septagon";
4678 break;
4679 case AssertionNode::AFTER_NEWLINE:
4680 os_ << "label=\"(?<=\\n)\", shape=septagon";
4681 break;
4682 }
4683 os_ << "];\n";
4684 PrintAttributes(that);
4685 RegExpNode* successor = that->on_success();
4686 os_ << " n" << that << " -> n" << successor << ";\n";
4687 Visit(successor);
4688}
4689
4690
4691void DotPrinter::VisitAction(ActionNode* that) {
4692 os_ << " n" << that << " [";
4693 switch (that->action_type_) {
4694 case ActionNode::SET_REGISTER:
4695 os_ << "label=\"$" << that->data_.u_store_register.reg
4696 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4697 break;
4698 case ActionNode::INCREMENT_REGISTER:
4699 os_ << "label=\"$" << that->data_.u_increment_register.reg
4700 << "++\", shape=octagon";
4701 break;
4702 case ActionNode::STORE_POSITION:
4703 os_ << "label=\"$" << that->data_.u_position_register.reg
4704 << ":=$pos\", shape=octagon";
4705 break;
4706 case ActionNode::BEGIN_SUBMATCH:
4707 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4708 << ":=$pos,begin\", shape=septagon";
4709 break;
4710 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4711 os_ << "label=\"escape\", shape=septagon";
4712 break;
4713 case ActionNode::EMPTY_MATCH_CHECK:
4714 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4715 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4716 << "<" << that->data_.u_empty_match_check.repetition_limit
4717 << "?\", shape=septagon";
4718 break;
4719 case ActionNode::CLEAR_CAPTURES: {
4720 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4721 << " to $" << that->data_.u_clear_captures.range_to
4722 << "\", shape=septagon";
4723 break;
4724 }
4725 }
4726 os_ << "];\n";
4727 PrintAttributes(that);
4728 RegExpNode* successor = that->on_success();
4729 os_ << " n" << that << " -> n" << successor << ";\n";
4730 Visit(successor);
4731}
4732
4733
4734class DispatchTableDumper {
4735 public:
4736 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4737 void Call(uc16 key, DispatchTable::Entry entry);
4738 private:
4739 std::ostream& os_;
4740};
4741
4742
4743void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4744 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4745 OutSet* set = entry.out_set();
4746 bool first = true;
4747 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4748 if (set->Get(i)) {
4749 if (first) {
4750 first = false;
4751 } else {
4752 os_ << ", ";
4753 }
4754 os_ << i;
4755 }
4756 }
4757 os_ << "}\n";
4758}
4759
4760
4761void DispatchTable::Dump() {
4762 OFStream os(stderr);
4763 DispatchTableDumper dumper(os);
4764 tree()->ForEach(&dumper);
4765}
4766
4767
4768void RegExpEngine::DotPrint(const char* label,
4769 RegExpNode* node,
4770 bool ignore_case) {
4771 OFStream os(stdout);
4772 DotPrinter printer(os, ignore_case);
4773 printer.PrintNode(label, node);
4774}
4775
4776
4777#endif // DEBUG
4778
4779
4780// -------------------------------------------------------------------
4781// Tree to graph conversion
4782
4783RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4784 RegExpNode* on_success) {
4785 ZoneList<TextElement>* elms =
4786 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4787 elms->Add(TextElement::Atom(this), compiler->zone());
4788 return new (compiler->zone())
4789 TextNode(elms, compiler->read_backward(), on_success);
4790}
4791
4792
4793RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4794 RegExpNode* on_success) {
4795 return new (compiler->zone())
4796 TextNode(elements(), compiler->read_backward(), on_success);
4797}
4798
4799
4800static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4801 const int* special_class,
4802 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004803 length--; // Remove final marker.
4804 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004805 DCHECK(ranges->length() != 0);
4806 DCHECK(length != 0);
4807 DCHECK(special_class[0] != 0);
4808 if (ranges->length() != (length >> 1) + 1) {
4809 return false;
4810 }
4811 CharacterRange range = ranges->at(0);
4812 if (range.from() != 0) {
4813 return false;
4814 }
4815 for (int i = 0; i < length; i += 2) {
4816 if (special_class[i] != (range.to() + 1)) {
4817 return false;
4818 }
4819 range = ranges->at((i >> 1) + 1);
4820 if (special_class[i+1] != range.from()) {
4821 return false;
4822 }
4823 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01004824 if (range.to() != String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004825 return false;
4826 }
4827 return true;
4828}
4829
4830
4831static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4832 const int* special_class,
4833 int length) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01004834 length--; // Remove final marker.
4835 DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00004836 if (ranges->length() * 2 != length) {
4837 return false;
4838 }
4839 for (int i = 0; i < length; i += 2) {
4840 CharacterRange range = ranges->at(i >> 1);
4841 if (range.from() != special_class[i] ||
4842 range.to() != special_class[i + 1] - 1) {
4843 return false;
4844 }
4845 }
4846 return true;
4847}
4848
4849
4850bool RegExpCharacterClass::is_standard(Zone* zone) {
4851 // TODO(lrn): Remove need for this function, by not throwing away information
4852 // along the way.
4853 if (is_negated_) {
4854 return false;
4855 }
4856 if (set_.is_standard()) {
4857 return true;
4858 }
4859 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4860 set_.set_standard_set_type('s');
4861 return true;
4862 }
4863 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4864 set_.set_standard_set_type('S');
4865 return true;
4866 }
4867 if (CompareInverseRanges(set_.ranges(zone),
4868 kLineTerminatorRanges,
4869 kLineTerminatorRangeCount)) {
4870 set_.set_standard_set_type('.');
4871 return true;
4872 }
4873 if (CompareRanges(set_.ranges(zone),
4874 kLineTerminatorRanges,
4875 kLineTerminatorRangeCount)) {
4876 set_.set_standard_set_type('n');
4877 return true;
4878 }
4879 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4880 set_.set_standard_set_type('w');
4881 return true;
4882 }
4883 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4884 set_.set_standard_set_type('W');
4885 return true;
4886 }
4887 return false;
4888}
4889
4890
Ben Murdoch097c5b22016-05-18 11:27:45 +01004891UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4892 ZoneList<CharacterRange>* base)
4893 : zone_(zone),
4894 table_(zone),
4895 bmp_(nullptr),
4896 lead_surrogates_(nullptr),
4897 trail_surrogates_(nullptr),
4898 non_bmp_(nullptr) {
4899 // The unicode range splitter categorizes given character ranges into:
4900 // - Code points from the BMP representable by one code unit.
4901 // - Code points outside the BMP that need to be split into surrogate pairs.
4902 // - Lone lead surrogates.
4903 // - Lone trail surrogates.
4904 // Lone surrogates are valid code points, even though no actual characters.
4905 // They require special matching to make sure we do not split surrogate pairs.
4906 // We use the dispatch table to accomplish this. The base range is split up
4907 // by the table by the overlay ranges, and the Call callback is used to
4908 // filter and collect ranges for each category.
4909 for (int i = 0; i < base->length(); i++) {
4910 table_.AddRange(base->at(i), kBase, zone_);
4911 }
4912 // Add overlay ranges.
4913 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4914 kBmpCodePoints, zone_);
4915 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4916 kLeadSurrogates, zone_);
4917 table_.AddRange(
4918 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4919 kTrailSurrogates, zone_);
4920 table_.AddRange(
4921 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4922 kBmpCodePoints, zone_);
4923 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4924 kNonBmpCodePoints, zone_);
4925 table_.ForEach(this);
4926}
4927
4928
4929void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4930 OutSet* outset = entry.out_set();
4931 if (!outset->Get(kBase)) return;
4932 ZoneList<CharacterRange>** target = NULL;
4933 if (outset->Get(kBmpCodePoints)) {
4934 target = &bmp_;
4935 } else if (outset->Get(kLeadSurrogates)) {
4936 target = &lead_surrogates_;
4937 } else if (outset->Get(kTrailSurrogates)) {
4938 target = &trail_surrogates_;
4939 } else {
4940 DCHECK(outset->Get(kNonBmpCodePoints));
4941 target = &non_bmp_;
4942 }
4943 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4944 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4945}
4946
4947
4948void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4949 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4950 ZoneList<CharacterRange>* bmp = splitter->bmp();
4951 if (bmp == nullptr) return;
4952 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4953 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4954}
4955
4956
4957void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4958 RegExpNode* on_success,
4959 UnicodeRangeSplitter* splitter) {
4960 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4961 if (non_bmp == nullptr) return;
4962 DCHECK(compiler->unicode());
4963 DCHECK(!compiler->one_byte());
4964 Zone* zone = compiler->zone();
4965 CharacterRange::Canonicalize(non_bmp);
4966 for (int i = 0; i < non_bmp->length(); i++) {
4967 // Match surrogate pair.
4968 // E.g. [\u10005-\u11005] becomes
4969 // \ud800[\udc05-\udfff]|
4970 // [\ud801-\ud803][\udc00-\udfff]|
4971 // \ud804[\udc00-\udc05]
4972 uc32 from = non_bmp->at(i).from();
4973 uc32 to = non_bmp->at(i).to();
4974 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4975 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4976 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4977 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4978 if (from_l == to_l) {
4979 // The lead surrogate is the same.
4980 result->AddAlternative(
4981 GuardedAlternative(TextNode::CreateForSurrogatePair(
4982 zone, CharacterRange::Singleton(from_l),
4983 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4984 on_success)));
4985 } else {
4986 if (from_t != kTrailSurrogateStart) {
4987 // Add [from_l][from_t-\udfff]
4988 result->AddAlternative(
4989 GuardedAlternative(TextNode::CreateForSurrogatePair(
4990 zone, CharacterRange::Singleton(from_l),
4991 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4992 compiler->read_backward(), on_success)));
4993 from_l++;
4994 }
4995 if (to_t != kTrailSurrogateEnd) {
4996 // Add [to_l][\udc00-to_t]
4997 result->AddAlternative(
4998 GuardedAlternative(TextNode::CreateForSurrogatePair(
4999 zone, CharacterRange::Singleton(to_l),
5000 CharacterRange::Range(kTrailSurrogateStart, to_t),
5001 compiler->read_backward(), on_success)));
5002 to_l--;
5003 }
5004 if (from_l <= to_l) {
5005 // Add [from_l-to_l][\udc00-\udfff]
5006 result->AddAlternative(
5007 GuardedAlternative(TextNode::CreateForSurrogatePair(
5008 zone, CharacterRange::Range(from_l, to_l),
5009 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5010 compiler->read_backward(), on_success)));
5011 }
5012 }
5013 }
5014}
5015
5016
5017RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5018 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5019 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5020 bool read_backward) {
5021 Zone* zone = compiler->zone();
5022 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5023 zone, match, read_backward, on_success);
5024 int stack_register = compiler->UnicodeLookaroundStackRegister();
5025 int position_register = compiler->UnicodeLookaroundPositionRegister();
5026 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5027 position_register);
5028 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5029 zone, lookbehind, !read_backward, lookaround.on_match_success());
5030 return lookaround.ForMatch(negative_match);
5031}
5032
5033
5034RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5035 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5036 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5037 bool read_backward) {
5038 Zone* zone = compiler->zone();
5039 int stack_register = compiler->UnicodeLookaroundStackRegister();
5040 int position_register = compiler->UnicodeLookaroundPositionRegister();
5041 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5042 position_register);
5043 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5044 zone, lookahead, read_backward, lookaround.on_match_success());
5045 return TextNode::CreateForCharacterRanges(
5046 zone, match, read_backward, lookaround.ForMatch(negative_match));
5047}
5048
5049
5050void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5051 RegExpNode* on_success,
5052 UnicodeRangeSplitter* splitter) {
5053 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5054 if (lead_surrogates == nullptr) return;
5055 Zone* zone = compiler->zone();
5056 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5057 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5058 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5059
5060 RegExpNode* match;
5061 if (compiler->read_backward()) {
5062 // Reading backward. Assert that reading forward, there is no trail
5063 // surrogate, and then backward match the lead surrogate.
5064 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5065 compiler, trail_surrogates, lead_surrogates, on_success, true);
5066 } else {
5067 // Reading forward. Forward match the lead surrogate and assert that
5068 // no trail surrogate follows.
5069 match = MatchAndNegativeLookaroundInReadDirection(
5070 compiler, lead_surrogates, trail_surrogates, on_success, false);
5071 }
5072 result->AddAlternative(GuardedAlternative(match));
5073}
5074
5075
5076void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5077 RegExpNode* on_success,
5078 UnicodeRangeSplitter* splitter) {
5079 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5080 if (trail_surrogates == nullptr) return;
5081 Zone* zone = compiler->zone();
5082 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5083 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5084 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5085
5086 RegExpNode* match;
5087 if (compiler->read_backward()) {
5088 // Reading backward. Backward match the trail surrogate and assert that no
5089 // lead surrogate precedes it.
5090 match = MatchAndNegativeLookaroundInReadDirection(
5091 compiler, trail_surrogates, lead_surrogates, on_success, true);
5092 } else {
5093 // Reading forward. Assert that reading backward, there is no lead
5094 // surrogate, and then forward match the trail surrogate.
5095 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5096 compiler, lead_surrogates, trail_surrogates, on_success, false);
5097 }
5098 result->AddAlternative(GuardedAlternative(match));
5099}
5100
5101RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5102 RegExpNode* on_success) {
5103 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5104 DCHECK(!compiler->read_backward());
5105 Zone* zone = compiler->zone();
5106 // Advance any character. If the character happens to be a lead surrogate and
5107 // we advanced into the middle of a surrogate pair, it will work out, as
5108 // nothing will match from there. We will have to advance again, consuming
5109 // the associated trail surrogate.
5110 ZoneList<CharacterRange>* range = CharacterRange::List(
5111 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5112 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5113}
5114
5115
5116void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5117 ZoneList<CharacterRange>* ranges) {
5118#ifdef V8_I18N_SUPPORT
5119 // Use ICU to compute the case fold closure over the ranges.
5120 DCHECK(compiler->unicode());
5121 DCHECK(compiler->ignore_case());
5122 USet* set = uset_openEmpty();
5123 for (int i = 0; i < ranges->length(); i++) {
5124 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5125 }
5126 ranges->Clear();
5127 uset_closeOver(set, USET_CASE_INSENSITIVE);
5128 // Full case mapping map single characters to multiple characters.
5129 // Those are represented as strings in the set. Remove them so that
5130 // we end up with only simple and common case mappings.
5131 uset_removeAllStrings(set);
5132 int item_count = uset_getItemCount(set);
5133 int item_result = 0;
5134 UErrorCode ec = U_ZERO_ERROR;
5135 Zone* zone = compiler->zone();
5136 for (int i = 0; i < item_count; i++) {
5137 uc32 start = 0;
5138 uc32 end = 0;
5139 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5140 ranges->Add(CharacterRange::Range(start, end), zone);
5141 }
5142 // No errors and everything we collected have been ranges.
5143 DCHECK_EQ(U_ZERO_ERROR, ec);
5144 DCHECK_EQ(0, item_result);
5145 uset_close(set);
5146#else
5147 // Fallback if ICU is not included.
5148 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5149 ranges, compiler->one_byte());
5150#endif // V8_I18N_SUPPORT
5151 CharacterRange::Canonicalize(ranges);
5152}
5153
5154
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005155RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5156 RegExpNode* on_success) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005157 set_.Canonicalize();
5158 Zone* zone = compiler->zone();
5159 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5160 if (compiler->unicode() && compiler->ignore_case()) {
5161 AddUnicodeCaseEquivalents(compiler, ranges);
5162 }
5163 if (compiler->unicode() && !compiler->one_byte()) {
5164 if (is_negated()) {
5165 ZoneList<CharacterRange>* negated =
5166 new (zone) ZoneList<CharacterRange>(2, zone);
5167 CharacterRange::Negate(ranges, negated, zone);
5168 ranges = negated;
5169 }
5170 if (ranges->length() == 0) {
Ben Murdochc5610432016-08-08 18:44:38 +01005171 ranges->Add(CharacterRange::Everything(), zone);
5172 RegExpCharacterClass* fail =
5173 new (zone) RegExpCharacterClass(ranges, true);
5174 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005175 }
5176 if (standard_type() == '*') {
5177 return UnanchoredAdvance(compiler, on_success);
5178 } else {
5179 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5180 UnicodeRangeSplitter splitter(zone, ranges);
5181 AddBmpCharacters(compiler, result, on_success, &splitter);
5182 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5183 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5184 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5185 return result;
5186 }
5187 } else {
5188 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5189 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005190}
5191
5192
5193int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5194 RegExpAtom* atom1 = (*a)->AsAtom();
5195 RegExpAtom* atom2 = (*b)->AsAtom();
5196 uc16 character1 = atom1->data().at(0);
5197 uc16 character2 = atom2->data().at(0);
5198 if (character1 < character2) return -1;
5199 if (character1 > character2) return 1;
5200 return 0;
5201}
5202
5203
5204static unibrow::uchar Canonical(
5205 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5206 unibrow::uchar c) {
5207 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5208 int length = canonicalize->get(c, '\0', chars);
5209 DCHECK_LE(length, 1);
5210 unibrow::uchar canonical = c;
5211 if (length == 1) canonical = chars[0];
5212 return canonical;
5213}
5214
5215
5216int CompareFirstCharCaseIndependent(
5217 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5218 RegExpTree* const* a, RegExpTree* const* b) {
5219 RegExpAtom* atom1 = (*a)->AsAtom();
5220 RegExpAtom* atom2 = (*b)->AsAtom();
5221 unibrow::uchar character1 = atom1->data().at(0);
5222 unibrow::uchar character2 = atom2->data().at(0);
5223 if (character1 == character2) return 0;
5224 if (character1 >= 'a' || character2 >= 'a') {
5225 character1 = Canonical(canonicalize, character1);
5226 character2 = Canonical(canonicalize, character2);
5227 }
5228 return static_cast<int>(character1) - static_cast<int>(character2);
5229}
5230
5231
5232// We can stable sort runs of atoms, since the order does not matter if they
5233// start with different characters.
5234// Returns true if any consecutive atoms were found.
5235bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5236 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5237 int length = alternatives->length();
5238 bool found_consecutive_atoms = false;
5239 for (int i = 0; i < length; i++) {
5240 while (i < length) {
5241 RegExpTree* alternative = alternatives->at(i);
5242 if (alternative->IsAtom()) break;
5243 i++;
5244 }
5245 // i is length or it is the index of an atom.
5246 if (i == length) break;
5247 int first_atom = i;
5248 i++;
5249 while (i < length) {
5250 RegExpTree* alternative = alternatives->at(i);
5251 if (!alternative->IsAtom()) break;
5252 i++;
5253 }
5254 // Sort atoms to get ones with common prefixes together.
5255 // This step is more tricky if we are in a case-independent regexp,
5256 // because it would change /is|I/ to /I|is/, and order matters when
5257 // the regexp parts don't match only disjoint starting points. To fix
5258 // this we have a version of CompareFirstChar that uses case-
5259 // independent character classes for comparison.
5260 DCHECK_LT(first_atom, alternatives->length());
5261 DCHECK_LE(i, alternatives->length());
5262 DCHECK_LE(first_atom, i);
5263 if (compiler->ignore_case()) {
5264 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5265 compiler->isolate()->regexp_macro_assembler_canonicalize();
5266 auto compare_closure =
5267 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5268 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5269 };
5270 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5271 } else {
5272 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5273 }
5274 if (i - first_atom > 1) found_consecutive_atoms = true;
5275 }
5276 return found_consecutive_atoms;
5277}
5278
5279
5280// Optimizes ab|ac|az to a(?:b|c|d).
5281void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5282 Zone* zone = compiler->zone();
5283 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5284 int length = alternatives->length();
5285
5286 int write_posn = 0;
5287 int i = 0;
5288 while (i < length) {
5289 RegExpTree* alternative = alternatives->at(i);
5290 if (!alternative->IsAtom()) {
5291 alternatives->at(write_posn++) = alternatives->at(i);
5292 i++;
5293 continue;
5294 }
5295 RegExpAtom* atom = alternative->AsAtom();
5296 unibrow::uchar common_prefix = atom->data().at(0);
5297 int first_with_prefix = i;
5298 int prefix_length = atom->length();
5299 i++;
5300 while (i < length) {
5301 alternative = alternatives->at(i);
5302 if (!alternative->IsAtom()) break;
5303 atom = alternative->AsAtom();
5304 unibrow::uchar new_prefix = atom->data().at(0);
5305 if (new_prefix != common_prefix) {
5306 if (!compiler->ignore_case()) break;
5307 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5308 compiler->isolate()->regexp_macro_assembler_canonicalize();
5309 new_prefix = Canonical(canonicalize, new_prefix);
5310 common_prefix = Canonical(canonicalize, common_prefix);
5311 if (new_prefix != common_prefix) break;
5312 }
5313 prefix_length = Min(prefix_length, atom->length());
5314 i++;
5315 }
5316 if (i > first_with_prefix + 2) {
5317 // Found worthwhile run of alternatives with common prefix of at least one
5318 // character. The sorting function above did not sort on more than one
5319 // character for reasons of correctness, but there may still be a longer
5320 // common prefix if the terms were similar or presorted in the input.
5321 // Find out how long the common prefix is.
5322 int run_length = i - first_with_prefix;
5323 atom = alternatives->at(first_with_prefix)->AsAtom();
5324 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5325 RegExpAtom* old_atom =
5326 alternatives->at(j + first_with_prefix)->AsAtom();
5327 for (int k = 1; k < prefix_length; k++) {
5328 if (atom->data().at(k) != old_atom->data().at(k)) {
5329 prefix_length = k;
5330 break;
5331 }
5332 }
5333 }
5334 RegExpAtom* prefix =
5335 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5336 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5337 pair->Add(prefix, zone);
5338 ZoneList<RegExpTree*>* suffixes =
5339 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5340 for (int j = 0; j < run_length; j++) {
5341 RegExpAtom* old_atom =
5342 alternatives->at(j + first_with_prefix)->AsAtom();
5343 int len = old_atom->length();
5344 if (len == prefix_length) {
5345 suffixes->Add(new (zone) RegExpEmpty(), zone);
5346 } else {
5347 RegExpTree* suffix = new (zone) RegExpAtom(
5348 old_atom->data().SubVector(prefix_length, old_atom->length()));
5349 suffixes->Add(suffix, zone);
5350 }
5351 }
5352 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5353 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5354 } else {
5355 // Just copy any non-worthwhile alternatives.
5356 for (int j = first_with_prefix; j < i; j++) {
5357 alternatives->at(write_posn++) = alternatives->at(j);
5358 }
5359 }
5360 }
5361 alternatives->Rewind(write_posn); // Trim end of array.
5362}
5363
5364
5365// Optimizes b|c|z to [bcz].
5366void RegExpDisjunction::FixSingleCharacterDisjunctions(
5367 RegExpCompiler* compiler) {
5368 Zone* zone = compiler->zone();
5369 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5370 int length = alternatives->length();
5371
5372 int write_posn = 0;
5373 int i = 0;
5374 while (i < length) {
5375 RegExpTree* alternative = alternatives->at(i);
5376 if (!alternative->IsAtom()) {
5377 alternatives->at(write_posn++) = alternatives->at(i);
5378 i++;
5379 continue;
5380 }
5381 RegExpAtom* atom = alternative->AsAtom();
5382 if (atom->length() != 1) {
5383 alternatives->at(write_posn++) = alternatives->at(i);
5384 i++;
5385 continue;
5386 }
5387 int first_in_run = i;
5388 i++;
5389 while (i < length) {
5390 alternative = alternatives->at(i);
5391 if (!alternative->IsAtom()) break;
5392 atom = alternative->AsAtom();
5393 if (atom->length() != 1) break;
5394 i++;
5395 }
5396 if (i > first_in_run + 1) {
5397 // Found non-trivial run of single-character alternatives.
5398 int run_length = i - first_in_run;
5399 ZoneList<CharacterRange>* ranges =
5400 new (zone) ZoneList<CharacterRange>(2, zone);
5401 for (int j = 0; j < run_length; j++) {
5402 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5403 DCHECK_EQ(old_atom->length(), 1);
5404 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5405 }
5406 alternatives->at(write_posn++) =
5407 new (zone) RegExpCharacterClass(ranges, false);
5408 } else {
5409 // Just copy any trivial alternatives.
5410 for (int j = first_in_run; j < i; j++) {
5411 alternatives->at(write_posn++) = alternatives->at(j);
5412 }
5413 }
5414 }
5415 alternatives->Rewind(write_posn); // Trim end of array.
5416}
5417
5418
5419RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5420 RegExpNode* on_success) {
5421 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5422
5423 if (alternatives->length() > 2) {
5424 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5425 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5426 FixSingleCharacterDisjunctions(compiler);
5427 if (alternatives->length() == 1) {
5428 return alternatives->at(0)->ToNode(compiler, on_success);
5429 }
5430 }
5431
5432 int length = alternatives->length();
5433
5434 ChoiceNode* result =
5435 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5436 for (int i = 0; i < length; i++) {
5437 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5438 on_success));
5439 result->AddAlternative(alternative);
5440 }
5441 return result;
5442}
5443
5444
5445RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5446 RegExpNode* on_success) {
5447 return ToNode(min(),
5448 max(),
5449 is_greedy(),
5450 body(),
5451 compiler,
5452 on_success);
5453}
5454
5455
5456// Scoped object to keep track of how much we unroll quantifier loops in the
5457// regexp graph generator.
5458class RegExpExpansionLimiter {
5459 public:
5460 static const int kMaxExpansionFactor = 6;
5461 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5462 : compiler_(compiler),
5463 saved_expansion_factor_(compiler->current_expansion_factor()),
5464 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5465 DCHECK(factor > 0);
5466 if (ok_to_expand_) {
5467 if (factor > kMaxExpansionFactor) {
5468 // Avoid integer overflow of the current expansion factor.
5469 ok_to_expand_ = false;
5470 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5471 } else {
5472 int new_factor = saved_expansion_factor_ * factor;
5473 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5474 compiler->set_current_expansion_factor(new_factor);
5475 }
5476 }
5477 }
5478
5479 ~RegExpExpansionLimiter() {
5480 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5481 }
5482
5483 bool ok_to_expand() { return ok_to_expand_; }
5484
5485 private:
5486 RegExpCompiler* compiler_;
5487 int saved_expansion_factor_;
5488 bool ok_to_expand_;
5489
5490 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5491};
5492
5493
5494RegExpNode* RegExpQuantifier::ToNode(int min,
5495 int max,
5496 bool is_greedy,
5497 RegExpTree* body,
5498 RegExpCompiler* compiler,
5499 RegExpNode* on_success,
5500 bool not_at_start) {
5501 // x{f, t} becomes this:
5502 //
5503 // (r++)<-.
5504 // | `
5505 // | (x)
5506 // v ^
5507 // (r=0)-->(?)---/ [if r < t]
5508 // |
5509 // [if r >= f] \----> ...
5510 //
5511
5512 // 15.10.2.5 RepeatMatcher algorithm.
5513 // The parser has already eliminated the case where max is 0. In the case
5514 // where max_match is zero the parser has removed the quantifier if min was
5515 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5516
5517 // If we know that we cannot match zero length then things are a little
5518 // simpler since we don't need to make the special zero length match check
5519 // from step 2.1. If the min and max are small we can unroll a little in
5520 // this case.
5521 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5522 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5523 if (max == 0) return on_success; // This can happen due to recursion.
5524 bool body_can_be_empty = (body->min_match() == 0);
5525 int body_start_reg = RegExpCompiler::kNoRegister;
5526 Interval capture_registers = body->CaptureRegisters();
5527 bool needs_capture_clearing = !capture_registers.is_empty();
5528 Zone* zone = compiler->zone();
5529
5530 if (body_can_be_empty) {
5531 body_start_reg = compiler->AllocateRegister();
5532 } else if (compiler->optimize() && !needs_capture_clearing) {
5533 // Only unroll if there are no captures and the body can't be
5534 // empty.
5535 {
5536 RegExpExpansionLimiter limiter(
5537 compiler, min + ((max != min) ? 1 : 0));
5538 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5539 int new_max = (max == kInfinity) ? max : max - min;
5540 // Recurse once to get the loop or optional matches after the fixed
5541 // ones.
5542 RegExpNode* answer = ToNode(
5543 0, new_max, is_greedy, body, compiler, on_success, true);
5544 // Unroll the forced matches from 0 to min. This can cause chains of
5545 // TextNodes (which the parser does not generate). These should be
5546 // combined if it turns out they hinder good code generation.
5547 for (int i = 0; i < min; i++) {
5548 answer = body->ToNode(compiler, answer);
5549 }
5550 return answer;
5551 }
5552 }
5553 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5554 DCHECK(max > 0); // Due to the 'if' above.
5555 RegExpExpansionLimiter limiter(compiler, max);
5556 if (limiter.ok_to_expand()) {
5557 // Unroll the optional matches up to max.
5558 RegExpNode* answer = on_success;
5559 for (int i = 0; i < max; i++) {
5560 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5561 if (is_greedy) {
5562 alternation->AddAlternative(
5563 GuardedAlternative(body->ToNode(compiler, answer)));
5564 alternation->AddAlternative(GuardedAlternative(on_success));
5565 } else {
5566 alternation->AddAlternative(GuardedAlternative(on_success));
5567 alternation->AddAlternative(
5568 GuardedAlternative(body->ToNode(compiler, answer)));
5569 }
5570 answer = alternation;
5571 if (not_at_start && !compiler->read_backward()) {
5572 alternation->set_not_at_start();
5573 }
5574 }
5575 return answer;
5576 }
5577 }
5578 }
5579 bool has_min = min > 0;
5580 bool has_max = max < RegExpTree::kInfinity;
5581 bool needs_counter = has_min || has_max;
5582 int reg_ctr = needs_counter
5583 ? compiler->AllocateRegister()
5584 : RegExpCompiler::kNoRegister;
5585 LoopChoiceNode* center = new (zone)
5586 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5587 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5588 RegExpNode* loop_return = needs_counter
5589 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5590 : static_cast<RegExpNode*>(center);
5591 if (body_can_be_empty) {
5592 // If the body can be empty we need to check if it was and then
5593 // backtrack.
5594 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5595 reg_ctr,
5596 min,
5597 loop_return);
5598 }
5599 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5600 if (body_can_be_empty) {
5601 // If the body can be empty we need to store the start position
5602 // so we can bail out if it was empty.
5603 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5604 }
5605 if (needs_capture_clearing) {
5606 // Before entering the body of this loop we need to clear captures.
5607 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5608 }
5609 GuardedAlternative body_alt(body_node);
5610 if (has_max) {
5611 Guard* body_guard =
5612 new(zone) Guard(reg_ctr, Guard::LT, max);
5613 body_alt.AddGuard(body_guard, zone);
5614 }
5615 GuardedAlternative rest_alt(on_success);
5616 if (has_min) {
5617 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5618 rest_alt.AddGuard(rest_guard, zone);
5619 }
5620 if (is_greedy) {
5621 center->AddLoopAlternative(body_alt);
5622 center->AddContinueAlternative(rest_alt);
5623 } else {
5624 center->AddContinueAlternative(rest_alt);
5625 center->AddLoopAlternative(body_alt);
5626 }
5627 if (needs_counter) {
5628 return ActionNode::SetRegister(reg_ctr, 0, center);
5629 } else {
5630 return center;
5631 }
5632}
5633
5634
5635RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5636 RegExpNode* on_success) {
5637 NodeInfo info;
5638 Zone* zone = compiler->zone();
5639
5640 switch (assertion_type()) {
5641 case START_OF_LINE:
5642 return AssertionNode::AfterNewline(on_success);
5643 case START_OF_INPUT:
5644 return AssertionNode::AtStart(on_success);
5645 case BOUNDARY:
5646 return AssertionNode::AtBoundary(on_success);
5647 case NON_BOUNDARY:
5648 return AssertionNode::AtNonBoundary(on_success);
5649 case END_OF_INPUT:
5650 return AssertionNode::AtEnd(on_success);
5651 case END_OF_LINE: {
5652 // Compile $ in multiline regexps as an alternation with a positive
5653 // lookahead in one side and an end-of-input on the other side.
5654 // We need two registers for the lookahead.
5655 int stack_pointer_register = compiler->AllocateRegister();
5656 int position_register = compiler->AllocateRegister();
5657 // The ChoiceNode to distinguish between a newline and end-of-input.
5658 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5659 // Create a newline atom.
5660 ZoneList<CharacterRange>* newline_ranges =
5661 new(zone) ZoneList<CharacterRange>(3, zone);
5662 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5663 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5664 TextNode* newline_matcher = new (zone) TextNode(
5665 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5666 stack_pointer_register, position_register,
5667 0, // No captures inside.
5668 -1, // Ignored if no captures.
5669 on_success));
5670 // Create an end-of-input matcher.
5671 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5672 stack_pointer_register,
5673 position_register,
5674 newline_matcher);
5675 // Add the two alternatives to the ChoiceNode.
5676 GuardedAlternative eol_alternative(end_of_line);
5677 result->AddAlternative(eol_alternative);
5678 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5679 result->AddAlternative(end_alternative);
5680 return result;
5681 }
5682 default:
5683 UNREACHABLE();
5684 }
5685 return on_success;
5686}
5687
5688
5689RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5690 RegExpNode* on_success) {
5691 return new (compiler->zone())
5692 BackReferenceNode(RegExpCapture::StartRegister(index()),
5693 RegExpCapture::EndRegister(index()),
5694 compiler->read_backward(), on_success);
5695}
5696
5697
5698RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5699 RegExpNode* on_success) {
5700 return on_success;
5701}
5702
5703
Ben Murdoch097c5b22016-05-18 11:27:45 +01005704RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5705 int stack_pointer_register,
5706 int position_register,
5707 int capture_register_count,
5708 int capture_register_start)
5709 : is_positive_(is_positive),
5710 on_success_(on_success),
5711 stack_pointer_register_(stack_pointer_register),
5712 position_register_(position_register) {
5713 if (is_positive_) {
5714 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5715 stack_pointer_register, position_register, capture_register_count,
5716 capture_register_start, on_success_);
5717 } else {
5718 Zone* zone = on_success_->zone();
5719 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5720 stack_pointer_register, position_register, capture_register_count,
5721 capture_register_start, zone);
5722 }
5723}
5724
5725
5726RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5727 if (is_positive_) {
5728 return ActionNode::BeginSubmatch(stack_pointer_register_,
5729 position_register_, match);
5730 } else {
5731 Zone* zone = on_success_->zone();
5732 // We use a ChoiceNode to represent the negative lookaround. The first
5733 // alternative is the negative match. On success, the end node backtracks.
5734 // On failure, the second alternative is tried and leads to success.
5735 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5736 // first exit when calculating quick checks.
5737 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5738 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5739 return ActionNode::BeginSubmatch(stack_pointer_register_,
5740 position_register_, choice_node);
5741 }
5742}
5743
5744
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005745RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5746 RegExpNode* on_success) {
5747 int stack_pointer_register = compiler->AllocateRegister();
5748 int position_register = compiler->AllocateRegister();
5749
5750 const int registers_per_capture = 2;
5751 const int register_of_first_capture = 2;
5752 int register_count = capture_count_ * registers_per_capture;
5753 int register_start =
5754 register_of_first_capture + capture_from_ * registers_per_capture;
5755
5756 RegExpNode* result;
5757 bool was_reading_backward = compiler->read_backward();
5758 compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005759 Builder builder(is_positive(), on_success, stack_pointer_register,
5760 position_register, register_count, register_start);
5761 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5762 result = builder.ForMatch(match);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005763 compiler->set_read_backward(was_reading_backward);
5764 return result;
5765}
5766
5767
5768RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5769 RegExpNode* on_success) {
5770 return ToNode(body(), index(), compiler, on_success);
5771}
5772
5773
5774RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5775 int index,
5776 RegExpCompiler* compiler,
5777 RegExpNode* on_success) {
5778 DCHECK_NOT_NULL(body);
5779 int start_reg = RegExpCapture::StartRegister(index);
5780 int end_reg = RegExpCapture::EndRegister(index);
5781 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5782 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5783 RegExpNode* body_node = body->ToNode(compiler, store_end);
5784 return ActionNode::StorePosition(start_reg, true, body_node);
5785}
5786
5787
5788RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5789 RegExpNode* on_success) {
5790 ZoneList<RegExpTree*>* children = nodes();
5791 RegExpNode* current = on_success;
5792 if (compiler->read_backward()) {
5793 for (int i = 0; i < children->length(); i++) {
5794 current = children->at(i)->ToNode(compiler, current);
5795 }
5796 } else {
5797 for (int i = children->length() - 1; i >= 0; i--) {
5798 current = children->at(i)->ToNode(compiler, current);
5799 }
5800 }
5801 return current;
5802}
5803
5804
5805static void AddClass(const int* elmv,
5806 int elmc,
5807 ZoneList<CharacterRange>* ranges,
5808 Zone* zone) {
5809 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005810 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005811 for (int i = 0; i < elmc; i += 2) {
5812 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005813 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005814 }
5815}
5816
5817
5818static void AddClassNegated(const int *elmv,
5819 int elmc,
5820 ZoneList<CharacterRange>* ranges,
5821 Zone* zone) {
5822 elmc--;
Ben Murdoch097c5b22016-05-18 11:27:45 +01005823 DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005824 DCHECK(elmv[0] != 0x0000);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005825 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005826 uc16 last = 0x0000;
5827 for (int i = 0; i < elmc; i += 2) {
5828 DCHECK(last <= elmv[i] - 1);
5829 DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005830 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005831 last = elmv[i + 1];
5832 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005833 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005834}
5835
5836
5837void CharacterRange::AddClassEscape(uc16 type,
5838 ZoneList<CharacterRange>* ranges,
5839 Zone* zone) {
5840 switch (type) {
5841 case 's':
5842 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5843 break;
5844 case 'S':
5845 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5846 break;
5847 case 'w':
5848 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5849 break;
5850 case 'W':
5851 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5852 break;
5853 case 'd':
5854 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5855 break;
5856 case 'D':
5857 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5858 break;
5859 case '.':
5860 AddClassNegated(kLineTerminatorRanges,
5861 kLineTerminatorRangeCount,
5862 ranges,
5863 zone);
5864 break;
5865 // This is not a character range as defined by the spec but a
5866 // convenient shorthand for a character class that matches any
5867 // character.
5868 case '*':
5869 ranges->Add(CharacterRange::Everything(), zone);
5870 break;
5871 // This is the set of characters matched by the $ and ^ symbols
5872 // in multiline mode.
5873 case 'n':
5874 AddClass(kLineTerminatorRanges,
5875 kLineTerminatorRangeCount,
5876 ranges,
5877 zone);
5878 break;
5879 default:
5880 UNREACHABLE();
5881 }
5882}
5883
5884
5885Vector<const int> CharacterRange::GetWordBounds() {
5886 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5887}
5888
5889
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005890void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5891 ZoneList<CharacterRange>* ranges,
5892 bool is_one_byte) {
Ben Murdochc5610432016-08-08 18:44:38 +01005893 CharacterRange::Canonicalize(ranges);
Ben Murdoch097c5b22016-05-18 11:27:45 +01005894 int range_count = ranges->length();
5895 for (int i = 0; i < range_count; i++) {
5896 CharacterRange range = ranges->at(i);
5897 uc32 bottom = range.from();
5898 if (bottom > String::kMaxUtf16CodeUnit) return;
5899 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5900 // Nothing to be done for surrogates.
5901 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5902 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5903 if (bottom > String::kMaxOneByteCharCode) return;
5904 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005905 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005906 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5907 if (top == bottom) {
5908 // If this is a singleton we just expand the one character.
5909 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005910 for (int i = 0; i < length; i++) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01005911 uc32 chr = chars[i];
5912 if (chr != bottom) {
5913 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005914 }
5915 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01005916 } else {
5917 // If this is a range we expand the characters block by block, expanding
5918 // contiguous subranges (blocks) one at a time. The approach is as
5919 // follows. For a given start character we look up the remainder of the
5920 // block that contains it (represented by the end point), for instance we
5921 // find 'z' if the character is 'c'. A block is characterized by the
5922 // property that all characters uncanonicalize in the same way, except
5923 // that each entry in the result is incremented by the distance from the
5924 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5925 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5926 // we've found the end point we look up its uncanonicalization and
5927 // produce a range for each element. For instance for [c-f] we look up
5928 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5929 // it is not already contained in the input, so [c-f] will be skipped but
5930 // [C-F] will be added. If this range is not completely contained in a
5931 // block we do this for all the blocks covered by the range (handling
5932 // characters that is not in a block as a "singleton block").
5933 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5934 int pos = bottom;
5935 while (pos <= top) {
5936 int length =
5937 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5938 uc32 block_end;
5939 if (length == 0) {
5940 block_end = pos;
5941 } else {
5942 DCHECK_EQ(1, length);
5943 block_end = equivalents[0];
5944 }
5945 int end = (block_end > top) ? top : block_end;
5946 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5947 equivalents);
5948 for (int i = 0; i < length; i++) {
5949 uc32 c = equivalents[i];
5950 uc32 range_from = c - (block_end - pos);
5951 uc32 range_to = c - (block_end - end);
5952 if (!(bottom <= range_from && range_to <= top)) {
5953 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5954 }
5955 }
5956 pos = end + 1;
5957 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00005958 }
5959 }
5960}
5961
5962
5963bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5964 DCHECK_NOT_NULL(ranges);
5965 int n = ranges->length();
5966 if (n <= 1) return true;
5967 int max = ranges->at(0).to();
5968 for (int i = 1; i < n; i++) {
5969 CharacterRange next_range = ranges->at(i);
5970 if (next_range.from() <= max + 1) return false;
5971 max = next_range.to();
5972 }
5973 return true;
5974}
5975
5976
5977ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5978 if (ranges_ == NULL) {
5979 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5980 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5981 }
5982 return ranges_;
5983}
5984
5985
5986// Move a number of elements in a zonelist to another position
5987// in the same list. Handles overlapping source and target areas.
5988static void MoveRanges(ZoneList<CharacterRange>* list,
5989 int from,
5990 int to,
5991 int count) {
5992 // Ranges are potentially overlapping.
5993 if (from < to) {
5994 for (int i = count - 1; i >= 0; i--) {
5995 list->at(to + i) = list->at(from + i);
5996 }
5997 } else {
5998 for (int i = 0; i < count; i++) {
5999 list->at(to + i) = list->at(from + i);
6000 }
6001 }
6002}
6003
6004
6005static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6006 int count,
6007 CharacterRange insert) {
6008 // Inserts a range into list[0..count[, which must be sorted
6009 // by from value and non-overlapping and non-adjacent, using at most
6010 // list[0..count] for the result. Returns the number of resulting
6011 // canonicalized ranges. Inserting a range may collapse existing ranges into
6012 // fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch097c5b22016-05-18 11:27:45 +01006013 uc32 from = insert.from();
6014 uc32 to = insert.to();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006015 int start_pos = 0;
6016 int end_pos = count;
6017 for (int i = count - 1; i >= 0; i--) {
6018 CharacterRange current = list->at(i);
6019 if (current.from() > to + 1) {
6020 end_pos = i;
6021 } else if (current.to() + 1 < from) {
6022 start_pos = i + 1;
6023 break;
6024 }
6025 }
6026
6027 // Inserted range overlaps, or is adjacent to, ranges at positions
6028 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6029 // not affected by the insertion.
6030 // If start_pos == end_pos, the range must be inserted before start_pos.
6031 // if start_pos < end_pos, the entire range from start_pos to end_pos
6032 // must be merged with the insert range.
6033
6034 if (start_pos == end_pos) {
6035 // Insert between existing ranges at position start_pos.
6036 if (start_pos < count) {
6037 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6038 }
6039 list->at(start_pos) = insert;
6040 return count + 1;
6041 }
6042 if (start_pos + 1 == end_pos) {
6043 // Replace single existing range at position start_pos.
6044 CharacterRange to_replace = list->at(start_pos);
6045 int new_from = Min(to_replace.from(), from);
6046 int new_to = Max(to_replace.to(), to);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006047 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006048 return count;
6049 }
6050 // Replace a number of existing ranges from start_pos to end_pos - 1.
6051 // Move the remaining ranges down.
6052
6053 int new_from = Min(list->at(start_pos).from(), from);
6054 int new_to = Max(list->at(end_pos - 1).to(), to);
6055 if (end_pos < count) {
6056 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6057 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006058 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006059 return count - (end_pos - start_pos) + 1;
6060}
6061
6062
6063void CharacterSet::Canonicalize() {
6064 // Special/default classes are always considered canonical. The result
6065 // of calling ranges() will be sorted.
6066 if (ranges_ == NULL) return;
6067 CharacterRange::Canonicalize(ranges_);
6068}
6069
6070
6071void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6072 if (character_ranges->length() <= 1) return;
6073 // Check whether ranges are already canonical (increasing, non-overlapping,
6074 // non-adjacent).
6075 int n = character_ranges->length();
6076 int max = character_ranges->at(0).to();
6077 int i = 1;
6078 while (i < n) {
6079 CharacterRange current = character_ranges->at(i);
6080 if (current.from() <= max + 1) {
6081 break;
6082 }
6083 max = current.to();
6084 i++;
6085 }
6086 // Canonical until the i'th range. If that's all of them, we are done.
6087 if (i == n) return;
6088
6089 // The ranges at index i and forward are not canonicalized. Make them so by
6090 // doing the equivalent of insertion sort (inserting each into the previous
6091 // list, in order).
6092 // Notice that inserting a range can reduce the number of ranges in the
6093 // result due to combining of adjacent and overlapping ranges.
6094 int read = i; // Range to insert.
6095 int num_canonical = i; // Length of canonicalized part of list.
6096 do {
6097 num_canonical = InsertRangeInCanonicalList(character_ranges,
6098 num_canonical,
6099 character_ranges->at(read));
6100 read++;
6101 } while (read < n);
6102 character_ranges->Rewind(num_canonical);
6103
6104 DCHECK(CharacterRange::IsCanonical(character_ranges));
6105}
6106
6107
6108void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6109 ZoneList<CharacterRange>* negated_ranges,
6110 Zone* zone) {
6111 DCHECK(CharacterRange::IsCanonical(ranges));
6112 DCHECK_EQ(0, negated_ranges->length());
6113 int range_count = ranges->length();
Ben Murdoch097c5b22016-05-18 11:27:45 +01006114 uc32 from = 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006115 int i = 0;
6116 if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006117 from = ranges->at(0).to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006118 i = 1;
6119 }
6120 while (i < range_count) {
6121 CharacterRange range = ranges->at(i);
Ben Murdoch097c5b22016-05-18 11:27:45 +01006122 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6123 from = range.to() + 1;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006124 i++;
6125 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006126 if (from < String::kMaxCodePoint) {
6127 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006128 zone);
6129 }
6130}
6131
6132
6133// -------------------------------------------------------------------
6134// Splay tree
6135
6136
6137OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6138 if (Get(value))
6139 return this;
6140 if (successors(zone) != NULL) {
6141 for (int i = 0; i < successors(zone)->length(); i++) {
6142 OutSet* successor = successors(zone)->at(i);
6143 if (successor->Get(value))
6144 return successor;
6145 }
6146 } else {
6147 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6148 }
6149 OutSet* result = new(zone) OutSet(first_, remaining_);
6150 result->Set(value, zone);
6151 successors(zone)->Add(result, zone);
6152 return result;
6153}
6154
6155
6156void OutSet::Set(unsigned value, Zone *zone) {
6157 if (value < kFirstLimit) {
6158 first_ |= (1 << value);
6159 } else {
6160 if (remaining_ == NULL)
6161 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6162 if (remaining_->is_empty() || !remaining_->Contains(value))
6163 remaining_->Add(value, zone);
6164 }
6165}
6166
6167
6168bool OutSet::Get(unsigned value) const {
6169 if (value < kFirstLimit) {
6170 return (first_ & (1 << value)) != 0;
6171 } else if (remaining_ == NULL) {
6172 return false;
6173 } else {
6174 return remaining_->Contains(value);
6175 }
6176}
6177
6178
Ben Murdoch097c5b22016-05-18 11:27:45 +01006179const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006180
6181
6182void DispatchTable::AddRange(CharacterRange full_range, int value,
6183 Zone* zone) {
6184 CharacterRange current = full_range;
6185 if (tree()->is_empty()) {
6186 // If this is the first range we just insert into the table.
6187 ZoneSplayTree<Config>::Locator loc;
6188 bool inserted = tree()->Insert(current.from(), &loc);
6189 DCHECK(inserted);
6190 USE(inserted);
6191 loc.set_value(Entry(current.from(), current.to(),
6192 empty()->Extend(value, zone)));
6193 return;
6194 }
6195 // First see if there is a range to the left of this one that
6196 // overlaps.
6197 ZoneSplayTree<Config>::Locator loc;
6198 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6199 Entry* entry = &loc.value();
6200 // If we've found a range that overlaps with this one, and it
6201 // starts strictly to the left of this one, we have to fix it
6202 // because the following code only handles ranges that start on
6203 // or after the start point of the range we're adding.
6204 if (entry->from() < current.from() && entry->to() >= current.from()) {
6205 // Snap the overlapping range in half around the start point of
6206 // the range we're adding.
Ben Murdoch097c5b22016-05-18 11:27:45 +01006207 CharacterRange left =
6208 CharacterRange::Range(entry->from(), current.from() - 1);
6209 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006210 // The left part of the overlapping range doesn't overlap.
6211 // Truncate the whole entry to be just the left part.
6212 entry->set_to(left.to());
6213 // The right part is the one that overlaps. We add this part
6214 // to the map and let the next step deal with merging it with
6215 // the range we're adding.
6216 ZoneSplayTree<Config>::Locator loc;
6217 bool inserted = tree()->Insert(right.from(), &loc);
6218 DCHECK(inserted);
6219 USE(inserted);
6220 loc.set_value(Entry(right.from(),
6221 right.to(),
6222 entry->out_set()));
6223 }
6224 }
6225 while (current.is_valid()) {
6226 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6227 (loc.value().from() <= current.to()) &&
6228 (loc.value().to() >= current.from())) {
6229 Entry* entry = &loc.value();
6230 // We have overlap. If there is space between the start point of
6231 // the range we're adding and where the overlapping range starts
6232 // then we have to add a range covering just that space.
6233 if (current.from() < entry->from()) {
6234 ZoneSplayTree<Config>::Locator ins;
6235 bool inserted = tree()->Insert(current.from(), &ins);
6236 DCHECK(inserted);
6237 USE(inserted);
6238 ins.set_value(Entry(current.from(),
6239 entry->from() - 1,
6240 empty()->Extend(value, zone)));
6241 current.set_from(entry->from());
6242 }
6243 DCHECK_EQ(current.from(), entry->from());
6244 // If the overlapping range extends beyond the one we want to add
6245 // we have to snap the right part off and add it separately.
6246 if (entry->to() > current.to()) {
6247 ZoneSplayTree<Config>::Locator ins;
6248 bool inserted = tree()->Insert(current.to() + 1, &ins);
6249 DCHECK(inserted);
6250 USE(inserted);
6251 ins.set_value(Entry(current.to() + 1,
6252 entry->to(),
6253 entry->out_set()));
6254 entry->set_to(current.to());
6255 }
6256 DCHECK(entry->to() <= current.to());
6257 // The overlapping range is now completely contained by the range
6258 // we're adding so we can just update it and move the start point
6259 // of the range we're adding just past it.
6260 entry->AddValue(value, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006261 DCHECK(entry->to() + 1 > current.from());
6262 current.set_from(entry->to() + 1);
6263 } else {
6264 // There is no overlap so we can just add the range
6265 ZoneSplayTree<Config>::Locator ins;
6266 bool inserted = tree()->Insert(current.from(), &ins);
6267 DCHECK(inserted);
6268 USE(inserted);
6269 ins.set_value(Entry(current.from(),
6270 current.to(),
6271 empty()->Extend(value, zone)));
6272 break;
6273 }
6274 }
6275}
6276
6277
Ben Murdoch097c5b22016-05-18 11:27:45 +01006278OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006279 ZoneSplayTree<Config>::Locator loc;
6280 if (!tree()->FindGreatestLessThan(value, &loc))
6281 return empty();
6282 Entry* entry = &loc.value();
6283 if (value <= entry->to())
6284 return entry->out_set();
6285 else
6286 return empty();
6287}
6288
6289
6290// -------------------------------------------------------------------
6291// Analysis
6292
6293
6294void Analysis::EnsureAnalyzed(RegExpNode* that) {
6295 StackLimitCheck check(isolate());
6296 if (check.HasOverflowed()) {
6297 fail("Stack overflow");
6298 return;
6299 }
6300 if (that->info()->been_analyzed || that->info()->being_analyzed)
6301 return;
6302 that->info()->being_analyzed = true;
6303 that->Accept(this);
6304 that->info()->being_analyzed = false;
6305 that->info()->been_analyzed = true;
6306}
6307
6308
6309void Analysis::VisitEnd(EndNode* that) {
6310 // nothing to do
6311}
6312
6313
6314void TextNode::CalculateOffsets() {
6315 int element_count = elements()->length();
6316 // Set up the offsets of the elements relative to the start. This is a fixed
6317 // quantity since a TextNode can only contain fixed-width things.
6318 int cp_offset = 0;
6319 for (int i = 0; i < element_count; i++) {
6320 TextElement& elm = elements()->at(i);
6321 elm.set_cp_offset(cp_offset);
6322 cp_offset += elm.length();
6323 }
6324}
6325
6326
6327void Analysis::VisitText(TextNode* that) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006328 if (ignore_case()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006329 that->MakeCaseIndependent(isolate(), is_one_byte_);
6330 }
6331 EnsureAnalyzed(that->on_success());
6332 if (!has_failed()) {
6333 that->CalculateOffsets();
6334 }
6335}
6336
6337
6338void Analysis::VisitAction(ActionNode* that) {
6339 RegExpNode* target = that->on_success();
6340 EnsureAnalyzed(target);
6341 if (!has_failed()) {
6342 // If the next node is interested in what it follows then this node
6343 // has to be interested too so it can pass the information on.
6344 that->info()->AddFromFollowing(target->info());
6345 }
6346}
6347
6348
6349void Analysis::VisitChoice(ChoiceNode* that) {
6350 NodeInfo* info = that->info();
6351 for (int i = 0; i < that->alternatives()->length(); i++) {
6352 RegExpNode* node = that->alternatives()->at(i).node();
6353 EnsureAnalyzed(node);
6354 if (has_failed()) return;
6355 // Anything the following nodes need to know has to be known by
6356 // this node also, so it can pass it on.
6357 info->AddFromFollowing(node->info());
6358 }
6359}
6360
6361
6362void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6363 NodeInfo* info = that->info();
6364 for (int i = 0; i < that->alternatives()->length(); i++) {
6365 RegExpNode* node = that->alternatives()->at(i).node();
6366 if (node != that->loop_node()) {
6367 EnsureAnalyzed(node);
6368 if (has_failed()) return;
6369 info->AddFromFollowing(node->info());
6370 }
6371 }
6372 // Check the loop last since it may need the value of this node
6373 // to get a correct result.
6374 EnsureAnalyzed(that->loop_node());
6375 if (!has_failed()) {
6376 info->AddFromFollowing(that->loop_node()->info());
6377 }
6378}
6379
6380
6381void Analysis::VisitBackReference(BackReferenceNode* that) {
6382 EnsureAnalyzed(that->on_success());
6383}
6384
6385
6386void Analysis::VisitAssertion(AssertionNode* that) {
6387 EnsureAnalyzed(that->on_success());
6388}
6389
6390
6391void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6392 BoyerMooreLookahead* bm,
6393 bool not_at_start) {
6394 // Working out the set of characters that a backreference can match is too
6395 // hard, so we just say that any character can match.
6396 bm->SetRest(offset);
6397 SaveBMInfo(bm, not_at_start, offset);
6398}
6399
6400
6401STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6402 RegExpMacroAssembler::kTableSize);
6403
6404
6405void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6406 BoyerMooreLookahead* bm, bool not_at_start) {
6407 ZoneList<GuardedAlternative>* alts = alternatives();
6408 budget = (budget - 1) / alts->length();
6409 for (int i = 0; i < alts->length(); i++) {
6410 GuardedAlternative& alt = alts->at(i);
6411 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6412 bm->SetRest(offset); // Give up trying to fill in info.
6413 SaveBMInfo(bm, not_at_start, offset);
6414 return;
6415 }
6416 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6417 }
6418 SaveBMInfo(bm, not_at_start, offset);
6419}
6420
6421
6422void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6423 BoyerMooreLookahead* bm, bool not_at_start) {
6424 if (initial_offset >= bm->length()) return;
6425 int offset = initial_offset;
6426 int max_char = bm->max_char();
6427 for (int i = 0; i < elements()->length(); i++) {
6428 if (offset >= bm->length()) {
6429 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6430 return;
6431 }
6432 TextElement text = elements()->at(i);
6433 if (text.text_type() == TextElement::ATOM) {
6434 RegExpAtom* atom = text.atom();
6435 for (int j = 0; j < atom->length(); j++, offset++) {
6436 if (offset >= bm->length()) {
6437 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6438 return;
6439 }
6440 uc16 character = atom->data()[j];
6441 if (bm->compiler()->ignore_case()) {
6442 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6443 int length = GetCaseIndependentLetters(
6444 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6445 chars);
6446 for (int j = 0; j < length; j++) {
6447 bm->Set(offset, chars[j]);
6448 }
6449 } else {
6450 if (character <= max_char) bm->Set(offset, character);
6451 }
6452 }
6453 } else {
6454 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6455 RegExpCharacterClass* char_class = text.char_class();
6456 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6457 if (char_class->is_negated()) {
6458 bm->SetAll(offset);
6459 } else {
6460 for (int k = 0; k < ranges->length(); k++) {
6461 CharacterRange& range = ranges->at(k);
6462 if (range.from() > max_char) continue;
6463 int to = Min(max_char, static_cast<int>(range.to()));
6464 bm->SetInterval(offset, Interval(range.from(), to));
6465 }
6466 }
6467 offset++;
6468 }
6469 }
6470 if (offset >= bm->length()) {
6471 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6472 return;
6473 }
6474 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6475 true); // Not at start after a text node.
6476 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6477}
6478
6479
6480// -------------------------------------------------------------------
6481// Dispatch table construction
6482
6483
6484void DispatchTableConstructor::VisitEnd(EndNode* that) {
6485 AddRange(CharacterRange::Everything());
6486}
6487
6488
6489void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6490 node->set_being_calculated(true);
6491 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6492 for (int i = 0; i < alternatives->length(); i++) {
6493 set_choice_index(i);
6494 alternatives->at(i).node()->Accept(this);
6495 }
6496 node->set_being_calculated(false);
6497}
6498
6499
6500class AddDispatchRange {
6501 public:
6502 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6503 : constructor_(constructor) { }
6504 void Call(uc32 from, DispatchTable::Entry entry);
6505 private:
6506 DispatchTableConstructor* constructor_;
6507};
6508
6509
6510void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006511 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006512}
6513
6514
6515void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6516 if (node->being_calculated())
6517 return;
6518 DispatchTable* table = node->GetTable(ignore_case_);
6519 AddDispatchRange adder(this);
6520 table->ForEach(&adder);
6521}
6522
6523
6524void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6525 // TODO(160): Find the node that we refer back to and propagate its start
6526 // set back to here. For now we just accept anything.
6527 AddRange(CharacterRange::Everything());
6528}
6529
6530
6531void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6532 RegExpNode* target = that->on_success();
6533 target->Accept(this);
6534}
6535
6536
6537static int CompareRangeByFrom(const CharacterRange* a,
6538 const CharacterRange* b) {
6539 return Compare<uc16>(a->from(), b->from());
6540}
6541
6542
6543void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6544 ranges->Sort(CompareRangeByFrom);
6545 uc16 last = 0;
6546 for (int i = 0; i < ranges->length(); i++) {
6547 CharacterRange range = ranges->at(i);
6548 if (last < range.from())
Ben Murdoch097c5b22016-05-18 11:27:45 +01006549 AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006550 if (range.to() >= last) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006551 if (range.to() == String::kMaxCodePoint) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006552 return;
6553 } else {
6554 last = range.to() + 1;
6555 }
6556 }
6557 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006558 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006559}
6560
6561
6562void DispatchTableConstructor::VisitText(TextNode* that) {
6563 TextElement elm = that->elements()->at(0);
6564 switch (elm.text_type()) {
6565 case TextElement::ATOM: {
6566 uc16 c = elm.atom()->data()[0];
Ben Murdoch097c5b22016-05-18 11:27:45 +01006567 AddRange(CharacterRange::Range(c, c));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006568 break;
6569 }
6570 case TextElement::CHAR_CLASS: {
6571 RegExpCharacterClass* tree = elm.char_class();
6572 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6573 if (tree->is_negated()) {
6574 AddInverse(ranges);
6575 } else {
6576 for (int i = 0; i < ranges->length(); i++)
6577 AddRange(ranges->at(i));
6578 }
6579 break;
6580 }
6581 default: {
6582 UNIMPLEMENTED();
6583 }
6584 }
6585}
6586
6587
6588void DispatchTableConstructor::VisitAction(ActionNode* that) {
6589 RegExpNode* target = that->on_success();
6590 target->Accept(this);
6591}
6592
6593
Ben Murdoch097c5b22016-05-18 11:27:45 +01006594RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6595 RegExpNode* on_success) {
6596 // If the regexp matching starts within a surrogate pair, step back
6597 // to the lead surrogate and start matching from there.
6598 DCHECK(!compiler->read_backward());
6599 Zone* zone = compiler->zone();
6600 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6601 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6602 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6603 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6604
6605 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6606
6607 int stack_register = compiler->UnicodeLookaroundStackRegister();
6608 int position_register = compiler->UnicodeLookaroundPositionRegister();
6609 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6610 zone, lead_surrogates, true, on_success);
6611 RegExpLookaround::Builder builder(true, step_back, stack_register,
6612 position_register);
6613 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6614 zone, trail_surrogates, false, builder.on_match_success());
6615
6616 optional_step_back->AddAlternative(
6617 GuardedAlternative(builder.ForMatch(match_trail)));
6618 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6619
6620 return optional_step_back;
6621}
6622
6623
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006624RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch097c5b22016-05-18 11:27:45 +01006625 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6626 JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006627 Handle<String> sample_subject, bool is_one_byte) {
6628 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6629 return IrregexpRegExpTooBig(isolate);
6630 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006631 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6632 bool is_sticky = flags & JSRegExp::kSticky;
6633 bool is_global = flags & JSRegExp::kGlobal;
6634 bool is_unicode = flags & JSRegExp::kUnicode;
6635 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006636 is_one_byte);
6637
6638 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6639
6640 // Sample some characters from the middle of the string.
6641 static const int kSampleSize = 128;
6642
6643 sample_subject = String::Flatten(sample_subject);
6644 int chars_sampled = 0;
6645 int half_way = (sample_subject->length() - kSampleSize) / 2;
6646 for (int i = Max(0, half_way);
6647 i < sample_subject->length() && chars_sampled < kSampleSize;
6648 i++, chars_sampled++) {
6649 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6650 }
6651
6652 // Wrap the body of the regexp in capture #0.
6653 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6654 0,
6655 &compiler,
6656 compiler.accept());
6657 RegExpNode* node = captured_body;
6658 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6659 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6660 int max_length = data->tree->max_match();
6661 if (!is_start_anchored && !is_sticky) {
6662 // Add a .*? at the beginning, outside the body capture, unless
6663 // this expression is anchored at the beginning or sticky.
6664 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6665 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6666 &compiler, captured_body, data->contains_anchor);
6667
6668 if (data->contains_anchor) {
6669 // Unroll loop once, to take care of the case that might start
6670 // at the start of input.
6671 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6672 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6673 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6674 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6675 node = first_step_node;
6676 } else {
6677 node = loop_node;
6678 }
6679 }
6680 if (is_one_byte) {
6681 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6682 // Do it again to propagate the new nodes to places where they were not
6683 // put because they had not been calculated yet.
6684 if (node != NULL) {
6685 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6686 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01006687 } else if (compiler.unicode() && (is_global || is_sticky)) {
6688 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006689 }
6690
6691 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6692 data->node = node;
Ben Murdoch097c5b22016-05-18 11:27:45 +01006693 Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006694 analysis.EnsureAnalyzed(node);
6695 if (analysis.has_failed()) {
6696 const char* error_message = analysis.error_message();
6697 return CompilationResult(isolate, error_message);
6698 }
6699
6700 // Create the correct assembler for the architecture.
6701#ifndef V8_INTERPRETED_REGEXP
6702 // Native regexp implementation.
6703
6704 NativeRegExpMacroAssembler::Mode mode =
6705 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6706 : NativeRegExpMacroAssembler::UC16;
6707
6708#if V8_TARGET_ARCH_IA32
6709 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6710 (data->capture_count + 1) * 2);
6711#elif V8_TARGET_ARCH_X64
6712 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6713 (data->capture_count + 1) * 2);
6714#elif V8_TARGET_ARCH_ARM
6715 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6716 (data->capture_count + 1) * 2);
6717#elif V8_TARGET_ARCH_ARM64
6718 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6719 (data->capture_count + 1) * 2);
Ben Murdochda12d292016-06-02 14:46:10 +01006720#elif V8_TARGET_ARCH_S390
6721 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6722 (data->capture_count + 1) * 2);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006723#elif V8_TARGET_ARCH_PPC
6724 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6725 (data->capture_count + 1) * 2);
6726#elif V8_TARGET_ARCH_MIPS
6727 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6728 (data->capture_count + 1) * 2);
6729#elif V8_TARGET_ARCH_MIPS64
6730 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6731 (data->capture_count + 1) * 2);
6732#elif V8_TARGET_ARCH_X87
6733 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6734 (data->capture_count + 1) * 2);
6735#else
6736#error "Unsupported architecture"
6737#endif
6738
6739#else // V8_INTERPRETED_REGEXP
6740 // Interpreted regexp implementation.
6741 EmbeddedVector<byte, 1024> codes;
6742 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6743#endif // V8_INTERPRETED_REGEXP
6744
6745 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6746
6747 // Inserted here, instead of in Assembler, because it depends on information
6748 // in the AST that isn't replicated in the Node structure.
6749 static const int kMaxBacksearchLimit = 1024;
6750 if (is_end_anchored &&
6751 !is_start_anchored &&
6752 max_length < kMaxBacksearchLimit) {
6753 macro_assembler.SetCurrentPositionFromEnd(max_length);
6754 }
6755
6756 if (is_global) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01006757 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6758 if (data->tree->min_match() > 0) {
6759 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6760 } else if (is_unicode) {
6761 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6762 }
6763 macro_assembler.set_global_mode(mode);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006764 }
6765
6766 return compiler.Assemble(&macro_assembler,
6767 node,
6768 data->capture_count,
6769 pattern);
6770}
6771
6772
6773bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6774 Heap* heap = pattern->GetHeap();
6775 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6776 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
Ben Murdochc5610432016-08-08 18:44:38 +01006777 heap->memory_allocator()->SizeExecutable() >
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00006778 RegExpImpl::kRegExpExecutableMemoryLimit) {
6779 too_much = true;
6780 }
6781 return too_much;
6782}
6783
6784
6785Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6786 Object* key_pattern,
6787 FixedArray** last_match_cache,
6788 ResultsCacheType type) {
6789 FixedArray* cache;
6790 if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6791 if (type == STRING_SPLIT_SUBSTRINGS) {
6792 DCHECK(key_pattern->IsString());
6793 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6794 cache = heap->string_split_cache();
6795 } else {
6796 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6797 DCHECK(key_pattern->IsFixedArray());
6798 cache = heap->regexp_multiple_cache();
6799 }
6800
6801 uint32_t hash = key_string->Hash();
6802 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6803 ~(kArrayEntriesPerCacheEntry - 1));
6804 if (cache->get(index + kStringOffset) != key_string ||
6805 cache->get(index + kPatternOffset) != key_pattern) {
6806 index =
6807 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6808 if (cache->get(index + kStringOffset) != key_string ||
6809 cache->get(index + kPatternOffset) != key_pattern) {
6810 return Smi::FromInt(0);
6811 }
6812 }
6813
6814 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6815 return cache->get(index + kArrayOffset);
6816}
6817
6818
6819void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6820 Handle<Object> key_pattern,
6821 Handle<FixedArray> value_array,
6822 Handle<FixedArray> last_match_cache,
6823 ResultsCacheType type) {
6824 Factory* factory = isolate->factory();
6825 Handle<FixedArray> cache;
6826 if (!key_string->IsInternalizedString()) return;
6827 if (type == STRING_SPLIT_SUBSTRINGS) {
6828 DCHECK(key_pattern->IsString());
6829 if (!key_pattern->IsInternalizedString()) return;
6830 cache = factory->string_split_cache();
6831 } else {
6832 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6833 DCHECK(key_pattern->IsFixedArray());
6834 cache = factory->regexp_multiple_cache();
6835 }
6836
6837 uint32_t hash = key_string->Hash();
6838 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6839 ~(kArrayEntriesPerCacheEntry - 1));
6840 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6841 cache->set(index + kStringOffset, *key_string);
6842 cache->set(index + kPatternOffset, *key_pattern);
6843 cache->set(index + kArrayOffset, *value_array);
6844 cache->set(index + kLastMatchOffset, *last_match_cache);
6845 } else {
6846 uint32_t index2 =
6847 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6848 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6849 cache->set(index2 + kStringOffset, *key_string);
6850 cache->set(index2 + kPatternOffset, *key_pattern);
6851 cache->set(index2 + kArrayOffset, *value_array);
6852 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6853 } else {
6854 cache->set(index2 + kStringOffset, Smi::FromInt(0));
6855 cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6856 cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6857 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6858 cache->set(index + kStringOffset, *key_string);
6859 cache->set(index + kPatternOffset, *key_pattern);
6860 cache->set(index + kArrayOffset, *value_array);
6861 cache->set(index + kLastMatchOffset, *last_match_cache);
6862 }
6863 }
6864 // If the array is a reasonably short list of substrings, convert it into a
6865 // list of internalized strings.
6866 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6867 for (int i = 0; i < value_array->length(); i++) {
6868 Handle<String> str(String::cast(value_array->get(i)), isolate);
6869 Handle<String> internalized_str = factory->InternalizeString(str);
6870 value_array->set(i, *internalized_str);
6871 }
6872 }
6873 // Convert backing store to a copy-on-write array.
6874 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6875}
6876
6877
6878void RegExpResultsCache::Clear(FixedArray* cache) {
6879 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6880 cache->set(i, Smi::FromInt(0));
6881 }
6882}
6883
6884} // namespace internal
6885} // namespace v8