blob: 98aca72665675665de9cebc86c1d8a7711e9fc8b [file] [log] [blame]
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
Steve Blocka7e24c12009-10-30 11:49:00 +00004
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005#include "src/v8.h"
Steve Blocka7e24c12009-10-30 11:49:00 +00006
Ben Murdochb8a8cc12014-11-26 15:28:44 +00007#include "src/ast.h"
8#include "src/base/platform/platform.h"
9#include "src/compilation-cache.h"
10#include "src/compiler.h"
11#include "src/execution.h"
12#include "src/factory.h"
13#include "src/jsregexp-inl.h"
14#include "src/jsregexp.h"
15#include "src/ostreams.h"
16#include "src/parser.h"
17#include "src/regexp-macro-assembler.h"
18#include "src/regexp-macro-assembler-irregexp.h"
19#include "src/regexp-macro-assembler-tracer.h"
20#include "src/regexp-stack.h"
21#include "src/runtime.h"
22#include "src/string-search.h"
Steve Blocka7e24c12009-10-30 11:49:00 +000023
Steve Block6ded16b2010-05-10 14:33:55 +010024#ifndef V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +000025#if V8_TARGET_ARCH_IA32
Ben Murdochb8a8cc12014-11-26 15:28:44 +000026#include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT
Steve Blocka7e24c12009-10-30 11:49:00 +000027#elif V8_TARGET_ARCH_X64
Ben Murdochb8a8cc12014-11-26 15:28:44 +000028#include "src/x64/regexp-macro-assembler-x64.h" // NOLINT
29#elif V8_TARGET_ARCH_ARM64
30#include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT
Steve Blocka7e24c12009-10-30 11:49:00 +000031#elif V8_TARGET_ARCH_ARM
Ben Murdochb8a8cc12014-11-26 15:28:44 +000032#include "src/arm/regexp-macro-assembler-arm.h" // NOLINT
Steve Block44f0eee2011-05-26 01:26:41 +010033#elif V8_TARGET_ARCH_MIPS
Ben Murdochb8a8cc12014-11-26 15:28:44 +000034#include "src/mips/regexp-macro-assembler-mips.h" // NOLINT
35#elif V8_TARGET_ARCH_MIPS64
36#include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT
37#elif V8_TARGET_ARCH_X87
38#include "src/x87/regexp-macro-assembler-x87.h" // NOLINT
Steve Blocka7e24c12009-10-30 11:49:00 +000039#else
40#error Unsupported target architecture.
41#endif
42#endif
43
Ben Murdochb8a8cc12014-11-26 15:28:44 +000044#include "src/interpreter-irregexp.h"
Steve Blocka7e24c12009-10-30 11:49:00 +000045
46
47namespace v8 {
48namespace internal {
49
Ben Murdochb8a8cc12014-11-26 15:28:44 +000050MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
51 Handle<JSFunction> constructor,
52 Handle<String> pattern,
53 Handle<String> flags) {
Steve Blocka7e24c12009-10-30 11:49:00 +000054 // Call the construct code with 2 arguments.
Ben Murdoch3ef787d2012-04-12 10:51:47 +010055 Handle<Object> argv[] = { pattern, flags };
Ben Murdochb8a8cc12014-11-26 15:28:44 +000056 return Execution::New(constructor, arraysize(argv), argv);
Steve Blocka7e24c12009-10-30 11:49:00 +000057}
58
59
60static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
61 int flags = JSRegExp::NONE;
62 for (int i = 0; i < str->length(); i++) {
63 switch (str->Get(i)) {
64 case 'i':
65 flags |= JSRegExp::IGNORE_CASE;
66 break;
67 case 'g':
68 flags |= JSRegExp::GLOBAL;
69 break;
70 case 'm':
71 flags |= JSRegExp::MULTILINE;
72 break;
Ben Murdochb8a8cc12014-11-26 15:28:44 +000073 case 'y':
74 if (FLAG_harmony_regexps) flags |= JSRegExp::STICKY;
75 break;
Steve Blocka7e24c12009-10-30 11:49:00 +000076 }
77 }
78 return JSRegExp::Flags(flags);
79}
80
81
Ben Murdochb8a8cc12014-11-26 15:28:44 +000082MUST_USE_RESULT
83static inline MaybeHandle<Object> ThrowRegExpException(
84 Handle<JSRegExp> re,
85 Handle<String> pattern,
86 Handle<String> error_text,
87 const char* message) {
Steve Block44f0eee2011-05-26 01:26:41 +010088 Isolate* isolate = re->GetIsolate();
89 Factory* factory = isolate->factory();
90 Handle<FixedArray> elements = factory->NewFixedArray(2);
Ben Murdoche0cee9b2011-05-25 10:26:03 +010091 elements->set(0, *pattern);
92 elements->set(1, *error_text);
Steve Block44f0eee2011-05-26 01:26:41 +010093 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
Ben Murdochb8a8cc12014-11-26 15:28:44 +000094 Handle<Object> regexp_err;
95 THROW_NEW_ERROR(isolate, NewSyntaxError(message, array), Object);
96}
97
98
99ContainedInLattice AddRange(ContainedInLattice containment,
100 const int* ranges,
101 int ranges_length,
102 Interval new_range) {
103 DCHECK((ranges_length & 1) == 1);
104 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
105 if (containment == kLatticeUnknown) return containment;
106 bool inside = false;
107 int last = 0;
108 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
109 // Consider the range from last to ranges[i].
110 // We haven't got to the new range yet.
111 if (ranges[i] <= new_range.from()) continue;
112 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
113 // inclusive, but the values in ranges are not.
114 if (last <= new_range.from() && new_range.to() < ranges[i]) {
115 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
116 }
117 return kLatticeUnknown;
118 }
119 return containment;
120}
121
122
123// More makes code generation slower, less makes V8 benchmark score lower.
124const int kMaxLookaheadForBoyerMoore = 8;
125// In a 3-character pattern you can maximally step forwards 3 characters
126// at a time, which is not always enough to pay for the extra logic.
127const int kPatternTooShortForBoyerMoore = 2;
128
129
130// Identifies the sort of regexps where the regexp engine is faster
131// than the code used for atom matches.
132static bool HasFewDifferentCharacters(Handle<String> pattern) {
133 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
134 if (length <= kPatternTooShortForBoyerMoore) return false;
135 const int kMod = 128;
136 bool character_found[kMod];
137 int different = 0;
138 memset(&character_found[0], 0, sizeof(character_found));
139 for (int i = 0; i < length; i++) {
140 int ch = (pattern->Get(i) & (kMod - 1));
141 if (!character_found[ch]) {
142 character_found[ch] = true;
143 different++;
144 // We declare a regexp low-alphabet if it has at least 3 times as many
145 // characters as it has different characters.
146 if (different * 3 > length) return false;
147 }
148 }
149 return true;
Steve Blocka7e24c12009-10-30 11:49:00 +0000150}
151
152
153// Generic RegExp methods. Dispatches to implementation specific methods.
154
155
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000156MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
157 Handle<String> pattern,
158 Handle<String> flag_str) {
Steve Block44f0eee2011-05-26 01:26:41 +0100159 Isolate* isolate = re->GetIsolate();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000160 Zone zone(isolate);
Steve Blocka7e24c12009-10-30 11:49:00 +0000161 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
Steve Block44f0eee2011-05-26 01:26:41 +0100162 CompilationCache* compilation_cache = isolate->compilation_cache();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000163 MaybeHandle<FixedArray> maybe_cached =
164 compilation_cache->LookupRegExp(pattern, flags);
165 Handle<FixedArray> cached;
166 bool in_cache = maybe_cached.ToHandle(&cached);
Steve Block44f0eee2011-05-26 01:26:41 +0100167 LOG(isolate, RegExpCompileEvent(re, in_cache));
Steve Blocka7e24c12009-10-30 11:49:00 +0000168
169 Handle<Object> result;
170 if (in_cache) {
171 re->set_data(*cached);
172 return re;
173 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000174 pattern = String::Flatten(pattern);
Steve Block44f0eee2011-05-26 01:26:41 +0100175 PostponeInterruptsScope postpone(isolate);
Steve Blocka7e24c12009-10-30 11:49:00 +0000176 RegExpCompileData parse_result;
Steve Block44f0eee2011-05-26 01:26:41 +0100177 FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800178 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000179 &parse_result, &zone)) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000180 // Throw an exception if we fail to parse the pattern.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000181 return ThrowRegExpException(re,
182 pattern,
183 parse_result.error,
184 "malformed_regexp");
Steve Blocka7e24c12009-10-30 11:49:00 +0000185 }
186
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000187 bool has_been_compiled = false;
188
189 if (parse_result.simple &&
190 !flags.is_ignore_case() &&
191 !flags.is_sticky() &&
192 !HasFewDifferentCharacters(pattern)) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000193 // Parse-tree is a single atom that is equal to the pattern.
194 AtomCompile(re, pattern, flags, pattern);
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000195 has_been_compiled = true;
Steve Blocka7e24c12009-10-30 11:49:00 +0000196 } else if (parse_result.tree->IsAtom() &&
197 !flags.is_ignore_case() &&
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000198 !flags.is_sticky() &&
Steve Blocka7e24c12009-10-30 11:49:00 +0000199 parse_result.capture_count == 0) {
200 RegExpAtom* atom = parse_result.tree->AsAtom();
201 Vector<const uc16> atom_pattern = atom->data();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000202 Handle<String> atom_string;
203 ASSIGN_RETURN_ON_EXCEPTION(
204 isolate, atom_string,
205 isolate->factory()->NewStringFromTwoByte(atom_pattern),
206 Object);
207 if (!HasFewDifferentCharacters(atom_string)) {
208 AtomCompile(re, pattern, flags, atom_string);
209 has_been_compiled = true;
210 }
211 }
212 if (!has_been_compiled) {
Steve Block6ded16b2010-05-10 14:33:55 +0100213 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
Steve Blocka7e24c12009-10-30 11:49:00 +0000214 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000215 DCHECK(re->data()->IsFixedArray());
Steve Blocka7e24c12009-10-30 11:49:00 +0000216 // Compilation succeeded so the data is set on the regexp
217 // and we can store it in the cache.
218 Handle<FixedArray> data(FixedArray::cast(re->data()));
Steve Block44f0eee2011-05-26 01:26:41 +0100219 compilation_cache->PutRegExp(pattern, flags, data);
Steve Blocka7e24c12009-10-30 11:49:00 +0000220
221 return re;
222}
223
224
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000225MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
226 Handle<String> subject,
227 int index,
228 Handle<JSArray> last_match_info) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000229 switch (regexp->TypeTag()) {
230 case JSRegExp::ATOM:
231 return AtomExec(regexp, subject, index, last_match_info);
232 case JSRegExp::IRREGEXP: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000233 return IrregexpExec(regexp, subject, index, last_match_info);
Steve Blocka7e24c12009-10-30 11:49:00 +0000234 }
235 default:
236 UNREACHABLE();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000237 return MaybeHandle<Object>();
Steve Blocka7e24c12009-10-30 11:49:00 +0000238 }
239}
240
241
242// RegExp Atom implementation: Simple string search using indexOf.
243
244
245void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
246 Handle<String> pattern,
247 JSRegExp::Flags flags,
248 Handle<String> match_pattern) {
Steve Block44f0eee2011-05-26 01:26:41 +0100249 re->GetIsolate()->factory()->SetRegExpAtomData(re,
250 JSRegExp::ATOM,
251 pattern,
252 flags,
253 match_pattern);
Steve Blocka7e24c12009-10-30 11:49:00 +0000254}
255
256
257static void SetAtomLastCapture(FixedArray* array,
258 String* subject,
259 int from,
260 int to) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000261 SealHandleScope shs(array->GetIsolate());
Steve Blocka7e24c12009-10-30 11:49:00 +0000262 RegExpImpl::SetLastCaptureCount(array, 2);
263 RegExpImpl::SetLastSubject(array, subject);
264 RegExpImpl::SetLastInput(array, subject);
265 RegExpImpl::SetCapture(array, 0, from);
266 RegExpImpl::SetCapture(array, 1, to);
267}
268
269
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000270int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
271 Handle<String> subject,
272 int index,
273 int32_t* output,
274 int output_size) {
275 Isolate* isolate = regexp->GetIsolate();
276
277 DCHECK(0 <= index);
278 DCHECK(index <= subject->length());
279
280 subject = String::Flatten(subject);
281 DisallowHeapAllocation no_gc; // ensure vectors stay valid
282
283 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
284 int needle_len = needle->length();
285 DCHECK(needle->IsFlat());
286 DCHECK_LT(0, needle_len);
287
288 if (index + needle_len > subject->length()) {
289 return RegExpImpl::RE_FAILURE;
290 }
291
292 for (int i = 0; i < output_size; i += 2) {
293 String::FlatContent needle_content = needle->GetFlatContent();
294 String::FlatContent subject_content = subject->GetFlatContent();
295 DCHECK(needle_content.IsFlat());
296 DCHECK(subject_content.IsFlat());
297 // dispatch on type of strings
298 index =
299 (needle_content.IsOneByte()
300 ? (subject_content.IsOneByte()
301 ? SearchString(isolate, subject_content.ToOneByteVector(),
302 needle_content.ToOneByteVector(), index)
303 : SearchString(isolate, subject_content.ToUC16Vector(),
304 needle_content.ToOneByteVector(), index))
305 : (subject_content.IsOneByte()
306 ? SearchString(isolate, subject_content.ToOneByteVector(),
307 needle_content.ToUC16Vector(), index)
308 : SearchString(isolate, subject_content.ToUC16Vector(),
309 needle_content.ToUC16Vector(), index)));
310 if (index == -1) {
311 return i / 2; // Return number of matches.
312 } else {
313 output[i] = index;
314 output[i+1] = index + needle_len;
315 index += needle_len;
316 }
317 }
318 return output_size / 2;
319}
320
321
Steve Blocka7e24c12009-10-30 11:49:00 +0000322Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
323 Handle<String> subject,
324 int index,
325 Handle<JSArray> last_match_info) {
Steve Block44f0eee2011-05-26 01:26:41 +0100326 Isolate* isolate = re->GetIsolate();
327
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000328 static const int kNumRegisters = 2;
329 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
330 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
Steve Blocka7e24c12009-10-30 11:49:00 +0000331
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000332 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
Steve Blocka7e24c12009-10-30 11:49:00 +0000333
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000334 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
Ben Murdochb0fe1622011-05-05 13:52:32 +0100335
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000336 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
337 SealHandleScope shs(isolate);
338 FixedArray* array = FixedArray::cast(last_match_info->elements());
339 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
Steve Blocka7e24c12009-10-30 11:49:00 +0000340 return last_match_info;
341}
342
343
344// Irregexp implementation.
345
346// Ensures that the regexp object contains a compiled version of the
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000347// source for either one-byte or two-byte subject strings.
Steve Blocka7e24c12009-10-30 11:49:00 +0000348// If the compiled version doesn't already exist, it is compiled
349// from the source pattern.
350// If compilation fails, an exception is thrown and this function
351// returns false.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000352bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
353 Handle<String> sample_subject,
354 bool is_one_byte) {
355 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
Steve Block6ded16b2010-05-10 14:33:55 +0100356#ifdef V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +0000357 if (compiled_code->IsByteArray()) return true;
Steve Block6ded16b2010-05-10 14:33:55 +0100358#else // V8_INTERPRETED_REGEXP (RegExp native code)
359 if (compiled_code->IsCode()) return true;
Steve Blocka7e24c12009-10-30 11:49:00 +0000360#endif
Ben Murdoch257744e2011-11-30 15:57:28 +0000361 // We could potentially have marked this as flushable, but have kept
362 // a saved version if we did not flush it yet.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000363 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
Ben Murdoch257744e2011-11-30 15:57:28 +0000364 if (saved_code->IsCode()) {
365 // Reinstate the code in the original place.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000366 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
367 DCHECK(compiled_code->IsSmi());
Ben Murdoch257744e2011-11-30 15:57:28 +0000368 return true;
369 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000370 return CompileIrregexp(re, sample_subject, is_one_byte);
Steve Blocka7e24c12009-10-30 11:49:00 +0000371}
372
373
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000374static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
Ben Murdoch257744e2011-11-30 15:57:28 +0000375 Handle<String> error_message,
376 Isolate* isolate) {
377 Factory* factory = isolate->factory();
378 Handle<FixedArray> elements = factory->NewFixedArray(2);
379 elements->set(0, re->Pattern());
380 elements->set(1, *error_message);
381 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000382 Handle<Object> error;
383 MaybeHandle<Object> maybe_error =
Ben Murdoch257744e2011-11-30 15:57:28 +0000384 factory->NewSyntaxError("malformed_regexp", array);
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000385 if (maybe_error.ToHandle(&error)) isolate->Throw(*error);
Ben Murdoch257744e2011-11-30 15:57:28 +0000386}
387
388
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000389bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
390 Handle<String> sample_subject,
391 bool is_one_byte) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000392 // Compile the RegExp.
Steve Block44f0eee2011-05-26 01:26:41 +0100393 Isolate* isolate = re->GetIsolate();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000394 Zone zone(isolate);
Steve Block44f0eee2011-05-26 01:26:41 +0100395 PostponeInterruptsScope postpone(isolate);
Ben Murdoch257744e2011-11-30 15:57:28 +0000396 // If we had a compilation error the last time this is saved at the
397 // saved code index.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000398 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
Ben Murdoch257744e2011-11-30 15:57:28 +0000399 // When arriving here entry can only be a smi, either representing an
400 // uncompiled regexp, a previous compilation error, or code that has
401 // been flushed.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000402 DCHECK(entry->IsSmi());
Ben Murdoch257744e2011-11-30 15:57:28 +0000403 int entry_value = Smi::cast(entry)->value();
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000404 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
Ben Murdoch257744e2011-11-30 15:57:28 +0000405 entry_value == JSRegExp::kCompilationErrorValue ||
406 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
407
408 if (entry_value == JSRegExp::kCompilationErrorValue) {
409 // A previous compilation failed and threw an error which we store in
410 // the saved code index (we store the error message, not the actual
411 // error). Recreate the error object and throw it.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000412 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
413 DCHECK(error_string->IsString());
Ben Murdoch257744e2011-11-30 15:57:28 +0000414 Handle<String> error_message(String::cast(error_string));
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000415 CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
Steve Blocka7e24c12009-10-30 11:49:00 +0000416 return false;
417 }
Steve Blocka7e24c12009-10-30 11:49:00 +0000418
419 JSRegExp::Flags flags = re->GetFlags();
420
421 Handle<String> pattern(re->Pattern());
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000422 pattern = String::Flatten(pattern);
Steve Blocka7e24c12009-10-30 11:49:00 +0000423 RegExpCompileData compile_data;
Steve Block44f0eee2011-05-26 01:26:41 +0100424 FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800425 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000426 &compile_data,
427 &zone)) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000428 // Throw an exception if we fail to parse the pattern.
429 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000430 USE(ThrowRegExpException(re,
431 pattern,
432 compile_data.error,
433 "malformed_regexp"));
Steve Blocka7e24c12009-10-30 11:49:00 +0000434 return false;
435 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000436 RegExpEngine::CompilationResult result = RegExpEngine::Compile(
437 &compile_data, flags.is_ignore_case(), flags.is_global(),
438 flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,
439 is_one_byte, &zone);
Steve Blocka7e24c12009-10-30 11:49:00 +0000440 if (result.error_message != NULL) {
441 // Unable to compile regexp.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000442 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
443 CStrVector(result.error_message)).ToHandleChecked();
444 CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
Steve Blocka7e24c12009-10-30 11:49:00 +0000445 return false;
446 }
447
448 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000449 data->set(JSRegExp::code_index(is_one_byte), result.code);
Steve Blocka7e24c12009-10-30 11:49:00 +0000450 int register_max = IrregexpMaxRegisterCount(*data);
451 if (result.num_registers > register_max) {
452 SetIrregexpMaxRegisterCount(*data, result.num_registers);
453 }
454
455 return true;
456}
457
458
459int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
460 return Smi::cast(
461 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
462}
463
464
465void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
466 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
467}
468
469
470int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
471 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
472}
473
474
475int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
476 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
477}
478
479
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000480ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
481 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
Steve Blocka7e24c12009-10-30 11:49:00 +0000482}
483
484
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000485Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
486 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
Steve Blocka7e24c12009-10-30 11:49:00 +0000487}
488
489
Steve Block6ded16b2010-05-10 14:33:55 +0100490void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
491 Handle<String> pattern,
492 JSRegExp::Flags flags,
493 int capture_count) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000494 // Initialize compiled code entries to null.
Steve Block44f0eee2011-05-26 01:26:41 +0100495 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
496 JSRegExp::IRREGEXP,
497 pattern,
498 flags,
499 capture_count);
Steve Blocka7e24c12009-10-30 11:49:00 +0000500}
501
502
Steve Block6ded16b2010-05-10 14:33:55 +0100503int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
504 Handle<String> subject) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000505 subject = String::Flatten(subject);
Ben Murdoch69a99ed2011-11-30 16:03:39 +0000506
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000507 // Check representation of the underlying storage.
508 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
509 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
Ben Murdoch69a99ed2011-11-30 16:03:39 +0000510
Steve Block6ded16b2010-05-10 14:33:55 +0100511#ifdef V8_INTERPRETED_REGEXP
512 // Byte-code regexp needs space allocated for all its registers.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000513 // The result captures are copied to the start of the registers array
514 // if the match succeeds. This way those registers are not clobbered
515 // when we set the last match info from last successful match.
516 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
517 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
Steve Block6ded16b2010-05-10 14:33:55 +0100518#else // V8_INTERPRETED_REGEXP
519 // Native regexp only needs room to output captures. Registers are handled
520 // internally.
521 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
522#endif // V8_INTERPRETED_REGEXP
523}
524
525
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000526int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
527 Handle<String> subject,
528 int index,
529 int32_t* output,
530 int output_size) {
Steve Block44f0eee2011-05-26 01:26:41 +0100531 Isolate* isolate = regexp->GetIsolate();
532
533 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
Steve Block6ded16b2010-05-10 14:33:55 +0100534
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000535 DCHECK(index >= 0);
536 DCHECK(index <= subject->length());
537 DCHECK(subject->IsFlat());
Steve Block6ded16b2010-05-10 14:33:55 +0100538
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000539 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
Steve Block8defd9f2010-07-08 12:39:36 +0100540
Steve Block6ded16b2010-05-10 14:33:55 +0100541#ifndef V8_INTERPRETED_REGEXP
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000542 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
Steve Block6ded16b2010-05-10 14:33:55 +0100543 do {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000544 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
545 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
546 // The stack is used to allocate registers for the compiled regexp code.
547 // This means that in case of failure, the output registers array is left
548 // untouched and contains the capture results from the previous successful
549 // match. We can use that to set the last match info lazily.
Steve Block6ded16b2010-05-10 14:33:55 +0100550 NativeRegExpMacroAssembler::Result res =
551 NativeRegExpMacroAssembler::Match(code,
552 subject,
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000553 output,
554 output_size,
Steve Block44f0eee2011-05-26 01:26:41 +0100555 index,
556 isolate);
Steve Block6ded16b2010-05-10 14:33:55 +0100557 if (res != NativeRegExpMacroAssembler::RETRY) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000558 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
Steve Block44f0eee2011-05-26 01:26:41 +0100559 isolate->has_pending_exception());
Steve Block6ded16b2010-05-10 14:33:55 +0100560 STATIC_ASSERT(
561 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
562 STATIC_ASSERT(
563 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
564 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
565 == RE_EXCEPTION);
566 return static_cast<IrregexpResult>(res);
567 }
568 // If result is RETRY, the string has changed representation, and we
569 // must restart from scratch.
570 // In this case, it means we must make sure we are prepared to handle
Steve Block8defd9f2010-07-08 12:39:36 +0100571 // the, potentially, different subject (the string can switch between
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000572 // being internal and external, and even between being Latin1 and UC16,
Steve Block6ded16b2010-05-10 14:33:55 +0100573 // but the characters are always the same).
574 IrregexpPrepare(regexp, subject);
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000575 is_one_byte = subject->IsOneByteRepresentationUnderneath();
Steve Block6ded16b2010-05-10 14:33:55 +0100576 } while (true);
577 UNREACHABLE();
578 return RE_EXCEPTION;
579#else // V8_INTERPRETED_REGEXP
580
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000581 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
Steve Block6ded16b2010-05-10 14:33:55 +0100582 // We must have done EnsureCompiledIrregexp, so we can get the number of
583 // registers.
Steve Block6ded16b2010-05-10 14:33:55 +0100584 int number_of_capture_registers =
585 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000586 int32_t* raw_output = &output[number_of_capture_registers];
587 // We do not touch the actual capture result registers until we know there
588 // has been a match so that we can use those capture results to set the
589 // last match info.
Steve Block6ded16b2010-05-10 14:33:55 +0100590 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000591 raw_output[i] = -1;
Steve Block6ded16b2010-05-10 14:33:55 +0100592 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000593 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
594 isolate);
Steve Block6ded16b2010-05-10 14:33:55 +0100595
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100596 IrregexpResult result = IrregexpInterpreter::Match(isolate,
597 byte_codes,
598 subject,
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000599 raw_output,
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100600 index);
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000601 if (result == RE_SUCCESS) {
602 // Copy capture results to the start of the registers array.
603 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
604 }
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100605 if (result == RE_EXCEPTION) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000606 DCHECK(!isolate->has_pending_exception());
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100607 isolate->StackOverflow();
Steve Block6ded16b2010-05-10 14:33:55 +0100608 }
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100609 return result;
Steve Block6ded16b2010-05-10 14:33:55 +0100610#endif // V8_INTERPRETED_REGEXP
611}
612
613
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000614MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
615 Handle<String> subject,
616 int previous_index,
617 Handle<JSArray> last_match_info) {
618 Isolate* isolate = regexp->GetIsolate();
619 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
Steve Blocka7e24c12009-10-30 11:49:00 +0000620
621 // Prepare space for the return values.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000622#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
Steve Blocka7e24c12009-10-30 11:49:00 +0000623 if (FLAG_trace_regexp_bytecodes) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000624 String* pattern = regexp->Pattern();
625 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
626 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
Steve Blocka7e24c12009-10-30 11:49:00 +0000627 }
628#endif
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000629 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
Steve Block6ded16b2010-05-10 14:33:55 +0100630 if (required_registers < 0) {
631 // Compiling failed with an exception.
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000632 DCHECK(isolate->has_pending_exception());
633 return MaybeHandle<Object>();
Steve Blocka7e24c12009-10-30 11:49:00 +0000634 }
Steve Blocka7e24c12009-10-30 11:49:00 +0000635
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000636 int32_t* output_registers = NULL;
637 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
638 output_registers = NewArray<int32_t>(required_registers);
639 }
640 SmartArrayPointer<int32_t> auto_release(output_registers);
641 if (output_registers == NULL) {
642 output_registers = isolate->jsregexp_static_offsets_vector();
643 }
Steve Blocka7e24c12009-10-30 11:49:00 +0000644
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000645 int res = RegExpImpl::IrregexpExecRaw(
646 regexp, subject, previous_index, output_registers, required_registers);
Steve Block6ded16b2010-05-10 14:33:55 +0100647 if (res == RE_SUCCESS) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000648 int capture_count =
649 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
650 return SetLastMatchInfo(
651 last_match_info, subject, capture_count, output_registers);
Steve Blocka7e24c12009-10-30 11:49:00 +0000652 }
Steve Block6ded16b2010-05-10 14:33:55 +0100653 if (res == RE_EXCEPTION) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000654 DCHECK(isolate->has_pending_exception());
655 return MaybeHandle<Object>();
Steve Blocka7e24c12009-10-30 11:49:00 +0000656 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000657 DCHECK(res == RE_FAILURE);
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100658 return isolate->factory()->null_value();
Steve Blocka7e24c12009-10-30 11:49:00 +0000659}
660
661
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000662Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
663 Handle<String> subject,
664 int capture_count,
665 int32_t* match) {
666 DCHECK(last_match_info->HasFastObjectElements());
667 int capture_register_count = (capture_count + 1) * 2;
668 JSArray::EnsureSize(last_match_info,
669 capture_register_count + kLastMatchOverhead);
670 DisallowHeapAllocation no_allocation;
671 FixedArray* array = FixedArray::cast(last_match_info->elements());
672 if (match != NULL) {
673 for (int i = 0; i < capture_register_count; i += 2) {
674 SetCapture(array, i, match[i]);
675 SetCapture(array, i + 1, match[i + 1]);
676 }
677 }
678 SetLastCaptureCount(array, capture_register_count);
679 SetLastSubject(array, *subject);
680 SetLastInput(array, *subject);
681 return last_match_info;
682}
683
684
685RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
686 Handle<String> subject,
687 bool is_global,
688 Isolate* isolate)
689 : register_array_(NULL),
690 register_array_size_(0),
691 regexp_(regexp),
692 subject_(subject) {
693#ifdef V8_INTERPRETED_REGEXP
694 bool interpreted = true;
695#else
696 bool interpreted = false;
697#endif // V8_INTERPRETED_REGEXP
698
699 if (regexp_->TypeTag() == JSRegExp::ATOM) {
700 static const int kAtomRegistersPerMatch = 2;
701 registers_per_match_ = kAtomRegistersPerMatch;
702 // There is no distinction between interpreted and native for atom regexps.
703 interpreted = false;
704 } else {
705 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
706 if (registers_per_match_ < 0) {
707 num_matches_ = -1; // Signal exception.
708 return;
709 }
710 }
711
712 if (is_global && !interpreted) {
713 register_array_size_ =
714 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
715 max_matches_ = register_array_size_ / registers_per_match_;
716 } else {
717 // Global loop in interpreted regexp is not implemented. We choose
718 // the size of the offsets vector so that it can only store one match.
719 register_array_size_ = registers_per_match_;
720 max_matches_ = 1;
721 }
722
723 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
724 register_array_ = NewArray<int32_t>(register_array_size_);
725 } else {
726 register_array_ = isolate->jsregexp_static_offsets_vector();
727 }
728
729 // Set state so that fetching the results the first time triggers a call
730 // to the compiled regexp.
731 current_match_index_ = max_matches_ - 1;
732 num_matches_ = max_matches_;
733 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
734 DCHECK_GE(register_array_size_, registers_per_match_);
735 int32_t* last_match =
736 &register_array_[current_match_index_ * registers_per_match_];
737 last_match[0] = -1;
738 last_match[1] = 0;
739}
740
741
Steve Blocka7e24c12009-10-30 11:49:00 +0000742// -------------------------------------------------------------------
743// Implementation of the Irregexp regular expression engine.
744//
745// The Irregexp regular expression engine is intended to be a complete
746// implementation of ECMAScript regular expressions. It generates either
747// bytecodes or native code.
748
749// The Irregexp regexp engine is structured in three steps.
750// 1) The parser generates an abstract syntax tree. See ast.cc.
751// 2) From the AST a node network is created. The nodes are all
752// subclasses of RegExpNode. The nodes represent states when
753// executing a regular expression. Several optimizations are
754// performed on the node network.
755// 3) From the nodes we generate either byte codes or native code
756// that can actually execute the regular expression (perform
757// the search). The code generation step is described in more
758// detail below.
759
760// Code generation.
761//
762// The nodes are divided into four main categories.
763// * Choice nodes
764// These represent places where the regular expression can
765// match in more than one way. For example on entry to an
766// alternation (foo|bar) or a repetition (*, +, ? or {}).
767// * Action nodes
768// These represent places where some action should be
769// performed. Examples include recording the current position
770// in the input string to a register (in order to implement
771// captures) or other actions on register for example in order
772// to implement the counters needed for {} repetitions.
773// * Matching nodes
774// These attempt to match some element part of the input string.
775// Examples of elements include character classes, plain strings
776// or back references.
777// * End nodes
778// These are used to implement the actions required on finding
779// a successful match or failing to find a match.
780//
781// The code generated (whether as byte codes or native code) maintains
782// some state as it runs. This consists of the following elements:
783//
784// * The capture registers. Used for string captures.
785// * Other registers. Used for counters etc.
786// * The current position.
787// * The stack of backtracking information. Used when a matching node
788// fails to find a match and needs to try an alternative.
789//
790// Conceptual regular expression execution model:
791//
792// There is a simple conceptual model of regular expression execution
793// which will be presented first. The actual code generated is a more
794// efficient simulation of the simple conceptual model:
795//
796// * Choice nodes are implemented as follows:
797// For each choice except the last {
798// push current position
799// push backtrack code location
800// <generate code to test for choice>
801// backtrack code location:
802// pop current position
803// }
804// <generate code to test for last choice>
805//
806// * Actions nodes are generated as follows
807// <push affected registers on backtrack stack>
808// <generate code to perform action>
809// push backtrack code location
810// <generate code to test for following nodes>
811// backtrack code location:
812// <pop affected registers to restore their state>
813// <pop backtrack location from stack and go to it>
814//
815// * Matching nodes are generated as follows:
816// if input string matches at current position
817// update current position
818// <generate code to test for following nodes>
819// else
820// <pop backtrack location from stack and go to it>
821//
822// Thus it can be seen that the current position is saved and restored
823// by the choice nodes, whereas the registers are saved and restored by
824// by the action nodes that manipulate them.
825//
826// The other interesting aspect of this model is that nodes are generated
827// at the point where they are needed by a recursive call to Emit(). If
828// the node has already been code generated then the Emit() call will
829// generate a jump to the previously generated code instead. In order to
830// limit recursion it is possible for the Emit() function to put the node
831// on a work list for later generation and instead generate a jump. The
832// destination of the jump is resolved later when the code is generated.
833//
834// Actual regular expression code generation.
835//
836// Code generation is actually more complicated than the above. In order
837// to improve the efficiency of the generated code some optimizations are
838// performed
839//
840// * Choice nodes have 1-character lookahead.
841// A choice node looks at the following character and eliminates some of
842// the choices immediately based on that character. This is not yet
843// implemented.
844// * Simple greedy loops store reduced backtracking information.
845// A quantifier like /.*foo/m will greedily match the whole input. It will
846// then need to backtrack to a point where it can match "foo". The naive
847// implementation of this would push each character position onto the
848// backtracking stack, then pop them off one by one. This would use space
849// proportional to the length of the input string. However since the "."
850// can only match in one way and always has a constant length (in this case
851// of 1) it suffices to store the current position on the top of the stack
852// once. Matching now becomes merely incrementing the current position and
853// backtracking becomes decrementing the current position and checking the
854// result against the stored current position. This is faster and saves
855// space.
856// * The current state is virtualized.
857// This is used to defer expensive operations until it is clear that they
858// are needed and to generate code for a node more than once, allowing
859// specialized an efficient versions of the code to be created. This is
860// explained in the section below.
861//
862// Execution state virtualization.
863//
864// Instead of emitting code, nodes that manipulate the state can record their
865// manipulation in an object called the Trace. The Trace object can record a
866// current position offset, an optional backtrack code location on the top of
867// the virtualized backtrack stack and some register changes. When a node is
868// to be emitted it can flush the Trace or update it. Flushing the Trace
869// will emit code to bring the actual state into line with the virtual state.
Ben Murdoch3ef787d2012-04-12 10:51:47 +0100870// Avoiding flushing the state can postpone some work (e.g. updates of capture
Steve Blocka7e24c12009-10-30 11:49:00 +0000871// registers). Postponing work can save time when executing the regular
872// expression since it may be found that the work never has to be done as a
873// failure to match can occur. In addition it is much faster to jump to a
874// known backtrack code location than it is to pop an unknown backtrack
875// location from the stack and jump there.
876//
877// The virtual state found in the Trace affects code generation. For example
878// the virtual state contains the difference between the actual current
879// position and the virtual current position, and matching code needs to use
880// this offset to attempt a match in the correct location of the input
881// string. Therefore code generated for a non-trivial trace is specialized
882// to that trace. The code generator therefore has the ability to generate
883// code for each node several times. In order to limit the size of the
884// generated code there is an arbitrary limit on how many specialized sets of
885// code may be generated for a given node. If the limit is reached, the
886// trace is flushed and a generic version of the code for a node is emitted.
887// This is subsequently used for that node. The code emitted for non-generic
888// trace is not recorded in the node and so it cannot currently be reused in
889// the event that code generation is requested for an identical trace.
890
891
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000892void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000893 UNREACHABLE();
894}
895
896
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000897void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
898 text->AddElement(TextElement::Atom(this), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +0000899}
900
901
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000902void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
903 text->AddElement(TextElement::CharClass(this), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +0000904}
905
906
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000907void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000908 for (int i = 0; i < elements()->length(); i++)
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000909 text->AddElement(elements()->at(i), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +0000910}
911
912
913TextElement TextElement::Atom(RegExpAtom* atom) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000914 return TextElement(ATOM, atom);
Steve Blocka7e24c12009-10-30 11:49:00 +0000915}
916
917
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000918TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
919 return TextElement(CHAR_CLASS, char_class);
Steve Blocka7e24c12009-10-30 11:49:00 +0000920}
921
922
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000923int TextElement::length() const {
924 switch (text_type()) {
925 case ATOM:
926 return atom()->length();
927
928 case CHAR_CLASS:
929 return 1;
Steve Blocka7e24c12009-10-30 11:49:00 +0000930 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000931 UNREACHABLE();
932 return 0;
Steve Blocka7e24c12009-10-30 11:49:00 +0000933}
934
935
936DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
937 if (table_ == NULL) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000938 table_ = new(zone()) DispatchTable(zone());
939 DispatchTableConstructor cons(table_, ignore_case, zone());
Steve Blocka7e24c12009-10-30 11:49:00 +0000940 cons.BuildTable(this);
941 }
942 return table_;
943}
944
945
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000946class FrequencyCollator {
947 public:
948 FrequencyCollator() : total_samples_(0) {
949 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
950 frequencies_[i] = CharacterFrequency(i);
951 }
952 }
953
954 void CountCharacter(int character) {
955 int index = (character & RegExpMacroAssembler::kTableMask);
956 frequencies_[index].Increment();
957 total_samples_++;
958 }
959
960 // Does not measure in percent, but rather per-128 (the table size from the
961 // regexp macro assembler).
962 int Frequency(int in_character) {
963 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
964 if (total_samples_ < 1) return 1; // Division by zero.
965 int freq_in_per128 =
966 (frequencies_[in_character].counter() * 128) / total_samples_;
967 return freq_in_per128;
968 }
969
970 private:
971 class CharacterFrequency {
972 public:
973 CharacterFrequency() : counter_(0), character_(-1) { }
974 explicit CharacterFrequency(int character)
975 : counter_(0), character_(character) { }
976
977 void Increment() { counter_++; }
978 int counter() { return counter_; }
979 int character() { return character_; }
980
981 private:
982 int counter_;
983 int character_;
984 };
985
986
987 private:
988 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
989 int total_samples_;
990};
991
992
Steve Blocka7e24c12009-10-30 11:49:00 +0000993class RegExpCompiler {
994 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +0000995 RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte,
996 Zone* zone);
Steve Blocka7e24c12009-10-30 11:49:00 +0000997
998 int AllocateRegister() {
999 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1000 reg_exp_too_big_ = true;
1001 return next_register_;
1002 }
1003 return next_register_++;
1004 }
1005
1006 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1007 RegExpNode* start,
1008 int capture_count,
1009 Handle<String> pattern);
1010
1011 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1012
1013 static const int kImplementationOffset = 0;
1014 static const int kNumberOfRegistersOffset = 0;
1015 static const int kCodeOffset = 1;
1016
1017 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1018 EndNode* accept() { return accept_; }
1019
1020 static const int kMaxRecursion = 100;
1021 inline int recursion_depth() { return recursion_depth_; }
1022 inline void IncrementRecursionDepth() { recursion_depth_++; }
1023 inline void DecrementRecursionDepth() { recursion_depth_--; }
1024
1025 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1026
1027 inline bool ignore_case() { return ignore_case_; }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001028 inline bool one_byte() { return one_byte_; }
1029 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
Steve Blocka7e24c12009-10-30 11:49:00 +00001030
Ben Murdoch257744e2011-11-30 15:57:28 +00001031 int current_expansion_factor() { return current_expansion_factor_; }
1032 void set_current_expansion_factor(int value) {
1033 current_expansion_factor_ = value;
1034 }
1035
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001036 Zone* zone() const { return zone_; }
1037
Steve Blocka7e24c12009-10-30 11:49:00 +00001038 static const int kNoRegister = -1;
Ben Murdoch3fb3ca82011-12-02 17:19:32 +00001039
Steve Blocka7e24c12009-10-30 11:49:00 +00001040 private:
1041 EndNode* accept_;
1042 int next_register_;
1043 List<RegExpNode*>* work_list_;
1044 int recursion_depth_;
1045 RegExpMacroAssembler* macro_assembler_;
1046 bool ignore_case_;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001047 bool one_byte_;
Steve Blocka7e24c12009-10-30 11:49:00 +00001048 bool reg_exp_too_big_;
Ben Murdoch257744e2011-11-30 15:57:28 +00001049 int current_expansion_factor_;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001050 FrequencyCollator frequency_collator_;
1051 Zone* zone_;
Steve Blocka7e24c12009-10-30 11:49:00 +00001052};
1053
1054
1055class RecursionCheck {
1056 public:
1057 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1058 compiler->IncrementRecursionDepth();
1059 }
1060 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1061 private:
1062 RegExpCompiler* compiler_;
1063};
1064
1065
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001066static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1067 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
Steve Blocka7e24c12009-10-30 11:49:00 +00001068}
1069
1070
1071// Attempts to compile the regexp using an Irregexp code generator. Returns
1072// a fixed array or a null handle depending on whether it succeeded.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001073RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case,
1074 bool one_byte, Zone* zone)
Steve Blocka7e24c12009-10-30 11:49:00 +00001075 : next_register_(2 * (capture_count + 1)),
1076 work_list_(NULL),
1077 recursion_depth_(0),
1078 ignore_case_(ignore_case),
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001079 one_byte_(one_byte),
Ben Murdoch257744e2011-11-30 15:57:28 +00001080 reg_exp_too_big_(false),
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001081 current_expansion_factor_(1),
1082 frequency_collator_(),
1083 zone_(zone) {
1084 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1085 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
Steve Blocka7e24c12009-10-30 11:49:00 +00001086}
1087
1088
1089RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1090 RegExpMacroAssembler* macro_assembler,
1091 RegExpNode* start,
1092 int capture_count,
1093 Handle<String> pattern) {
Steve Block053d10c2011-06-13 19:13:29 +01001094 Heap* heap = pattern->GetHeap();
1095
1096 bool use_slow_safe_regexp_compiler = false;
1097 if (heap->total_regexp_code_generated() >
1098 RegExpImpl::kRegWxpCompiledLimit &&
1099 heap->isolate()->memory_allocator()->SizeExecutable() >
1100 RegExpImpl::kRegExpExecutableMemoryLimit) {
1101 use_slow_safe_regexp_compiler = true;
1102 }
1103
1104 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1105
Steve Blocka7e24c12009-10-30 11:49:00 +00001106#ifdef DEBUG
1107 if (FLAG_trace_regexp_assembler)
1108 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1109 else
1110#endif
1111 macro_assembler_ = macro_assembler;
Steve Block053d10c2011-06-13 19:13:29 +01001112
Steve Blocka7e24c12009-10-30 11:49:00 +00001113 List <RegExpNode*> work_list(0);
1114 work_list_ = &work_list;
1115 Label fail;
1116 macro_assembler_->PushBacktrack(&fail);
1117 Trace new_trace;
1118 start->Emit(this, &new_trace);
1119 macro_assembler_->Bind(&fail);
1120 macro_assembler_->Fail();
1121 while (!work_list.is_empty()) {
1122 work_list.RemoveLast()->Emit(this, &new_trace);
1123 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001124 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
Steve Blocka7e24c12009-10-30 11:49:00 +00001125
Steve Block053d10c2011-06-13 19:13:29 +01001126 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1127 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
Steve Blocka7e24c12009-10-30 11:49:00 +00001128 work_list_ = NULL;
1129#ifdef DEBUG
Steve Block44f0eee2011-05-26 01:26:41 +01001130 if (FLAG_print_code) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001131 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1132 OFStream os(trace_scope.file());
1133 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
Steve Block44f0eee2011-05-26 01:26:41 +01001134 }
Steve Blocka7e24c12009-10-30 11:49:00 +00001135 if (FLAG_trace_regexp_assembler) {
1136 delete macro_assembler_;
1137 }
1138#endif
1139 return RegExpEngine::CompilationResult(*code, next_register_);
1140}
1141
1142
1143bool Trace::DeferredAction::Mentions(int that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001144 if (action_type() == ActionNode::CLEAR_CAPTURES) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001145 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1146 return range.Contains(that);
1147 } else {
1148 return reg() == that;
1149 }
1150}
1151
1152
1153bool Trace::mentions_reg(int reg) {
1154 for (DeferredAction* action = actions_;
1155 action != NULL;
1156 action = action->next()) {
1157 if (action->Mentions(reg))
1158 return true;
1159 }
1160 return false;
1161}
1162
1163
1164bool Trace::GetStoredPosition(int reg, int* cp_offset) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001165 DCHECK_EQ(0, *cp_offset);
Steve Blocka7e24c12009-10-30 11:49:00 +00001166 for (DeferredAction* action = actions_;
1167 action != NULL;
1168 action = action->next()) {
1169 if (action->Mentions(reg)) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001170 if (action->action_type() == ActionNode::STORE_POSITION) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001171 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1172 return true;
1173 } else {
1174 return false;
1175 }
1176 }
1177 }
1178 return false;
1179}
1180
1181
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001182int Trace::FindAffectedRegisters(OutSet* affected_registers,
1183 Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001184 int max_register = RegExpCompiler::kNoRegister;
1185 for (DeferredAction* action = actions_;
1186 action != NULL;
1187 action = action->next()) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001188 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001189 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1190 for (int i = range.from(); i <= range.to(); i++)
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001191 affected_registers->Set(i, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00001192 if (range.to() > max_register) max_register = range.to();
1193 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001194 affected_registers->Set(action->reg(), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00001195 if (action->reg() > max_register) max_register = action->reg();
1196 }
1197 }
1198 return max_register;
1199}
1200
1201
1202void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1203 int max_register,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001204 const OutSet& registers_to_pop,
1205 const OutSet& registers_to_clear) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001206 for (int reg = max_register; reg >= 0; reg--) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001207 if (registers_to_pop.Get(reg)) {
1208 assembler->PopRegister(reg);
1209 } else if (registers_to_clear.Get(reg)) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001210 int clear_to = reg;
1211 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1212 reg--;
1213 }
1214 assembler->ClearRegisters(reg, clear_to);
1215 }
1216 }
1217}
1218
1219
1220void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1221 int max_register,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001222 const OutSet& affected_registers,
Steve Blocka7e24c12009-10-30 11:49:00 +00001223 OutSet* registers_to_pop,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001224 OutSet* registers_to_clear,
1225 Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001226 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1227 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1228
1229 // Count pushes performed to force a stack limit check occasionally.
1230 int pushes = 0;
1231
1232 for (int reg = 0; reg <= max_register; reg++) {
1233 if (!affected_registers.Get(reg)) {
1234 continue;
1235 }
1236
1237 // The chronologically first deferred action in the trace
1238 // is used to infer the action needed to restore a register
1239 // to its previous state (or not, if it's safe to ignore it).
1240 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1241 DeferredActionUndoType undo_action = IGNORE;
1242
1243 int value = 0;
1244 bool absolute = false;
1245 bool clear = false;
1246 int store_position = -1;
1247 // This is a little tricky because we are scanning the actions in reverse
1248 // historical order (newest first).
1249 for (DeferredAction* action = actions_;
1250 action != NULL;
1251 action = action->next()) {
1252 if (action->Mentions(reg)) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001253 switch (action->action_type()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001254 case ActionNode::SET_REGISTER: {
1255 Trace::DeferredSetRegister* psr =
1256 static_cast<Trace::DeferredSetRegister*>(action);
1257 if (!absolute) {
1258 value += psr->value();
1259 absolute = true;
1260 }
1261 // SET_REGISTER is currently only used for newly introduced loop
1262 // counters. They can have a significant previous value if they
1263 // occour in a loop. TODO(lrn): Propagate this information, so
1264 // we can set undo_action to IGNORE if we know there is no value to
1265 // restore.
1266 undo_action = RESTORE;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001267 DCHECK_EQ(store_position, -1);
1268 DCHECK(!clear);
Steve Blocka7e24c12009-10-30 11:49:00 +00001269 break;
1270 }
1271 case ActionNode::INCREMENT_REGISTER:
1272 if (!absolute) {
1273 value++;
1274 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001275 DCHECK_EQ(store_position, -1);
1276 DCHECK(!clear);
Steve Blocka7e24c12009-10-30 11:49:00 +00001277 undo_action = RESTORE;
1278 break;
1279 case ActionNode::STORE_POSITION: {
1280 Trace::DeferredCapture* pc =
1281 static_cast<Trace::DeferredCapture*>(action);
1282 if (!clear && store_position == -1) {
1283 store_position = pc->cp_offset();
1284 }
1285
1286 // For captures we know that stores and clears alternate.
1287 // Other register, are never cleared, and if the occur
1288 // inside a loop, they might be assigned more than once.
1289 if (reg <= 1) {
1290 // Registers zero and one, aka "capture zero", is
1291 // always set correctly if we succeed. There is no
1292 // need to undo a setting on backtrack, because we
1293 // will set it again or fail.
1294 undo_action = IGNORE;
1295 } else {
1296 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1297 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001298 DCHECK(!absolute);
1299 DCHECK_EQ(value, 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00001300 break;
1301 }
1302 case ActionNode::CLEAR_CAPTURES: {
1303 // Since we're scanning in reverse order, if we've already
1304 // set the position we have to ignore historically earlier
1305 // clearing operations.
1306 if (store_position == -1) {
1307 clear = true;
1308 }
1309 undo_action = RESTORE;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001310 DCHECK(!absolute);
1311 DCHECK_EQ(value, 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00001312 break;
1313 }
1314 default:
1315 UNREACHABLE();
1316 break;
1317 }
1318 }
1319 }
1320 // Prepare for the undo-action (e.g., push if it's going to be popped).
1321 if (undo_action == RESTORE) {
1322 pushes++;
1323 RegExpMacroAssembler::StackCheckFlag stack_check =
1324 RegExpMacroAssembler::kNoStackLimitCheck;
1325 if (pushes == push_limit) {
1326 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1327 pushes = 0;
1328 }
1329
1330 assembler->PushRegister(reg, stack_check);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001331 registers_to_pop->Set(reg, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00001332 } else if (undo_action == CLEAR) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001333 registers_to_clear->Set(reg, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00001334 }
1335 // Perform the chronologically last action (or accumulated increment)
1336 // for the register.
1337 if (store_position != -1) {
1338 assembler->WriteCurrentPositionToRegister(reg, store_position);
1339 } else if (clear) {
1340 assembler->ClearRegisters(reg, reg);
1341 } else if (absolute) {
1342 assembler->SetRegister(reg, value);
1343 } else if (value != 0) {
1344 assembler->AdvanceRegister(reg, value);
1345 }
1346 }
1347}
1348
1349
1350// This is called as we come into a loop choice node and some other tricky
1351// nodes. It normalizes the state of the code generator to ensure we can
1352// generate generic code.
1353void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1354 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1355
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001356 DCHECK(!is_trivial());
Steve Blocka7e24c12009-10-30 11:49:00 +00001357
1358 if (actions_ == NULL && backtrack() == NULL) {
1359 // Here we just have some deferred cp advances to fix and we are back to
1360 // a normal situation. We may also have to forget some information gained
1361 // through a quick check that was already performed.
1362 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1363 // Create a new trivial state and generate the node with that.
1364 Trace new_state;
1365 successor->Emit(compiler, &new_state);
1366 return;
1367 }
1368
1369 // Generate deferred actions here along with code to undo them again.
1370 OutSet affected_registers;
1371
1372 if (backtrack() != NULL) {
1373 // Here we have a concrete backtrack location. These are set up by choice
1374 // nodes and so they indicate that we have a deferred save of the current
1375 // position which we may need to emit here.
1376 assembler->PushCurrentPosition();
1377 }
1378
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001379 int max_register = FindAffectedRegisters(&affected_registers,
1380 compiler->zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00001381 OutSet registers_to_pop;
1382 OutSet registers_to_clear;
1383 PerformDeferredActions(assembler,
1384 max_register,
1385 affected_registers,
1386 &registers_to_pop,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001387 &registers_to_clear,
1388 compiler->zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00001389 if (cp_offset_ != 0) {
1390 assembler->AdvanceCurrentPosition(cp_offset_);
1391 }
1392
1393 // Create a new trivial state and generate the node with that.
1394 Label undo;
1395 assembler->PushBacktrack(&undo);
1396 Trace new_state;
1397 successor->Emit(compiler, &new_state);
1398
1399 // On backtrack we need to restore state.
1400 assembler->Bind(&undo);
1401 RestoreAffectedRegisters(assembler,
1402 max_register,
1403 registers_to_pop,
1404 registers_to_clear);
1405 if (backtrack() == NULL) {
1406 assembler->Backtrack();
1407 } else {
1408 assembler->PopCurrentPosition();
1409 assembler->GoTo(backtrack());
1410 }
1411}
1412
1413
1414void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1415 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1416
1417 // Omit flushing the trace. We discard the entire stack frame anyway.
1418
1419 if (!label()->is_bound()) {
1420 // We are completely independent of the trace, since we ignore it,
1421 // so this code can be used as the generic version.
1422 assembler->Bind(label());
1423 }
1424
1425 // Throw away everything on the backtrack stack since the start
1426 // of the negative submatch and restore the character position.
1427 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1428 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1429 if (clear_capture_count_ > 0) {
1430 // Clear any captures that might have been performed during the success
1431 // of the body of the negative look-ahead.
1432 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1433 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1434 }
1435 // Now that we have unwound the stack we find at the top of the stack the
1436 // backtrack that the BeginSubmatch node got.
1437 assembler->Backtrack();
1438}
1439
1440
1441void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1442 if (!trace->is_trivial()) {
1443 trace->Flush(compiler, this);
1444 return;
1445 }
1446 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1447 if (!label()->is_bound()) {
1448 assembler->Bind(label());
1449 }
1450 switch (action_) {
1451 case ACCEPT:
1452 assembler->Succeed();
1453 return;
1454 case BACKTRACK:
1455 assembler->GoTo(trace->backtrack());
1456 return;
1457 case NEGATIVE_SUBMATCH_SUCCESS:
1458 // This case is handled in a different virtual method.
1459 UNREACHABLE();
1460 }
1461 UNIMPLEMENTED();
1462}
1463
1464
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001465void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001466 if (guards_ == NULL)
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001467 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1468 guards_->Add(guard, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00001469}
1470
1471
1472ActionNode* ActionNode::SetRegister(int reg,
1473 int val,
1474 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001475 ActionNode* result =
1476 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001477 result->data_.u_store_register.reg = reg;
1478 result->data_.u_store_register.value = val;
1479 return result;
1480}
1481
1482
1483ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001484 ActionNode* result =
1485 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001486 result->data_.u_increment_register.reg = reg;
1487 return result;
1488}
1489
1490
1491ActionNode* ActionNode::StorePosition(int reg,
1492 bool is_capture,
1493 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001494 ActionNode* result =
1495 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001496 result->data_.u_position_register.reg = reg;
1497 result->data_.u_position_register.is_capture = is_capture;
1498 return result;
1499}
1500
1501
1502ActionNode* ActionNode::ClearCaptures(Interval range,
1503 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001504 ActionNode* result =
1505 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001506 result->data_.u_clear_captures.range_from = range.from();
1507 result->data_.u_clear_captures.range_to = range.to();
1508 return result;
1509}
1510
1511
1512ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1513 int position_reg,
1514 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001515 ActionNode* result =
1516 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001517 result->data_.u_submatch.stack_pointer_register = stack_reg;
1518 result->data_.u_submatch.current_position_register = position_reg;
1519 return result;
1520}
1521
1522
1523ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1524 int position_reg,
1525 int clear_register_count,
1526 int clear_register_from,
1527 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001528 ActionNode* result =
1529 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001530 result->data_.u_submatch.stack_pointer_register = stack_reg;
1531 result->data_.u_submatch.current_position_register = position_reg;
1532 result->data_.u_submatch.clear_register_count = clear_register_count;
1533 result->data_.u_submatch.clear_register_from = clear_register_from;
1534 return result;
1535}
1536
1537
1538ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1539 int repetition_register,
1540 int repetition_limit,
1541 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001542 ActionNode* result =
1543 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00001544 result->data_.u_empty_match_check.start_register = start_register;
1545 result->data_.u_empty_match_check.repetition_register = repetition_register;
1546 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1547 return result;
1548}
1549
1550
1551#define DEFINE_ACCEPT(Type) \
1552 void Type##Node::Accept(NodeVisitor* visitor) { \
1553 visitor->Visit##Type(this); \
1554 }
1555FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1556#undef DEFINE_ACCEPT
1557
1558
1559void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1560 visitor->VisitLoopChoice(this);
1561}
1562
1563
1564// -------------------------------------------------------------------
1565// Emit code.
1566
1567
1568void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1569 Guard* guard,
1570 Trace* trace) {
1571 switch (guard->op()) {
1572 case Guard::LT:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001573 DCHECK(!trace->mentions_reg(guard->reg()));
Steve Blocka7e24c12009-10-30 11:49:00 +00001574 macro_assembler->IfRegisterGE(guard->reg(),
1575 guard->value(),
1576 trace->backtrack());
1577 break;
1578 case Guard::GEQ:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001579 DCHECK(!trace->mentions_reg(guard->reg()));
Steve Blocka7e24c12009-10-30 11:49:00 +00001580 macro_assembler->IfRegisterLT(guard->reg(),
1581 guard->value(),
1582 trace->backtrack());
1583 break;
1584 }
1585}
1586
1587
Steve Blocka7e24c12009-10-30 11:49:00 +00001588// Returns the number of characters in the equivalence class, omitting those
1589// that cannot occur in the source string because it is ASCII.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001590static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1591 bool one_byte_subject,
Steve Blocka7e24c12009-10-30 11:49:00 +00001592 unibrow::uchar* letters) {
Steve Block44f0eee2011-05-26 01:26:41 +01001593 int length =
1594 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
Ben Murdochbb769b22010-08-11 14:56:33 +01001595 // Unibrow returns 0 or 1 for characters where case independence is
Steve Blocka7e24c12009-10-30 11:49:00 +00001596 // trivial.
1597 if (length == 0) {
1598 letters[0] = character;
1599 length = 1;
1600 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001601 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001602 return length;
1603 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001604
Steve Blocka7e24c12009-10-30 11:49:00 +00001605 // The standard requires that non-ASCII characters cannot have ASCII
1606 // character codes in their equivalence class.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001607 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
1608 // is it? For example, \u00C5 is equivalent to \u212B.
Steve Blocka7e24c12009-10-30 11:49:00 +00001609 return 0;
1610}
1611
1612
Steve Block44f0eee2011-05-26 01:26:41 +01001613static inline bool EmitSimpleCharacter(Isolate* isolate,
1614 RegExpCompiler* compiler,
Steve Blocka7e24c12009-10-30 11:49:00 +00001615 uc16 c,
1616 Label* on_failure,
1617 int cp_offset,
1618 bool check,
1619 bool preloaded) {
1620 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1621 bool bound_checked = false;
1622 if (!preloaded) {
1623 assembler->LoadCurrentCharacter(
1624 cp_offset,
1625 on_failure,
1626 check);
1627 bound_checked = true;
1628 }
1629 assembler->CheckNotCharacter(c, on_failure);
1630 return bound_checked;
1631}
1632
1633
1634// Only emits non-letters (things that don't have case). Only used for case
1635// independent matches.
Steve Block44f0eee2011-05-26 01:26:41 +01001636static inline bool EmitAtomNonLetter(Isolate* isolate,
1637 RegExpCompiler* compiler,
Steve Blocka7e24c12009-10-30 11:49:00 +00001638 uc16 c,
1639 Label* on_failure,
1640 int cp_offset,
1641 bool check,
1642 bool preloaded) {
1643 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001644 bool one_byte = compiler->one_byte();
Steve Blocka7e24c12009-10-30 11:49:00 +00001645 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001646 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
Steve Blocka7e24c12009-10-30 11:49:00 +00001647 if (length < 1) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001648 // This can't match. Must be an one-byte subject and a non-one-byte
1649 // character. We do not need to do anything since the one-byte pass
1650 // already handled this.
Steve Blocka7e24c12009-10-30 11:49:00 +00001651 return false; // Bounds not checked.
1652 }
1653 bool checked = false;
1654 // We handle the length > 1 case in a later pass.
1655 if (length == 1) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001656 if (one_byte && c > String::kMaxOneByteCharCodeU) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001657 // Can't match - see above.
1658 return false; // Bounds not checked.
1659 }
1660 if (!preloaded) {
1661 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1662 checked = check;
1663 }
1664 macro_assembler->CheckNotCharacter(c, on_failure);
1665 }
1666 return checked;
1667}
1668
1669
1670static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001671 bool one_byte, uc16 c1, uc16 c2,
Steve Blocka7e24c12009-10-30 11:49:00 +00001672 Label* on_failure) {
1673 uc16 char_mask;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001674 if (one_byte) {
1675 char_mask = String::kMaxOneByteCharCode;
Steve Blocka7e24c12009-10-30 11:49:00 +00001676 } else {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01001677 char_mask = String::kMaxUtf16CodeUnit;
Steve Blocka7e24c12009-10-30 11:49:00 +00001678 }
1679 uc16 exor = c1 ^ c2;
1680 // Check whether exor has only one bit set.
1681 if (((exor - 1) & exor) == 0) {
1682 // If c1 and c2 differ only by one bit.
1683 // Ecma262UnCanonicalize always gives the highest number last.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001684 DCHECK(c2 > c1);
Steve Blocka7e24c12009-10-30 11:49:00 +00001685 uc16 mask = char_mask ^ exor;
1686 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1687 return true;
1688 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001689 DCHECK(c2 > c1);
Steve Blocka7e24c12009-10-30 11:49:00 +00001690 uc16 diff = c2 - c1;
1691 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1692 // If the characters differ by 2^n but don't differ by one bit then
1693 // subtract the difference from the found character, then do the or
1694 // trick. We avoid the theoretical case where negative numbers are
1695 // involved in order to simplify code generation.
1696 uc16 mask = char_mask ^ diff;
1697 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1698 diff,
1699 mask,
1700 on_failure);
1701 return true;
1702 }
1703 return false;
1704}
1705
1706
Steve Block44f0eee2011-05-26 01:26:41 +01001707typedef bool EmitCharacterFunction(Isolate* isolate,
1708 RegExpCompiler* compiler,
Steve Blocka7e24c12009-10-30 11:49:00 +00001709 uc16 c,
1710 Label* on_failure,
1711 int cp_offset,
1712 bool check,
1713 bool preloaded);
1714
1715// Only emits letters (things that have case). Only used for case independent
1716// matches.
Steve Block44f0eee2011-05-26 01:26:41 +01001717static inline bool EmitAtomLetter(Isolate* isolate,
1718 RegExpCompiler* compiler,
Steve Blocka7e24c12009-10-30 11:49:00 +00001719 uc16 c,
1720 Label* on_failure,
1721 int cp_offset,
1722 bool check,
1723 bool preloaded) {
1724 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001725 bool one_byte = compiler->one_byte();
Steve Blocka7e24c12009-10-30 11:49:00 +00001726 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001727 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
Steve Blocka7e24c12009-10-30 11:49:00 +00001728 if (length <= 1) return false;
1729 // We may not need to check against the end of the input string
1730 // if this character lies before a character that matched.
1731 if (!preloaded) {
1732 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1733 }
1734 Label ok;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001735 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
Steve Blocka7e24c12009-10-30 11:49:00 +00001736 switch (length) {
1737 case 2: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001738 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1739 chars[1], on_failure)) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001740 } else {
1741 macro_assembler->CheckCharacter(chars[0], &ok);
1742 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1743 macro_assembler->Bind(&ok);
1744 }
1745 break;
1746 }
1747 case 4:
1748 macro_assembler->CheckCharacter(chars[3], &ok);
1749 // Fall through!
1750 case 3:
1751 macro_assembler->CheckCharacter(chars[0], &ok);
1752 macro_assembler->CheckCharacter(chars[1], &ok);
1753 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1754 macro_assembler->Bind(&ok);
1755 break;
1756 default:
1757 UNREACHABLE();
1758 break;
1759 }
1760 return true;
1761}
1762
1763
Ben Murdochb8a8cc12014-11-26 15:28:44 +00001764static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1765 int border,
1766 Label* fall_through,
1767 Label* above_or_equal,
1768 Label* below) {
1769 if (below != fall_through) {
1770 masm->CheckCharacterLT(border, below);
1771 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1772 } else {
1773 masm->CheckCharacterGT(border - 1, above_or_equal);
1774 }
1775}
1776
1777
1778static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1779 int first,
1780 int last,
1781 Label* fall_through,
1782 Label* in_range,
1783 Label* out_of_range) {
1784 if (in_range == fall_through) {
1785 if (first == last) {
1786 masm->CheckNotCharacter(first, out_of_range);
1787 } else {
1788 masm->CheckCharacterNotInRange(first, last, out_of_range);
1789 }
1790 } else {
1791 if (first == last) {
1792 masm->CheckCharacter(first, in_range);
1793 } else {
1794 masm->CheckCharacterInRange(first, last, in_range);
1795 }
1796 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1797 }
1798}
1799
1800
1801// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1802// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1803static void EmitUseLookupTable(
1804 RegExpMacroAssembler* masm,
1805 ZoneList<int>* ranges,
1806 int start_index,
1807 int end_index,
1808 int min_char,
1809 Label* fall_through,
1810 Label* even_label,
1811 Label* odd_label) {
1812 static const int kSize = RegExpMacroAssembler::kTableSize;
1813 static const int kMask = RegExpMacroAssembler::kTableMask;
1814
1815 int base = (min_char & ~kMask);
1816 USE(base);
1817
1818 // Assert that everything is on one kTableSize page.
1819 for (int i = start_index; i <= end_index; i++) {
1820 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1821 }
1822 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1823
1824 char templ[kSize];
1825 Label* on_bit_set;
1826 Label* on_bit_clear;
1827 int bit;
1828 if (even_label == fall_through) {
1829 on_bit_set = odd_label;
1830 on_bit_clear = even_label;
1831 bit = 1;
1832 } else {
1833 on_bit_set = even_label;
1834 on_bit_clear = odd_label;
1835 bit = 0;
1836 }
1837 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1838 templ[i] = bit;
1839 }
1840 int j = 0;
1841 bit ^= 1;
1842 for (int i = start_index; i < end_index; i++) {
1843 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1844 templ[j] = bit;
1845 }
1846 bit ^= 1;
1847 }
1848 for (int i = j; i < kSize; i++) {
1849 templ[i] = bit;
1850 }
1851 Factory* factory = masm->zone()->isolate()->factory();
1852 // TODO(erikcorry): Cache these.
1853 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1854 for (int i = 0; i < kSize; i++) {
1855 ba->set(i, templ[i]);
1856 }
1857 masm->CheckBitInTable(ba, on_bit_set);
1858 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1859}
1860
1861
1862static void CutOutRange(RegExpMacroAssembler* masm,
1863 ZoneList<int>* ranges,
1864 int start_index,
1865 int end_index,
1866 int cut_index,
1867 Label* even_label,
1868 Label* odd_label) {
1869 bool odd = (((cut_index - start_index) & 1) == 1);
1870 Label* in_range_label = odd ? odd_label : even_label;
1871 Label dummy;
1872 EmitDoubleBoundaryTest(masm,
1873 ranges->at(cut_index),
1874 ranges->at(cut_index + 1) - 1,
1875 &dummy,
1876 in_range_label,
1877 &dummy);
1878 DCHECK(!dummy.is_linked());
1879 // Cut out the single range by rewriting the array. This creates a new
1880 // range that is a merger of the two ranges on either side of the one we
1881 // are cutting out. The oddity of the labels is preserved.
1882 for (int j = cut_index; j > start_index; j--) {
1883 ranges->at(j) = ranges->at(j - 1);
1884 }
1885 for (int j = cut_index + 1; j < end_index; j++) {
1886 ranges->at(j) = ranges->at(j + 1);
1887 }
1888}
1889
1890
1891// Unicode case. Split the search space into kSize spaces that are handled
1892// with recursion.
1893static void SplitSearchSpace(ZoneList<int>* ranges,
1894 int start_index,
1895 int end_index,
1896 int* new_start_index,
1897 int* new_end_index,
1898 int* border) {
1899 static const int kSize = RegExpMacroAssembler::kTableSize;
1900 static const int kMask = RegExpMacroAssembler::kTableMask;
1901
1902 int first = ranges->at(start_index);
1903 int last = ranges->at(end_index) - 1;
1904
1905 *new_start_index = start_index;
1906 *border = (ranges->at(start_index) & ~kMask) + kSize;
1907 while (*new_start_index < end_index) {
1908 if (ranges->at(*new_start_index) > *border) break;
1909 (*new_start_index)++;
1910 }
1911 // new_start_index is the index of the first edge that is beyond the
1912 // current kSize space.
1913
1914 // For very large search spaces we do a binary chop search of the non-Latin1
1915 // space instead of just going to the end of the current kSize space. The
1916 // heuristics are complicated a little by the fact that any 128-character
1917 // encoding space can be quickly tested with a table lookup, so we don't
1918 // wish to do binary chop search at a smaller granularity than that. A
1919 // 128-character space can take up a lot of space in the ranges array if,
1920 // for example, we only want to match every second character (eg. the lower
1921 // case characters on some Unicode pages).
1922 int binary_chop_index = (end_index + start_index) / 2;
1923 // The first test ensures that we get to the code that handles the Latin1
1924 // range with a single not-taken branch, speeding up this important
1925 // character range (even non-Latin1 charset-based text has spaces and
1926 // punctuation).
1927 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1928 end_index - start_index > (*new_start_index - start_index) * 2 &&
1929 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1930 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1931 int scan_forward_for_section_border = binary_chop_index;;
1932 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1933
1934 while (scan_forward_for_section_border < end_index) {
1935 if (ranges->at(scan_forward_for_section_border) > new_border) {
1936 *new_start_index = scan_forward_for_section_border;
1937 *border = new_border;
1938 break;
1939 }
1940 scan_forward_for_section_border++;
1941 }
1942 }
1943
1944 DCHECK(*new_start_index > start_index);
1945 *new_end_index = *new_start_index - 1;
1946 if (ranges->at(*new_end_index) == *border) {
1947 (*new_end_index)--;
1948 }
1949 if (*border >= ranges->at(end_index)) {
1950 *border = ranges->at(end_index);
1951 *new_start_index = end_index; // Won't be used.
1952 *new_end_index = end_index - 1;
1953 }
1954}
1955
1956
1957// Gets a series of segment boundaries representing a character class. If the
1958// character is in the range between an even and an odd boundary (counting from
1959// start_index) then go to even_label, otherwise go to odd_label. We already
1960// know that the character is in the range of min_char to max_char inclusive.
1961// Either label can be NULL indicating backtracking. Either label can also be
1962// equal to the fall_through label.
1963static void GenerateBranches(RegExpMacroAssembler* masm,
1964 ZoneList<int>* ranges,
1965 int start_index,
1966 int end_index,
1967 uc16 min_char,
1968 uc16 max_char,
1969 Label* fall_through,
1970 Label* even_label,
1971 Label* odd_label) {
1972 int first = ranges->at(start_index);
1973 int last = ranges->at(end_index) - 1;
1974
1975 DCHECK_LT(min_char, first);
1976
1977 // Just need to test if the character is before or on-or-after
1978 // a particular character.
1979 if (start_index == end_index) {
1980 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1981 return;
1982 }
1983
1984 // Another almost trivial case: There is one interval in the middle that is
1985 // different from the end intervals.
1986 if (start_index + 1 == end_index) {
1987 EmitDoubleBoundaryTest(
1988 masm, first, last, fall_through, even_label, odd_label);
1989 return;
1990 }
1991
1992 // It's not worth using table lookup if there are very few intervals in the
1993 // character class.
1994 if (end_index - start_index <= 6) {
1995 // It is faster to test for individual characters, so we look for those
1996 // first, then try arbitrary ranges in the second round.
1997 static int kNoCutIndex = -1;
1998 int cut = kNoCutIndex;
1999 for (int i = start_index; i < end_index; i++) {
2000 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2001 cut = i;
2002 break;
2003 }
2004 }
2005 if (cut == kNoCutIndex) cut = start_index;
2006 CutOutRange(
2007 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2008 DCHECK_GE(end_index - start_index, 2);
2009 GenerateBranches(masm,
2010 ranges,
2011 start_index + 1,
2012 end_index - 1,
2013 min_char,
2014 max_char,
2015 fall_through,
2016 even_label,
2017 odd_label);
2018 return;
2019 }
2020
2021 // If there are a lot of intervals in the regexp, then we will use tables to
2022 // determine whether the character is inside or outside the character class.
2023 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2024
2025 if ((max_char >> kBits) == (min_char >> kBits)) {
2026 EmitUseLookupTable(masm,
2027 ranges,
2028 start_index,
2029 end_index,
2030 min_char,
2031 fall_through,
2032 even_label,
2033 odd_label);
2034 return;
2035 }
2036
2037 if ((min_char >> kBits) != (first >> kBits)) {
2038 masm->CheckCharacterLT(first, odd_label);
2039 GenerateBranches(masm,
2040 ranges,
2041 start_index + 1,
2042 end_index,
2043 first,
2044 max_char,
2045 fall_through,
2046 odd_label,
2047 even_label);
2048 return;
2049 }
2050
2051 int new_start_index = 0;
2052 int new_end_index = 0;
2053 int border = 0;
2054
2055 SplitSearchSpace(ranges,
2056 start_index,
2057 end_index,
2058 &new_start_index,
2059 &new_end_index,
2060 &border);
2061
2062 Label handle_rest;
2063 Label* above = &handle_rest;
2064 if (border == last + 1) {
2065 // We didn't find any section that started after the limit, so everything
2066 // above the border is one of the terminal labels.
2067 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2068 DCHECK(new_end_index == end_index - 1);
2069 }
2070
2071 DCHECK_LE(start_index, new_end_index);
2072 DCHECK_LE(new_start_index, end_index);
2073 DCHECK_LT(start_index, new_start_index);
2074 DCHECK_LT(new_end_index, end_index);
2075 DCHECK(new_end_index + 1 == new_start_index ||
2076 (new_end_index + 2 == new_start_index &&
2077 border == ranges->at(new_end_index + 1)));
2078 DCHECK_LT(min_char, border - 1);
2079 DCHECK_LT(border, max_char);
2080 DCHECK_LT(ranges->at(new_end_index), border);
2081 DCHECK(border < ranges->at(new_start_index) ||
2082 (border == ranges->at(new_start_index) &&
2083 new_start_index == end_index &&
2084 new_end_index == end_index - 1 &&
2085 border == last + 1));
2086 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2087
2088 masm->CheckCharacterGT(border - 1, above);
2089 Label dummy;
2090 GenerateBranches(masm,
2091 ranges,
2092 start_index,
2093 new_end_index,
2094 min_char,
2095 border - 1,
2096 &dummy,
2097 even_label,
2098 odd_label);
2099 if (handle_rest.is_linked()) {
2100 masm->Bind(&handle_rest);
2101 bool flip = (new_start_index & 1) != (start_index & 1);
2102 GenerateBranches(masm,
2103 ranges,
2104 new_start_index,
2105 end_index,
2106 border,
2107 max_char,
2108 &dummy,
2109 flip ? odd_label : even_label,
2110 flip ? even_label : odd_label);
2111 }
2112}
2113
2114
Steve Blocka7e24c12009-10-30 11:49:00 +00002115static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002116 RegExpCharacterClass* cc, bool one_byte,
2117 Label* on_failure, int cp_offset, bool check_offset,
2118 bool preloaded, Zone* zone) {
2119 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2120 if (!CharacterRange::IsCanonical(ranges)) {
2121 CharacterRange::Canonicalize(ranges);
2122 }
2123
Steve Blocka7e24c12009-10-30 11:49:00 +00002124 int max_char;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002125 if (one_byte) {
2126 max_char = String::kMaxOneByteCharCode;
Steve Blocka7e24c12009-10-30 11:49:00 +00002127 } else {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01002128 max_char = String::kMaxUtf16CodeUnit;
Steve Blocka7e24c12009-10-30 11:49:00 +00002129 }
2130
Steve Blocka7e24c12009-10-30 11:49:00 +00002131 int range_count = ranges->length();
2132
2133 int last_valid_range = range_count - 1;
2134 while (last_valid_range >= 0) {
2135 CharacterRange& range = ranges->at(last_valid_range);
2136 if (range.from() <= max_char) {
2137 break;
2138 }
2139 last_valid_range--;
2140 }
2141
2142 if (last_valid_range < 0) {
2143 if (!cc->is_negated()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00002144 macro_assembler->GoTo(on_failure);
2145 }
2146 if (check_offset) {
2147 macro_assembler->CheckPosition(cp_offset, on_failure);
2148 }
2149 return;
2150 }
2151
2152 if (last_valid_range == 0 &&
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002153 ranges->at(0).IsEverything(max_char)) {
2154 if (cc->is_negated()) {
2155 macro_assembler->GoTo(on_failure);
2156 } else {
2157 // This is a common case hit by non-anchored expressions.
2158 if (check_offset) {
2159 macro_assembler->CheckPosition(cp_offset, on_failure);
2160 }
2161 }
2162 return;
2163 }
2164 if (last_valid_range == 0 &&
Steve Blocka7e24c12009-10-30 11:49:00 +00002165 !cc->is_negated() &&
2166 ranges->at(0).IsEverything(max_char)) {
2167 // This is a common case hit by non-anchored expressions.
2168 if (check_offset) {
2169 macro_assembler->CheckPosition(cp_offset, on_failure);
2170 }
2171 return;
2172 }
2173
2174 if (!preloaded) {
2175 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2176 }
2177
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002178 if (cc->is_standard(zone) &&
Leon Clarkee46be812010-01-19 14:06:41 +00002179 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2180 on_failure)) {
2181 return;
2182 }
2183
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002184
2185 // A new list with ascending entries. Each entry is a code unit
2186 // where there is a boundary between code units that are part of
2187 // the class and code units that are not. Normally we insert an
2188 // entry at zero which goes to the failure label, but if there
2189 // was already one there we fall through for success on that entry.
2190 // Subsequent entries have alternating meaning (success/failure).
2191 ZoneList<int>* range_boundaries =
2192 new(zone) ZoneList<int>(last_valid_range, zone);
2193
2194 bool zeroth_entry_is_failure = !cc->is_negated();
2195
2196 for (int i = 0; i <= last_valid_range; i++) {
Steve Blocka7e24c12009-10-30 11:49:00 +00002197 CharacterRange& range = ranges->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002198 if (range.from() == 0) {
2199 DCHECK_EQ(i, 0);
2200 zeroth_entry_is_failure = !zeroth_entry_is_failure;
Steve Blocka7e24c12009-10-30 11:49:00 +00002201 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002202 range_boundaries->Add(range.from(), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00002203 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002204 range_boundaries->Add(range.to() + 1, zone);
2205 }
2206 int end_index = range_boundaries->length() - 1;
2207 if (range_boundaries->at(end_index) > max_char) {
2208 end_index--;
Steve Blocka7e24c12009-10-30 11:49:00 +00002209 }
2210
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002211 Label fall_through;
2212 GenerateBranches(macro_assembler,
2213 range_boundaries,
2214 0, // start_index.
2215 end_index,
2216 0, // min_char.
2217 max_char,
2218 &fall_through,
2219 zeroth_entry_is_failure ? &fall_through : on_failure,
2220 zeroth_entry_is_failure ? on_failure : &fall_through);
2221 macro_assembler->Bind(&fall_through);
Steve Blocka7e24c12009-10-30 11:49:00 +00002222}
2223
2224
2225RegExpNode::~RegExpNode() {
2226}
2227
2228
2229RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2230 Trace* trace) {
2231 // If we are generating a greedy loop then don't stop and don't reuse code.
2232 if (trace->stop_node() != NULL) {
2233 return CONTINUE;
2234 }
2235
2236 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2237 if (trace->is_trivial()) {
2238 if (label_.is_bound()) {
2239 // We are being asked to generate a generic version, but that's already
2240 // been done so just go to it.
2241 macro_assembler->GoTo(&label_);
2242 return DONE;
2243 }
2244 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2245 // To avoid too deep recursion we push the node to the work queue and just
2246 // generate a goto here.
2247 compiler->AddWork(this);
2248 macro_assembler->GoTo(&label_);
2249 return DONE;
2250 }
2251 // Generate generic version of the node and bind the label for later use.
2252 macro_assembler->Bind(&label_);
2253 return CONTINUE;
2254 }
2255
2256 // We are being asked to make a non-generic version. Keep track of how many
2257 // non-generic versions we generate so as not to overdo it.
2258 trace_count_++;
2259 if (FLAG_regexp_optimization &&
2260 trace_count_ < kMaxCopiesCodeGenerated &&
2261 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2262 return CONTINUE;
2263 }
2264
2265 // If we get here code has been generated for this node too many times or
2266 // recursion is too deep. Time to switch to a generic version. The code for
2267 // generic versions above can handle deep recursion properly.
2268 trace->Flush(compiler, this);
2269 return DONE;
2270}
2271
2272
Ben Murdochb0fe1622011-05-05 13:52:32 +01002273int ActionNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002274 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002275 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002276 if (budget <= 0) return 0;
2277 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
Ben Murdochb0fe1622011-05-05 13:52:32 +01002278 return on_success()->EatsAtLeast(still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002279 budget - 1,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002280 not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002281}
2282
2283
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002284void ActionNode::FillInBMInfo(int offset,
2285 int budget,
2286 BoyerMooreLookahead* bm,
2287 bool not_at_start) {
2288 if (action_type_ == BEGIN_SUBMATCH) {
2289 bm->SetRest(offset);
2290 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2291 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2292 }
2293 SaveBMInfo(bm, not_at_start, offset);
2294}
2295
2296
Ben Murdochb0fe1622011-05-05 13:52:32 +01002297int AssertionNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002298 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002299 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002300 if (budget <= 0) return 0;
Ben Murdochb0fe1622011-05-05 13:52:32 +01002301 // If we know we are not at the start and we are asked "how many characters
2302 // will you match if you succeed?" then we can answer anything since false
2303 // implies false. So lets just return the max answer (still_to_find) since
2304 // that won't prevent us from preloading a lot of characters for the other
2305 // branches in the node graph.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002306 if (assertion_type() == AT_START && not_at_start) return still_to_find;
Ben Murdochb0fe1622011-05-05 13:52:32 +01002307 return on_success()->EatsAtLeast(still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002308 budget - 1,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002309 not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002310}
2311
2312
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002313void AssertionNode::FillInBMInfo(int offset,
2314 int budget,
2315 BoyerMooreLookahead* bm,
2316 bool not_at_start) {
2317 // Match the behaviour of EatsAtLeast on this node.
2318 if (assertion_type() == AT_START && not_at_start) return;
2319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2320 SaveBMInfo(bm, not_at_start, offset);
2321}
2322
2323
Ben Murdochb0fe1622011-05-05 13:52:32 +01002324int BackReferenceNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002325 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002326 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002327 if (budget <= 0) return 0;
Ben Murdochb0fe1622011-05-05 13:52:32 +01002328 return on_success()->EatsAtLeast(still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002329 budget - 1,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002330 not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002331}
2332
2333
Ben Murdochb0fe1622011-05-05 13:52:32 +01002334int TextNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002335 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002336 bool not_at_start) {
Steve Blocka7e24c12009-10-30 11:49:00 +00002337 int answer = Length();
2338 if (answer >= still_to_find) return answer;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002339 if (budget <= 0) return answer;
Ben Murdochb0fe1622011-05-05 13:52:32 +01002340 // We are not at start after this node so we set the last argument to 'true'.
Steve Blocka7e24c12009-10-30 11:49:00 +00002341 return answer + on_success()->EatsAtLeast(still_to_find - answer,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002342 budget - 1,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002343 true);
Steve Blocka7e24c12009-10-30 11:49:00 +00002344}
2345
2346
Leon Clarkee46be812010-01-19 14:06:41 +00002347int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002348 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002349 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002350 if (budget <= 0) return 0;
Steve Blocka7e24c12009-10-30 11:49:00 +00002351 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2352 // afterwards.
2353 RegExpNode* node = alternatives_->at(1).node();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002354 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002355}
2356
2357
2358void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2359 QuickCheckDetails* details,
2360 RegExpCompiler* compiler,
2361 int filled_in,
2362 bool not_at_start) {
2363 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2364 // afterwards.
2365 RegExpNode* node = alternatives_->at(1).node();
2366 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2367}
2368
2369
2370int ChoiceNode::EatsAtLeastHelper(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002371 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002372 RegExpNode* ignore_this_node,
2373 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002374 if (budget <= 0) return 0;
Steve Blocka7e24c12009-10-30 11:49:00 +00002375 int min = 100;
2376 int choice_count = alternatives_->length();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002377 budget = (budget - 1) / choice_count;
Steve Blocka7e24c12009-10-30 11:49:00 +00002378 for (int i = 0; i < choice_count; i++) {
2379 RegExpNode* node = alternatives_->at(i).node();
2380 if (node == ignore_this_node) continue;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002381 int node_eats_at_least =
2382 node->EatsAtLeast(still_to_find, budget, not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002383 if (node_eats_at_least < min) min = node_eats_at_least;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002384 if (min == 0) return 0;
Steve Blocka7e24c12009-10-30 11:49:00 +00002385 }
2386 return min;
2387}
2388
2389
Ben Murdochb0fe1622011-05-05 13:52:32 +01002390int LoopChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002391 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002392 bool not_at_start) {
2393 return EatsAtLeastHelper(still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002394 budget - 1,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002395 loop_node_,
2396 not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002397}
2398
2399
Ben Murdochb0fe1622011-05-05 13:52:32 +01002400int ChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002401 int budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002402 bool not_at_start) {
2403 return EatsAtLeastHelper(still_to_find,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002404 budget,
Ben Murdochb0fe1622011-05-05 13:52:32 +01002405 NULL,
2406 not_at_start);
Steve Blocka7e24c12009-10-30 11:49:00 +00002407}
2408
2409
2410// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2411static inline uint32_t SmearBitsRight(uint32_t v) {
2412 v |= v >> 1;
2413 v |= v >> 2;
2414 v |= v >> 4;
2415 v |= v >> 8;
2416 v |= v >> 16;
2417 return v;
2418}
2419
2420
2421bool QuickCheckDetails::Rationalize(bool asc) {
2422 bool found_useful_op = false;
2423 uint32_t char_mask;
2424 if (asc) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002425 char_mask = String::kMaxOneByteCharCode;
Steve Blocka7e24c12009-10-30 11:49:00 +00002426 } else {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01002427 char_mask = String::kMaxUtf16CodeUnit;
Steve Blocka7e24c12009-10-30 11:49:00 +00002428 }
2429 mask_ = 0;
2430 value_ = 0;
2431 int char_shift = 0;
2432 for (int i = 0; i < characters_; i++) {
2433 Position* pos = &positions_[i];
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002434 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
Steve Blocka7e24c12009-10-30 11:49:00 +00002435 found_useful_op = true;
2436 }
2437 mask_ |= (pos->mask & char_mask) << char_shift;
2438 value_ |= (pos->value & char_mask) << char_shift;
2439 char_shift += asc ? 8 : 16;
2440 }
2441 return found_useful_op;
2442}
2443
2444
2445bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002446 Trace* bounds_check_trace,
Steve Blocka7e24c12009-10-30 11:49:00 +00002447 Trace* trace,
2448 bool preload_has_checked_bounds,
2449 Label* on_possible_success,
2450 QuickCheckDetails* details,
2451 bool fall_through_on_failure) {
2452 if (details->characters() == 0) return false;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002453 GetQuickCheckDetails(
2454 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00002455 if (details->cannot_match()) return false;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002456 if (!details->Rationalize(compiler->one_byte())) return false;
2457 DCHECK(details->characters() == 1 ||
Steve Blocka7e24c12009-10-30 11:49:00 +00002458 compiler->macro_assembler()->CanReadUnaligned());
2459 uint32_t mask = details->mask();
2460 uint32_t value = details->value();
2461
2462 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2463
2464 if (trace->characters_preloaded() != details->characters()) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002465 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2466 // We are attempting to preload the minimum number of characters
2467 // any choice would eat, so if the bounds check fails, then none of the
2468 // choices can succeed, so we can just immediately backtrack, rather
2469 // than go to the next choice.
Steve Blocka7e24c12009-10-30 11:49:00 +00002470 assembler->LoadCurrentCharacter(trace->cp_offset(),
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002471 bounds_check_trace->backtrack(),
Steve Blocka7e24c12009-10-30 11:49:00 +00002472 !preload_has_checked_bounds,
2473 details->characters());
2474 }
2475
2476
2477 bool need_mask = true;
2478
2479 if (details->characters() == 1) {
2480 // If number of characters preloaded is 1 then we used a byte or 16 bit
2481 // load so the value is already masked down.
2482 uint32_t char_mask;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002483 if (compiler->one_byte()) {
2484 char_mask = String::kMaxOneByteCharCode;
Steve Blocka7e24c12009-10-30 11:49:00 +00002485 } else {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01002486 char_mask = String::kMaxUtf16CodeUnit;
Steve Blocka7e24c12009-10-30 11:49:00 +00002487 }
2488 if ((mask & char_mask) == char_mask) need_mask = false;
2489 mask &= char_mask;
2490 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002491 // For 2-character preloads in one-byte mode or 1-character preloads in
2492 // two-byte mode we also use a 16 bit load with zero extend.
2493 if (details->characters() == 2 && compiler->one_byte()) {
2494 if ((mask & 0xffff) == 0xffff) need_mask = false;
2495 } else if (details->characters() == 1 && !compiler->one_byte()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00002496 if ((mask & 0xffff) == 0xffff) need_mask = false;
2497 } else {
2498 if (mask == 0xffffffff) need_mask = false;
2499 }
2500 }
2501
2502 if (fall_through_on_failure) {
2503 if (need_mask) {
2504 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2505 } else {
2506 assembler->CheckCharacter(value, on_possible_success);
2507 }
2508 } else {
2509 if (need_mask) {
2510 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2511 } else {
2512 assembler->CheckNotCharacter(value, trace->backtrack());
2513 }
2514 }
2515 return true;
2516}
2517
2518
2519// Here is the meat of GetQuickCheckDetails (see also the comment on the
2520// super-class in the .h file).
2521//
2522// We iterate along the text object, building up for each character a
2523// mask and value that can be used to test for a quick failure to match.
2524// The masks and values for the positions will be combined into a single
2525// machine word for the current character width in order to be used in
2526// generating a quick check.
2527void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2528 RegExpCompiler* compiler,
2529 int characters_filled_in,
2530 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002531 Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2532 DCHECK(characters_filled_in < details->characters());
Steve Blocka7e24c12009-10-30 11:49:00 +00002533 int characters = details->characters();
2534 int char_mask;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002535 if (compiler->one_byte()) {
2536 char_mask = String::kMaxOneByteCharCode;
Steve Blocka7e24c12009-10-30 11:49:00 +00002537 } else {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01002538 char_mask = String::kMaxUtf16CodeUnit;
Steve Blocka7e24c12009-10-30 11:49:00 +00002539 }
2540 for (int k = 0; k < elms_->length(); k++) {
2541 TextElement elm = elms_->at(k);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002542 if (elm.text_type() == TextElement::ATOM) {
2543 Vector<const uc16> quarks = elm.atom()->data();
Steve Blocka7e24c12009-10-30 11:49:00 +00002544 for (int i = 0; i < characters && i < quarks.length(); i++) {
2545 QuickCheckDetails::Position* pos =
2546 details->positions(characters_filled_in);
2547 uc16 c = quarks[i];
2548 if (c > char_mask) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002549 // If we expect a non-Latin1 character from an one-byte string,
2550 // there is no way we can match. Not even case-independent
2551 // matching can turn an Latin1 character into non-Latin1 or
Steve Blocka7e24c12009-10-30 11:49:00 +00002552 // vice versa.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002553 // TODO(dcarney): issue 3550. Verify that this works as expected.
2554 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
Steve Blocka7e24c12009-10-30 11:49:00 +00002555 details->set_cannot_match();
2556 pos->determines_perfectly = false;
2557 return;
2558 }
2559 if (compiler->ignore_case()) {
2560 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002561 int length = GetCaseIndependentLetters(isolate, c,
2562 compiler->one_byte(), chars);
2563 DCHECK(length != 0); // Can only happen if c > char_mask (see above).
Steve Blocka7e24c12009-10-30 11:49:00 +00002564 if (length == 1) {
2565 // This letter has no case equivalents, so it's nice and simple
2566 // and the mask-compare will determine definitely whether we have
2567 // a match at this character position.
2568 pos->mask = char_mask;
2569 pos->value = c;
2570 pos->determines_perfectly = true;
2571 } else {
2572 uint32_t common_bits = char_mask;
2573 uint32_t bits = chars[0];
2574 for (int j = 1; j < length; j++) {
2575 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2576 common_bits ^= differing_bits;
2577 bits &= common_bits;
2578 }
2579 // If length is 2 and common bits has only one zero in it then
2580 // our mask and compare instruction will determine definitely
2581 // whether we have a match at this character position. Otherwise
2582 // it can only be an approximate check.
2583 uint32_t one_zero = (common_bits | ~char_mask);
2584 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2585 pos->determines_perfectly = true;
2586 }
2587 pos->mask = common_bits;
2588 pos->value = bits;
2589 }
2590 } else {
2591 // Don't ignore case. Nice simple case where the mask-compare will
2592 // determine definitely whether we have a match at this character
2593 // position.
2594 pos->mask = char_mask;
2595 pos->value = c;
2596 pos->determines_perfectly = true;
2597 }
2598 characters_filled_in++;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002599 DCHECK(characters_filled_in <= details->characters());
Steve Blocka7e24c12009-10-30 11:49:00 +00002600 if (characters_filled_in == details->characters()) {
2601 return;
2602 }
2603 }
2604 } else {
2605 QuickCheckDetails::Position* pos =
2606 details->positions(characters_filled_in);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002607 RegExpCharacterClass* tree = elm.char_class();
2608 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00002609 if (tree->is_negated()) {
2610 // A quick check uses multi-character mask and compare. There is no
2611 // useful way to incorporate a negative char class into this scheme
2612 // so we just conservatively create a mask and value that will always
2613 // succeed.
2614 pos->mask = 0;
2615 pos->value = 0;
2616 } else {
2617 int first_range = 0;
2618 while (ranges->at(first_range).from() > char_mask) {
2619 first_range++;
2620 if (first_range == ranges->length()) {
2621 details->set_cannot_match();
2622 pos->determines_perfectly = false;
2623 return;
2624 }
2625 }
2626 CharacterRange range = ranges->at(first_range);
2627 uc16 from = range.from();
2628 uc16 to = range.to();
2629 if (to > char_mask) {
2630 to = char_mask;
2631 }
2632 uint32_t differing_bits = (from ^ to);
2633 // A mask and compare is only perfect if the differing bits form a
2634 // number like 00011111 with one single block of trailing 1s.
2635 if ((differing_bits & (differing_bits + 1)) == 0 &&
2636 from + differing_bits == to) {
2637 pos->determines_perfectly = true;
2638 }
2639 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2640 uint32_t bits = (from & common_bits);
2641 for (int i = first_range + 1; i < ranges->length(); i++) {
2642 CharacterRange range = ranges->at(i);
2643 uc16 from = range.from();
2644 uc16 to = range.to();
2645 if (from > char_mask) continue;
2646 if (to > char_mask) to = char_mask;
2647 // Here we are combining more ranges into the mask and compare
2648 // value. With each new range the mask becomes more sparse and
2649 // so the chances of a false positive rise. A character class
2650 // with multiple ranges is assumed never to be equivalent to a
2651 // mask and compare operation.
2652 pos->determines_perfectly = false;
2653 uint32_t new_common_bits = (from ^ to);
2654 new_common_bits = ~SmearBitsRight(new_common_bits);
2655 common_bits &= new_common_bits;
2656 bits &= new_common_bits;
2657 uint32_t differing_bits = (from & common_bits) ^ bits;
2658 common_bits ^= differing_bits;
2659 bits &= common_bits;
2660 }
2661 pos->mask = common_bits;
2662 pos->value = bits;
2663 }
2664 characters_filled_in++;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002665 DCHECK(characters_filled_in <= details->characters());
Steve Blocka7e24c12009-10-30 11:49:00 +00002666 if (characters_filled_in == details->characters()) {
2667 return;
2668 }
2669 }
2670 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002671 DCHECK(characters_filled_in != details->characters());
2672 if (!details->cannot_match()) {
2673 on_success()-> GetQuickCheckDetails(details,
2674 compiler,
2675 characters_filled_in,
2676 true);
2677 }
Steve Blocka7e24c12009-10-30 11:49:00 +00002678}
2679
2680
2681void QuickCheckDetails::Clear() {
2682 for (int i = 0; i < characters_; i++) {
2683 positions_[i].mask = 0;
2684 positions_[i].value = 0;
2685 positions_[i].determines_perfectly = false;
2686 }
2687 characters_ = 0;
2688}
2689
2690
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002691void QuickCheckDetails::Advance(int by, bool one_byte) {
2692 DCHECK(by >= 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00002693 if (by >= characters_) {
2694 Clear();
2695 return;
2696 }
2697 for (int i = 0; i < characters_ - by; i++) {
2698 positions_[i] = positions_[by + i];
2699 }
2700 for (int i = characters_ - by; i < characters_; i++) {
2701 positions_[i].mask = 0;
2702 positions_[i].value = 0;
2703 positions_[i].determines_perfectly = false;
2704 }
2705 characters_ -= by;
2706 // We could change mask_ and value_ here but we would never advance unless
2707 // they had already been used in a check and they won't be used again because
2708 // it would gain us nothing. So there's no point.
2709}
2710
2711
2712void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002713 DCHECK(characters_ == other->characters_);
Steve Blocka7e24c12009-10-30 11:49:00 +00002714 if (other->cannot_match_) {
2715 return;
2716 }
2717 if (cannot_match_) {
2718 *this = *other;
2719 return;
2720 }
2721 for (int i = from_index; i < characters_; i++) {
2722 QuickCheckDetails::Position* pos = positions(i);
2723 QuickCheckDetails::Position* other_pos = other->positions(i);
2724 if (pos->mask != other_pos->mask ||
2725 pos->value != other_pos->value ||
2726 !other_pos->determines_perfectly) {
2727 // Our mask-compare operation will be approximate unless we have the
2728 // exact same operation on both sides of the alternation.
2729 pos->determines_perfectly = false;
2730 }
2731 pos->mask &= other_pos->mask;
2732 pos->value &= pos->mask;
2733 other_pos->value &= pos->mask;
2734 uc16 differing_bits = (pos->value ^ other_pos->value);
2735 pos->mask &= ~differing_bits;
2736 pos->value &= pos->mask;
2737 }
2738}
2739
2740
2741class VisitMarker {
2742 public:
2743 explicit VisitMarker(NodeInfo* info) : info_(info) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002744 DCHECK(!info->visited);
Steve Blocka7e24c12009-10-30 11:49:00 +00002745 info->visited = true;
2746 }
2747 ~VisitMarker() {
2748 info_->visited = false;
2749 }
2750 private:
2751 NodeInfo* info_;
2752};
2753
2754
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002755RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2756 if (info()->replacement_calculated) return replacement();
2757 if (depth < 0) return this;
2758 DCHECK(!info()->visited);
2759 VisitMarker marker(info());
2760 return FilterSuccessor(depth - 1, ignore_case);
2761}
2762
2763
2764RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2765 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2766 if (next == NULL) return set_replacement(NULL);
2767 on_success_ = next;
2768 return set_replacement(this);
2769}
2770
2771
2772// We need to check for the following characters: 0x39c 0x3bc 0x178.
2773static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2774 // TODO(dcarney): this could be a lot more efficient.
2775 return range.Contains(0x39c) ||
2776 range.Contains(0x3bc) || range.Contains(0x178);
2777}
2778
2779
2780static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2781 for (int i = 0; i < ranges->length(); i++) {
2782 // TODO(dcarney): this could be a lot more efficient.
2783 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2784 }
2785 return false;
2786}
2787
2788
2789RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2790 if (info()->replacement_calculated) return replacement();
2791 if (depth < 0) return this;
2792 DCHECK(!info()->visited);
2793 VisitMarker marker(info());
2794 int element_count = elms_->length();
2795 for (int i = 0; i < element_count; i++) {
2796 TextElement elm = elms_->at(i);
2797 if (elm.text_type() == TextElement::ATOM) {
2798 Vector<const uc16> quarks = elm.atom()->data();
2799 for (int j = 0; j < quarks.length(); j++) {
2800 uint16_t c = quarks[j];
2801 if (c <= String::kMaxOneByteCharCode) continue;
2802 if (!ignore_case) return set_replacement(NULL);
2803 // Here, we need to check for characters whose upper and lower cases
2804 // are outside the Latin-1 range.
2805 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2806 // Character is outside Latin-1 completely
2807 if (converted == 0) return set_replacement(NULL);
2808 // Convert quark to Latin-1 in place.
2809 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2810 copy[j] = converted;
2811 }
2812 } else {
2813 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2814 RegExpCharacterClass* cc = elm.char_class();
2815 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2816 if (!CharacterRange::IsCanonical(ranges)) {
2817 CharacterRange::Canonicalize(ranges);
2818 }
2819 // Now they are in order so we only need to look at the first.
2820 int range_count = ranges->length();
2821 if (cc->is_negated()) {
2822 if (range_count != 0 &&
2823 ranges->at(0).from() == 0 &&
2824 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2825 // This will be handled in a later filter.
2826 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2827 return set_replacement(NULL);
2828 }
2829 } else {
2830 if (range_count == 0 ||
2831 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2832 // This will be handled in a later filter.
2833 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2834 return set_replacement(NULL);
2835 }
2836 }
2837 }
2838 }
2839 return FilterSuccessor(depth - 1, ignore_case);
2840}
2841
2842
2843RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2844 if (info()->replacement_calculated) return replacement();
2845 if (depth < 0) return this;
2846 if (info()->visited) return this;
2847 {
2848 VisitMarker marker(info());
2849
2850 RegExpNode* continue_replacement =
2851 continue_node_->FilterOneByte(depth - 1, ignore_case);
2852 // If we can't continue after the loop then there is no sense in doing the
2853 // loop.
2854 if (continue_replacement == NULL) return set_replacement(NULL);
2855 }
2856
2857 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2858}
2859
2860
2861RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2862 if (info()->replacement_calculated) return replacement();
2863 if (depth < 0) return this;
2864 if (info()->visited) return this;
2865 VisitMarker marker(info());
2866 int choice_count = alternatives_->length();
2867
2868 for (int i = 0; i < choice_count; i++) {
2869 GuardedAlternative alternative = alternatives_->at(i);
2870 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2871 set_replacement(this);
2872 return this;
2873 }
2874 }
2875
2876 int surviving = 0;
2877 RegExpNode* survivor = NULL;
2878 for (int i = 0; i < choice_count; i++) {
2879 GuardedAlternative alternative = alternatives_->at(i);
2880 RegExpNode* replacement =
2881 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2882 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2883 if (replacement != NULL) {
2884 alternatives_->at(i).set_node(replacement);
2885 surviving++;
2886 survivor = replacement;
2887 }
2888 }
2889 if (surviving < 2) return set_replacement(survivor);
2890
2891 set_replacement(this);
2892 if (surviving == choice_count) {
2893 return this;
2894 }
2895 // Only some of the nodes survived the filtering. We need to rebuild the
2896 // alternatives list.
2897 ZoneList<GuardedAlternative>* new_alternatives =
2898 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2899 for (int i = 0; i < choice_count; i++) {
2900 RegExpNode* replacement =
2901 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2902 if (replacement != NULL) {
2903 alternatives_->at(i).set_node(replacement);
2904 new_alternatives->Add(alternatives_->at(i), zone());
2905 }
2906 }
2907 alternatives_ = new_alternatives;
2908 return this;
2909}
2910
2911
2912RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth,
2913 bool ignore_case) {
2914 if (info()->replacement_calculated) return replacement();
2915 if (depth < 0) return this;
2916 if (info()->visited) return this;
2917 VisitMarker marker(info());
2918 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2919 // afterwards.
2920 RegExpNode* node = alternatives_->at(1).node();
2921 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2922 if (replacement == NULL) return set_replacement(NULL);
2923 alternatives_->at(1).set_node(replacement);
2924
2925 RegExpNode* neg_node = alternatives_->at(0).node();
2926 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2927 // If the negative lookahead is always going to fail then
2928 // we don't need to check it.
2929 if (neg_replacement == NULL) return set_replacement(replacement);
2930 alternatives_->at(0).set_node(neg_replacement);
2931 return set_replacement(this);
2932}
2933
2934
Steve Blocka7e24c12009-10-30 11:49:00 +00002935void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2936 RegExpCompiler* compiler,
2937 int characters_filled_in,
2938 bool not_at_start) {
2939 if (body_can_be_zero_length_ || info()->visited) return;
2940 VisitMarker marker(info());
2941 return ChoiceNode::GetQuickCheckDetails(details,
2942 compiler,
2943 characters_filled_in,
2944 not_at_start);
2945}
2946
2947
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002948void LoopChoiceNode::FillInBMInfo(int offset,
2949 int budget,
2950 BoyerMooreLookahead* bm,
2951 bool not_at_start) {
2952 if (body_can_be_zero_length_ || budget <= 0) {
2953 bm->SetRest(offset);
2954 SaveBMInfo(bm, not_at_start, offset);
2955 return;
2956 }
2957 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2958 SaveBMInfo(bm, not_at_start, offset);
2959}
2960
2961
Steve Blocka7e24c12009-10-30 11:49:00 +00002962void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2963 RegExpCompiler* compiler,
2964 int characters_filled_in,
2965 bool not_at_start) {
2966 not_at_start = (not_at_start || not_at_start_);
2967 int choice_count = alternatives_->length();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00002968 DCHECK(choice_count > 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00002969 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2970 compiler,
2971 characters_filled_in,
2972 not_at_start);
2973 for (int i = 1; i < choice_count; i++) {
2974 QuickCheckDetails new_details(details->characters());
2975 RegExpNode* node = alternatives_->at(i).node();
2976 node->GetQuickCheckDetails(&new_details, compiler,
2977 characters_filled_in,
2978 not_at_start);
2979 // Here we merge the quick match details of the two branches.
2980 details->Merge(&new_details, characters_filled_in);
2981 }
2982}
2983
2984
2985// Check for [0-9A-Z_a-z].
2986static void EmitWordCheck(RegExpMacroAssembler* assembler,
2987 Label* word,
2988 Label* non_word,
2989 bool fall_through_on_word) {
Leon Clarkee46be812010-01-19 14:06:41 +00002990 if (assembler->CheckSpecialCharacterClass(
2991 fall_through_on_word ? 'w' : 'W',
2992 fall_through_on_word ? non_word : word)) {
2993 // Optimized implementation available.
2994 return;
2995 }
Steve Blocka7e24c12009-10-30 11:49:00 +00002996 assembler->CheckCharacterGT('z', non_word);
2997 assembler->CheckCharacterLT('0', non_word);
2998 assembler->CheckCharacterGT('a' - 1, word);
2999 assembler->CheckCharacterLT('9' + 1, word);
3000 assembler->CheckCharacterLT('A', non_word);
3001 assembler->CheckCharacterLT('Z' + 1, word);
3002 if (fall_through_on_word) {
3003 assembler->CheckNotCharacter('_', non_word);
3004 } else {
3005 assembler->CheckCharacter('_', word);
3006 }
3007}
3008
3009
3010// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3011// that matches newline or the start of input).
3012static void EmitHat(RegExpCompiler* compiler,
3013 RegExpNode* on_success,
3014 Trace* trace) {
3015 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3016 // We will be loading the previous character into the current character
3017 // register.
3018 Trace new_trace(*trace);
3019 new_trace.InvalidateCurrentCharacter();
3020
3021 Label ok;
3022 if (new_trace.cp_offset() == 0) {
3023 // The start of input counts as a newline in this context, so skip to
3024 // ok if we are at the start.
3025 assembler->CheckAtStart(&ok);
3026 }
3027 // We already checked that we are not at the start of input so it must be
3028 // OK to load the previous character.
3029 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3030 new_trace.backtrack(),
3031 false);
Leon Clarkee46be812010-01-19 14:06:41 +00003032 if (!assembler->CheckSpecialCharacterClass('n',
3033 new_trace.backtrack())) {
3034 // Newline means \n, \r, 0x2028 or 0x2029.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003035 if (!compiler->one_byte()) {
Leon Clarkee46be812010-01-19 14:06:41 +00003036 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3037 }
3038 assembler->CheckCharacter('\n', &ok);
3039 assembler->CheckNotCharacter('\r', new_trace.backtrack());
Steve Blocka7e24c12009-10-30 11:49:00 +00003040 }
Steve Blocka7e24c12009-10-30 11:49:00 +00003041 assembler->Bind(&ok);
3042 on_success->Emit(compiler, &new_trace);
3043}
3044
3045
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003046// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3047void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
Leon Clarkee46be812010-01-19 14:06:41 +00003048 RegExpMacroAssembler* assembler = compiler->macro_assembler();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003049 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3050 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3051 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3052 if (lookahead == NULL) {
3053 int eats_at_least =
3054 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3055 kRecursionBudget,
3056 not_at_start));
3057 if (eats_at_least >= 1) {
3058 BoyerMooreLookahead* bm =
3059 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3060 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3061 if (bm->at(0)->is_non_word())
3062 next_is_word_character = Trace::FALSE_VALUE;
3063 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3064 }
3065 } else {
3066 if (lookahead->at(0)->is_non_word())
3067 next_is_word_character = Trace::FALSE_VALUE;
3068 if (lookahead->at(0)->is_word())
3069 next_is_word_character = Trace::TRUE_VALUE;
Leon Clarkee46be812010-01-19 14:06:41 +00003070 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003071 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3072 if (next_is_word_character == Trace::UNKNOWN) {
3073 Label before_non_word;
3074 Label before_word;
3075 if (trace->characters_preloaded() != 1) {
3076 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3077 }
3078 // Fall through on non-word.
3079 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3080 // Next character is not a word character.
3081 assembler->Bind(&before_non_word);
3082 Label ok;
3083 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3084 assembler->GoTo(&ok);
Leon Clarkee46be812010-01-19 14:06:41 +00003085
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003086 assembler->Bind(&before_word);
3087 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3088 assembler->Bind(&ok);
3089 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3090 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3091 } else {
3092 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3094 }
Leon Clarkee46be812010-01-19 14:06:41 +00003095}
3096
3097
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003098void AssertionNode::BacktrackIfPrevious(
3099 RegExpCompiler* compiler,
3100 Trace* trace,
3101 AssertionNode::IfPrevious backtrack_if_previous) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003102 RegExpMacroAssembler* assembler = compiler->macro_assembler();
Steve Blocka7e24c12009-10-30 11:49:00 +00003103 Trace new_trace(*trace);
3104 new_trace.InvalidateCurrentCharacter();
3105
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003106 Label fall_through, dummy;
Steve Blocka7e24c12009-10-30 11:49:00 +00003107
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003108 Label* non_word = backtrack_if_previous == kIsNonWord ?
3109 new_trace.backtrack() :
3110 &fall_through;
3111 Label* word = backtrack_if_previous == kIsNonWord ?
3112 &fall_through :
3113 new_trace.backtrack();
3114
Steve Blocka7e24c12009-10-30 11:49:00 +00003115 if (new_trace.cp_offset() == 0) {
3116 // The start of input counts as a non-word character, so the question is
3117 // decided if we are at the start.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003118 assembler->CheckAtStart(non_word);
Steve Blocka7e24c12009-10-30 11:49:00 +00003119 }
3120 // We already checked that we are not at the start of input so it must be
3121 // OK to load the previous character.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003122 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3123 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
Steve Blocka7e24c12009-10-30 11:49:00 +00003124
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003125 assembler->Bind(&fall_through);
3126 on_success()->Emit(compiler, &new_trace);
Steve Blocka7e24c12009-10-30 11:49:00 +00003127}
3128
3129
3130void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3131 RegExpCompiler* compiler,
3132 int filled_in,
3133 bool not_at_start) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003134 if (assertion_type_ == AT_START && not_at_start) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003135 details->set_cannot_match();
3136 return;
3137 }
3138 return on_success()->GetQuickCheckDetails(details,
3139 compiler,
3140 filled_in,
3141 not_at_start);
3142}
3143
3144
3145void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3146 RegExpMacroAssembler* assembler = compiler->macro_assembler();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003147 switch (assertion_type_) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003148 case AT_END: {
3149 Label ok;
3150 assembler->CheckPosition(trace->cp_offset(), &ok);
3151 assembler->GoTo(trace->backtrack());
3152 assembler->Bind(&ok);
3153 break;
3154 }
3155 case AT_START: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003156 if (trace->at_start() == Trace::FALSE_VALUE) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003157 assembler->GoTo(trace->backtrack());
3158 return;
3159 }
3160 if (trace->at_start() == Trace::UNKNOWN) {
3161 assembler->CheckNotAtStart(trace->backtrack());
3162 Trace at_start_trace = *trace;
3163 at_start_trace.set_at_start(true);
3164 on_success()->Emit(compiler, &at_start_trace);
3165 return;
3166 }
3167 }
3168 break;
3169 case AFTER_NEWLINE:
3170 EmitHat(compiler, on_success(), trace);
3171 return;
Steve Blocka7e24c12009-10-30 11:49:00 +00003172 case AT_BOUNDARY:
Leon Clarkee46be812010-01-19 14:06:41 +00003173 case AT_NON_BOUNDARY: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003174 EmitBoundaryCheck(compiler, trace);
Steve Blocka7e24c12009-10-30 11:49:00 +00003175 return;
Leon Clarkee46be812010-01-19 14:06:41 +00003176 }
Steve Blocka7e24c12009-10-30 11:49:00 +00003177 }
3178 on_success()->Emit(compiler, trace);
3179}
3180
3181
3182static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3183 if (quick_check == NULL) return false;
3184 if (offset >= quick_check->characters()) return false;
3185 return quick_check->positions(offset)->determines_perfectly;
3186}
3187
3188
3189static void UpdateBoundsCheck(int index, int* checked_up_to) {
3190 if (index > *checked_up_to) {
3191 *checked_up_to = index;
3192 }
3193}
3194
3195
3196// We call this repeatedly to generate code for each pass over the text node.
3197// The passes are in increasing order of difficulty because we hope one
3198// of the first passes will fail in which case we are saved the work of the
3199// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3200// we will check the '%' in the first pass, the case independent 'a' in the
3201// second pass and the character class in the last pass.
3202//
3203// The passes are done from right to left, so for example to test for /bar/
3204// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3205// and then a 'b' with offset 0. This means we can avoid the end-of-input
3206// bounds check most of the time. In the example we only need to check for
3207// end-of-input when loading the putative 'r'.
3208//
3209// A slight complication involves the fact that the first character may already
3210// be fetched into a register by the previous node. In this case we want to
3211// do the test for that character first. We do this in separate passes. The
3212// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3213// pass has been performed then subsequent passes will have true in
3214// first_element_checked to indicate that that character does not need to be
3215// checked again.
3216//
3217// In addition to all this we are passed a Trace, which can
3218// contain an AlternativeGeneration object. In this AlternativeGeneration
3219// object we can see details of any quick check that was already passed in
3220// order to get to the code we are now generating. The quick check can involve
3221// loading characters, which means we do not need to recheck the bounds
3222// up to the limit the quick check already checked. In addition the quick
3223// check can have involved a mask and compare operation which may simplify
3224// or obviate the need for further checks at some character positions.
3225void TextNode::TextEmitPass(RegExpCompiler* compiler,
3226 TextEmitPassType pass,
3227 bool preloaded,
3228 Trace* trace,
3229 bool first_element_checked,
3230 int* checked_up_to) {
3231 RegExpMacroAssembler* assembler = compiler->macro_assembler();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003232 Isolate* isolate = assembler->zone()->isolate();
3233 bool one_byte = compiler->one_byte();
Steve Blocka7e24c12009-10-30 11:49:00 +00003234 Label* backtrack = trace->backtrack();
3235 QuickCheckDetails* quick_check = trace->quick_check_performed();
3236 int element_count = elms_->length();
3237 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3238 TextElement elm = elms_->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003239 int cp_offset = trace->cp_offset() + elm.cp_offset();
3240 if (elm.text_type() == TextElement::ATOM) {
3241 Vector<const uc16> quarks = elm.atom()->data();
Steve Blocka7e24c12009-10-30 11:49:00 +00003242 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3243 if (first_element_checked && i == 0 && j == 0) continue;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003244 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
Steve Blocka7e24c12009-10-30 11:49:00 +00003245 EmitCharacterFunction* emit_function = NULL;
3246 switch (pass) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003247 case NON_LATIN1_MATCH:
3248 DCHECK(one_byte);
3249 if (quarks[j] > String::kMaxOneByteCharCode) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003250 assembler->GoTo(backtrack);
3251 return;
3252 }
3253 break;
3254 case NON_LETTER_CHARACTER_MATCH:
3255 emit_function = &EmitAtomNonLetter;
3256 break;
3257 case SIMPLE_CHARACTER_MATCH:
3258 emit_function = &EmitSimpleCharacter;
3259 break;
3260 case CASE_CHARACTER_MATCH:
3261 emit_function = &EmitAtomLetter;
3262 break;
3263 default:
3264 break;
3265 }
3266 if (emit_function != NULL) {
Steve Block44f0eee2011-05-26 01:26:41 +01003267 bool bound_checked = emit_function(isolate,
3268 compiler,
Steve Blocka7e24c12009-10-30 11:49:00 +00003269 quarks[j],
3270 backtrack,
3271 cp_offset + j,
3272 *checked_up_to < cp_offset + j,
3273 preloaded);
3274 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3275 }
3276 }
3277 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003278 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
Steve Blocka7e24c12009-10-30 11:49:00 +00003279 if (pass == CHARACTER_CLASS_MATCH) {
3280 if (first_element_checked && i == 0) continue;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003281 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3282 RegExpCharacterClass* cc = elm.char_class();
3283 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3284 *checked_up_to < cp_offset, preloaded, zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00003285 UpdateBoundsCheck(cp_offset, checked_up_to);
3286 }
3287 }
3288 }
3289}
3290
3291
3292int TextNode::Length() {
3293 TextElement elm = elms_->last();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003294 DCHECK(elm.cp_offset() >= 0);
3295 return elm.cp_offset() + elm.length();
Steve Blocka7e24c12009-10-30 11:49:00 +00003296}
3297
3298
3299bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3300 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3301 if (ignore_case) {
3302 return pass == SIMPLE_CHARACTER_MATCH;
3303 } else {
3304 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3305 }
3306}
3307
3308
3309// This generates the code to match a text node. A text node can contain
3310// straight character sequences (possibly to be matched in a case-independent
3311// way) and character classes. For efficiency we do not do this in a single
3312// pass from left to right. Instead we pass over the text node several times,
3313// emitting code for some character positions every time. See the comment on
3314// TextEmitPass for details.
3315void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3316 LimitResult limit_result = LimitVersions(compiler, trace);
3317 if (limit_result == DONE) return;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003318 DCHECK(limit_result == CONTINUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00003319
3320 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3321 compiler->SetRegExpTooBig();
3322 return;
3323 }
3324
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003325 if (compiler->one_byte()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003326 int dummy = 0;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003327 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
Steve Blocka7e24c12009-10-30 11:49:00 +00003328 }
3329
3330 bool first_elt_done = false;
3331 int bound_checked_to = trace->cp_offset() - 1;
3332 bound_checked_to += trace->bound_checked_up_to();
3333
3334 // If a character is preloaded into the current character register then
3335 // check that now.
3336 if (trace->characters_preloaded() == 1) {
3337 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3338 if (!SkipPass(pass, compiler->ignore_case())) {
3339 TextEmitPass(compiler,
3340 static_cast<TextEmitPassType>(pass),
3341 true,
3342 trace,
3343 false,
3344 &bound_checked_to);
3345 }
3346 }
3347 first_elt_done = true;
3348 }
3349
3350 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3351 if (!SkipPass(pass, compiler->ignore_case())) {
3352 TextEmitPass(compiler,
3353 static_cast<TextEmitPassType>(pass),
3354 false,
3355 trace,
3356 first_elt_done,
3357 &bound_checked_to);
3358 }
3359 }
3360
3361 Trace successor_trace(*trace);
3362 successor_trace.set_at_start(false);
3363 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3364 RecursionCheck rc(compiler);
3365 on_success()->Emit(compiler, &successor_trace);
3366}
3367
3368
3369void Trace::InvalidateCurrentCharacter() {
3370 characters_preloaded_ = 0;
3371}
3372
3373
3374void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003375 DCHECK(by > 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00003376 // We don't have an instruction for shifting the current character register
3377 // down or for using a shifted value for anything so lets just forget that
3378 // we preloaded any characters into it.
3379 characters_preloaded_ = 0;
3380 // Adjust the offsets of the quick check performed information. This
3381 // information is used to find out what we already determined about the
3382 // characters by means of mask and compare.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003383 quick_check_performed_.Advance(by, compiler->one_byte());
Steve Blocka7e24c12009-10-30 11:49:00 +00003384 cp_offset_ += by;
3385 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3386 compiler->SetRegExpTooBig();
3387 cp_offset_ = 0;
3388 }
3389 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3390}
3391
3392
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003393void TextNode::MakeCaseIndependent(bool is_one_byte) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003394 int element_count = elms_->length();
3395 for (int i = 0; i < element_count; i++) {
3396 TextElement elm = elms_->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003397 if (elm.text_type() == TextElement::CHAR_CLASS) {
3398 RegExpCharacterClass* cc = elm.char_class();
Ben Murdoch3ef787d2012-04-12 10:51:47 +01003399 // None of the standard character classes is different in the case
Steve Blockd0582a62009-12-15 09:54:21 +00003400 // independent case and it slows us down if we don't know that.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003401 if (cc->is_standard(zone())) continue;
3402 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00003403 int range_count = ranges->length();
Steve Blockd0582a62009-12-15 09:54:21 +00003404 for (int j = 0; j < range_count; j++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003405 ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00003406 }
3407 }
3408 }
3409}
3410
3411
3412int TextNode::GreedyLoopTextLength() {
3413 TextElement elm = elms_->at(elms_->length() - 1);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003414 return elm.cp_offset() + elm.length();
3415}
3416
3417
3418RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3419 RegExpCompiler* compiler) {
3420 if (elms_->length() != 1) return NULL;
3421 TextElement elm = elms_->at(0);
3422 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3423 RegExpCharacterClass* node = elm.char_class();
3424 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3425 if (!CharacterRange::IsCanonical(ranges)) {
3426 CharacterRange::Canonicalize(ranges);
Steve Blocka7e24c12009-10-30 11:49:00 +00003427 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003428 if (node->is_negated()) {
3429 return ranges->length() == 0 ? on_success() : NULL;
3430 }
3431 if (ranges->length() != 1) return NULL;
3432 uint32_t max_char;
3433 if (compiler->one_byte()) {
3434 max_char = String::kMaxOneByteCharCode;
3435 } else {
3436 max_char = String::kMaxUtf16CodeUnit;
3437 }
3438 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
Steve Blocka7e24c12009-10-30 11:49:00 +00003439}
3440
3441
3442// Finds the fixed match length of a sequence of nodes that goes from
3443// this alternative and back to this choice node. If there are variable
3444// length nodes or other complications in the way then return a sentinel
3445// value indicating that a greedy loop cannot be constructed.
Ben Murdoch589d6972011-11-30 16:04:58 +00003446int ChoiceNode::GreedyLoopTextLengthForAlternative(
3447 GuardedAlternative* alternative) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003448 int length = 0;
3449 RegExpNode* node = alternative->node();
3450 // Later we will generate code for all these text nodes using recursion
3451 // so we have to limit the max number.
3452 int recursion_depth = 0;
3453 while (node != this) {
3454 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3455 return kNodeIsTooComplexForGreedyLoops;
3456 }
3457 int node_length = node->GreedyLoopTextLength();
3458 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3459 return kNodeIsTooComplexForGreedyLoops;
3460 }
3461 length += node_length;
3462 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3463 node = seq_node->on_success();
3464 }
3465 return length;
3466}
3467
3468
3469void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003470 DCHECK_EQ(loop_node_, NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00003471 AddAlternative(alt);
3472 loop_node_ = alt.node();
3473}
3474
3475
3476void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003477 DCHECK_EQ(continue_node_, NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00003478 AddAlternative(alt);
3479 continue_node_ = alt.node();
3480}
3481
3482
3483void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3484 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3485 if (trace->stop_node() == this) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003486 // Back edge of greedy optimized loop node graph.
Ben Murdoch589d6972011-11-30 16:04:58 +00003487 int text_length =
3488 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003489 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
Steve Blocka7e24c12009-10-30 11:49:00 +00003490 // Update the counter-based backtracking info on the stack. This is an
3491 // optimization for greedy loops (see below).
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003492 DCHECK(trace->cp_offset() == text_length);
Steve Blocka7e24c12009-10-30 11:49:00 +00003493 macro_assembler->AdvanceCurrentPosition(text_length);
3494 macro_assembler->GoTo(trace->loop_label());
3495 return;
3496 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003497 DCHECK(trace->stop_node() == NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00003498 if (!trace->is_trivial()) {
3499 trace->Flush(compiler, this);
3500 return;
3501 }
3502 ChoiceNode::Emit(compiler, trace);
3503}
3504
3505
Ben Murdochb0fe1622011-05-05 13:52:32 +01003506int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003507 int eats_at_least) {
3508 int preload_characters = Min(4, eats_at_least);
Steve Blocka7e24c12009-10-30 11:49:00 +00003509 if (compiler->macro_assembler()->CanReadUnaligned()) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003510 bool one_byte = compiler->one_byte();
3511 if (one_byte) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003512 if (preload_characters > 4) preload_characters = 4;
3513 // We can't preload 3 characters because there is no machine instruction
3514 // to do that. We can't just load 4 because we could be reading
3515 // beyond the end of the string, which could cause a memory fault.
3516 if (preload_characters == 3) preload_characters = 2;
3517 } else {
3518 if (preload_characters > 2) preload_characters = 2;
3519 }
3520 } else {
3521 if (preload_characters > 1) preload_characters = 1;
3522 }
3523 return preload_characters;
3524}
3525
3526
3527// This class is used when generating the alternatives in a choice node. It
3528// records the way the alternative is being code generated.
3529class AlternativeGeneration: public Malloced {
3530 public:
3531 AlternativeGeneration()
3532 : possible_success(),
3533 expects_preload(false),
3534 after(),
3535 quick_check_details() { }
3536 Label possible_success;
3537 bool expects_preload;
3538 Label after;
3539 QuickCheckDetails quick_check_details;
3540};
3541
3542
3543// Creates a list of AlternativeGenerations. If the list has a reasonable
3544// size then it is on the stack, otherwise the excess is on the heap.
3545class AlternativeGenerationList {
3546 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003547 AlternativeGenerationList(int count, Zone* zone)
3548 : alt_gens_(count, zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003549 for (int i = 0; i < count && i < kAFew; i++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003550 alt_gens_.Add(a_few_alt_gens_ + i, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00003551 }
3552 for (int i = kAFew; i < count; i++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003553 alt_gens_.Add(new AlternativeGeneration(), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00003554 }
3555 }
3556 ~AlternativeGenerationList() {
3557 for (int i = kAFew; i < alt_gens_.length(); i++) {
3558 delete alt_gens_[i];
3559 alt_gens_[i] = NULL;
3560 }
3561 }
3562
3563 AlternativeGeneration* at(int i) {
3564 return alt_gens_[i];
3565 }
Ben Murdoch3fb3ca82011-12-02 17:19:32 +00003566
Steve Blocka7e24c12009-10-30 11:49:00 +00003567 private:
3568 static const int kAFew = 10;
3569 ZoneList<AlternativeGeneration*> alt_gens_;
3570 AlternativeGeneration a_few_alt_gens_[kAFew];
3571};
3572
3573
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003574// The '2' variant is has inclusive from and exclusive to.
3575// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3576// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3577static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3578 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3579 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3580 0xFEFF, 0xFF00, 0x10000 };
3581static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3582
3583static const int kWordRanges[] = {
3584 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3585static const int kWordRangeCount = arraysize(kWordRanges);
3586static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3587static const int kDigitRangeCount = arraysize(kDigitRanges);
3588static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3589static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3590static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3591 0x2028, 0x202A, 0x10000 };
3592static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3593
3594
3595void BoyerMoorePositionInfo::Set(int character) {
3596 SetInterval(Interval(character, character));
3597}
3598
3599
3600void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3601 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3602 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3603 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3604 surrogate_ =
3605 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3606 if (interval.to() - interval.from() >= kMapSize - 1) {
3607 if (map_count_ != kMapSize) {
3608 map_count_ = kMapSize;
3609 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3610 }
3611 return;
3612 }
3613 for (int i = interval.from(); i <= interval.to(); i++) {
3614 int mod_character = (i & kMask);
3615 if (!map_->at(mod_character)) {
3616 map_count_++;
3617 map_->at(mod_character) = true;
3618 }
3619 if (map_count_ == kMapSize) return;
3620 }
3621}
3622
3623
3624void BoyerMoorePositionInfo::SetAll() {
3625 s_ = w_ = d_ = kLatticeUnknown;
3626 if (map_count_ != kMapSize) {
3627 map_count_ = kMapSize;
3628 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3629 }
3630}
3631
3632
3633BoyerMooreLookahead::BoyerMooreLookahead(
3634 int length, RegExpCompiler* compiler, Zone* zone)
3635 : length_(length),
3636 compiler_(compiler) {
3637 if (compiler->one_byte()) {
3638 max_char_ = String::kMaxOneByteCharCode;
3639 } else {
3640 max_char_ = String::kMaxUtf16CodeUnit;
3641 }
3642 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3643 for (int i = 0; i < length; i++) {
3644 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3645 }
3646}
3647
3648
3649// Find the longest range of lookahead that has the fewest number of different
3650// characters that can occur at a given position. Since we are optimizing two
3651// different parameters at once this is a tradeoff.
3652bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3653 int biggest_points = 0;
3654 // If more than 32 characters out of 128 can occur it is unlikely that we can
3655 // be lucky enough to step forwards much of the time.
3656 const int kMaxMax = 32;
3657 for (int max_number_of_chars = 4;
3658 max_number_of_chars < kMaxMax;
3659 max_number_of_chars *= 2) {
3660 biggest_points =
3661 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3662 }
3663 if (biggest_points == 0) return false;
3664 return true;
3665}
3666
3667
3668// Find the highest-points range between 0 and length_ where the character
3669// information is not too vague. 'Too vague' means that there are more than
3670// max_number_of_chars that can occur at this position. Calculates the number
3671// of points as the product of width-of-the-range and
3672// probability-of-finding-one-of-the-characters, where the probability is
3673// calculated using the frequency distribution of the sample subject string.
3674int BoyerMooreLookahead::FindBestInterval(
3675 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3676 int biggest_points = old_biggest_points;
3677 static const int kSize = RegExpMacroAssembler::kTableSize;
3678 for (int i = 0; i < length_; ) {
3679 while (i < length_ && Count(i) > max_number_of_chars) i++;
3680 if (i == length_) break;
3681 int remembered_from = i;
3682 bool union_map[kSize];
3683 for (int j = 0; j < kSize; j++) union_map[j] = false;
3684 while (i < length_ && Count(i) <= max_number_of_chars) {
3685 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3686 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3687 i++;
3688 }
3689 int frequency = 0;
3690 for (int j = 0; j < kSize; j++) {
3691 if (union_map[j]) {
3692 // Add 1 to the frequency to give a small per-character boost for
3693 // the cases where our sampling is not good enough and many
3694 // characters have a frequency of zero. This means the frequency
3695 // can theoretically be up to 2*kSize though we treat it mostly as
3696 // a fraction of kSize.
3697 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3698 }
3699 }
3700 // We use the probability of skipping times the distance we are skipping to
3701 // judge the effectiveness of this. Actually we have a cut-off: By
3702 // dividing by 2 we switch off the skipping if the probability of skipping
3703 // is less than 50%. This is because the multibyte mask-and-compare
3704 // skipping in quickcheck is more likely to do well on this case.
3705 bool in_quickcheck_range =
3706 ((i - remembered_from < 4) ||
3707 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3708 // Called 'probability' but it is only a rough estimate and can actually
3709 // be outside the 0-kSize range.
3710 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3711 int points = (i - remembered_from) * probability;
3712 if (points > biggest_points) {
3713 *from = remembered_from;
3714 *to = i - 1;
3715 biggest_points = points;
3716 }
3717 }
3718 return biggest_points;
3719}
3720
3721
3722// Take all the characters that will not prevent a successful match if they
3723// occur in the subject string in the range between min_lookahead and
3724// max_lookahead (inclusive) measured from the current position. If the
3725// character at max_lookahead offset is not one of these characters, then we
3726// can safely skip forwards by the number of characters in the range.
3727int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3728 int max_lookahead,
3729 Handle<ByteArray> boolean_skip_table) {
3730 const int kSize = RegExpMacroAssembler::kTableSize;
3731
3732 const int kSkipArrayEntry = 0;
3733 const int kDontSkipArrayEntry = 1;
3734
3735 for (int i = 0; i < kSize; i++) {
3736 boolean_skip_table->set(i, kSkipArrayEntry);
3737 }
3738 int skip = max_lookahead + 1 - min_lookahead;
3739
3740 for (int i = max_lookahead; i >= min_lookahead; i--) {
3741 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3742 for (int j = 0; j < kSize; j++) {
3743 if (map->at(j)) {
3744 boolean_skip_table->set(j, kDontSkipArrayEntry);
3745 }
3746 }
3747 }
3748
3749 return skip;
3750}
3751
3752
3753// See comment above on the implementation of GetSkipTable.
3754void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3755 const int kSize = RegExpMacroAssembler::kTableSize;
3756
3757 int min_lookahead = 0;
3758 int max_lookahead = 0;
3759
3760 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3761
3762 bool found_single_character = false;
3763 int single_character = 0;
3764 for (int i = max_lookahead; i >= min_lookahead; i--) {
3765 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3766 if (map->map_count() > 1 ||
3767 (found_single_character && map->map_count() != 0)) {
3768 found_single_character = false;
3769 break;
3770 }
3771 for (int j = 0; j < kSize; j++) {
3772 if (map->at(j)) {
3773 found_single_character = true;
3774 single_character = j;
3775 break;
3776 }
3777 }
3778 }
3779
3780 int lookahead_width = max_lookahead + 1 - min_lookahead;
3781
3782 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3783 // The mask-compare can probably handle this better.
3784 return;
3785 }
3786
3787 if (found_single_character) {
3788 Label cont, again;
3789 masm->Bind(&again);
3790 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3791 if (max_char_ > kSize) {
3792 masm->CheckCharacterAfterAnd(single_character,
3793 RegExpMacroAssembler::kTableMask,
3794 &cont);
3795 } else {
3796 masm->CheckCharacter(single_character, &cont);
3797 }
3798 masm->AdvanceCurrentPosition(lookahead_width);
3799 masm->GoTo(&again);
3800 masm->Bind(&cont);
3801 return;
3802 }
3803
3804 Factory* factory = masm->zone()->isolate()->factory();
3805 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3806 int skip_distance = GetSkipTable(
3807 min_lookahead, max_lookahead, boolean_skip_table);
3808 DCHECK(skip_distance != 0);
3809
3810 Label cont, again;
3811 masm->Bind(&again);
3812 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3813 masm->CheckBitInTable(boolean_skip_table, &cont);
3814 masm->AdvanceCurrentPosition(skip_distance);
3815 masm->GoTo(&again);
3816 masm->Bind(&cont);
3817}
3818
3819
Steve Blocka7e24c12009-10-30 11:49:00 +00003820/* Code generation for choice nodes.
3821 *
3822 * We generate quick checks that do a mask and compare to eliminate a
3823 * choice. If the quick check succeeds then it jumps to the continuation to
3824 * do slow checks and check subsequent nodes. If it fails (the common case)
3825 * it falls through to the next choice.
3826 *
3827 * Here is the desired flow graph. Nodes directly below each other imply
3828 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3829 * 3 doesn't have a quick check so we have to call the slow check.
3830 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3831 * regexp continuation is generated directly after the Sn node, up to the
3832 * next GoTo if we decide to reuse some already generated code. Some
3833 * nodes expect preload_characters to be preloaded into the current
3834 * character register. R nodes do this preloading. Vertices are marked
3835 * F for failures and S for success (possible success in the case of quick
3836 * nodes). L, V, < and > are used as arrow heads.
3837 *
3838 * ----------> R
3839 * |
3840 * V
3841 * Q1 -----> S1
3842 * | S /
3843 * F| /
3844 * | F/
3845 * | /
3846 * | R
3847 * | /
3848 * V L
3849 * Q2 -----> S2
3850 * | S /
3851 * F| /
3852 * | F/
3853 * | /
3854 * | R
3855 * | /
3856 * V L
3857 * S3
3858 * |
3859 * F|
3860 * |
3861 * R
3862 * |
3863 * backtrack V
3864 * <----------Q4
3865 * \ F |
3866 * \ |S
3867 * \ F V
3868 * \-----S4
3869 *
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003870 * For greedy loops we push the current position, then generate the code that
3871 * eats the input specially in EmitGreedyLoop. The other choice (the
3872 * continuation) is generated by the normal code in EmitChoices, and steps back
3873 * in the input to the starting position when it fails to match. The loop code
3874 * looks like this (U is the unwind code that steps back in the greedy loop).
3875 *
Steve Blocka7e24c12009-10-30 11:49:00 +00003876 * _____
3877 * / \
3878 * V |
3879 * ----------> S1 |
3880 * /| |
3881 * / |S |
3882 * F/ \_____/
3883 * /
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003884 * |<-----
3885 * | \
3886 * V |S
3887 * Q2 ---> U----->backtrack
3888 * | F /
3889 * S| /
3890 * V F /
3891 * S2--/
Steve Blocka7e24c12009-10-30 11:49:00 +00003892 */
3893
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003894GreedyLoopState::GreedyLoopState(bool not_at_start) {
3895 counter_backtrack_trace_.set_backtrack(&label_);
3896 if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3897}
Steve Blocka7e24c12009-10-30 11:49:00 +00003898
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003899
3900void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
Steve Blocka7e24c12009-10-30 11:49:00 +00003901#ifdef DEBUG
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003902 int choice_count = alternatives_->length();
Steve Blocka7e24c12009-10-30 11:49:00 +00003903 for (int i = 0; i < choice_count - 1; i++) {
3904 GuardedAlternative alternative = alternatives_->at(i);
3905 ZoneList<Guard*>* guards = alternative.guards();
3906 int guard_count = (guards == NULL) ? 0 : guards->length();
3907 for (int j = 0; j < guard_count; j++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003908 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
Steve Blocka7e24c12009-10-30 11:49:00 +00003909 }
3910 }
3911#endif
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003912}
3913
3914
3915void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3916 Trace* current_trace,
3917 PreloadState* state) {
3918 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3919 // Save some time by looking at most one machine word ahead.
3920 state->eats_at_least_ =
3921 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3922 current_trace->at_start() == Trace::FALSE_VALUE);
3923 }
3924 state->preload_characters_ =
3925 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3926
3927 state->preload_is_current_ =
3928 (current_trace->characters_preloaded() == state->preload_characters_);
3929 state->preload_has_checked_bounds_ = state->preload_is_current_;
3930}
3931
3932
3933void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3934 int choice_count = alternatives_->length();
3935
3936 AssertGuardsMentionRegisters(trace);
Steve Blocka7e24c12009-10-30 11:49:00 +00003937
3938 LimitResult limit_result = LimitVersions(compiler, trace);
3939 if (limit_result == DONE) return;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003940 DCHECK(limit_result == CONTINUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00003941
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003942 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3943 // other choice nodes we only flush if we are out of code size budget.
Steve Blocka7e24c12009-10-30 11:49:00 +00003944 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3945 trace->Flush(compiler, this);
3946 return;
3947 }
3948
3949 RecursionCheck rc(compiler);
3950
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003951 PreloadState preload;
3952 preload.init();
3953 GreedyLoopState greedy_loop_state(not_at_start());
Steve Blocka7e24c12009-10-30 11:49:00 +00003954
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003955 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3956 AlternativeGenerationList alt_gens(choice_count, zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00003957
3958 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003959 trace = EmitGreedyLoop(compiler,
3960 trace,
3961 &alt_gens,
3962 &preload,
3963 &greedy_loop_state,
3964 text_length);
3965 } else {
3966 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3967 // match the traces produced pre-cleanup.
3968 Label second_choice;
3969 compiler->macro_assembler()->Bind(&second_choice);
3970
3971 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3972
3973 EmitChoices(compiler,
3974 &alt_gens,
3975 0,
3976 trace,
3977 &preload);
Steve Blocka7e24c12009-10-30 11:49:00 +00003978 }
3979
Ben Murdochb8a8cc12014-11-26 15:28:44 +00003980 // At this point we need to generate slow checks for the alternatives where
3981 // the quick check was inlined. We can recognize these because the associated
3982 // label was bound.
3983 int new_flush_budget = trace->flush_budget() / choice_count;
3984 for (int i = 0; i < choice_count; i++) {
3985 AlternativeGeneration* alt_gen = alt_gens.at(i);
3986 Trace new_trace(*trace);
3987 // If there are actions to be flushed we have to limit how many times
3988 // they are flushed. Take the budget of the parent trace and distribute
3989 // it fairly amongst the children.
3990 if (new_trace.actions() != NULL) {
3991 new_trace.set_flush_budget(new_flush_budget);
3992 }
3993 bool next_expects_preload =
3994 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3995 EmitOutOfLineContinuation(compiler,
3996 &new_trace,
3997 alternatives_->at(i),
3998 alt_gen,
3999 preload.preload_characters_,
4000 next_expects_preload);
4001 }
4002}
4003
4004
4005Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4006 Trace* trace,
4007 AlternativeGenerationList* alt_gens,
4008 PreloadState* preload,
4009 GreedyLoopState* greedy_loop_state,
4010 int text_length) {
4011 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4012 // Here we have special handling for greedy loops containing only text nodes
4013 // and other simple nodes. These are handled by pushing the current
4014 // position on the stack and then incrementing the current position each
4015 // time around the switch. On backtrack we decrement the current position
4016 // and check it against the pushed value. This avoids pushing backtrack
4017 // information for each iteration of the loop, which could take up a lot of
4018 // space.
4019 DCHECK(trace->stop_node() == NULL);
4020 macro_assembler->PushCurrentPosition();
4021 Label greedy_match_failed;
4022 Trace greedy_match_trace;
4023 if (not_at_start()) greedy_match_trace.set_at_start(false);
4024 greedy_match_trace.set_backtrack(&greedy_match_failed);
4025 Label loop_label;
4026 macro_assembler->Bind(&loop_label);
4027 greedy_match_trace.set_stop_node(this);
4028 greedy_match_trace.set_loop_label(&loop_label);
4029 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4030 macro_assembler->Bind(&greedy_match_failed);
4031
Steve Blocka7e24c12009-10-30 11:49:00 +00004032 Label second_choice; // For use in greedy matches.
4033 macro_assembler->Bind(&second_choice);
4034
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004035 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
Steve Blocka7e24c12009-10-30 11:49:00 +00004036
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004037 EmitChoices(compiler,
4038 alt_gens,
4039 1,
4040 new_trace,
4041 preload);
Steve Blocka7e24c12009-10-30 11:49:00 +00004042
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004043 macro_assembler->Bind(greedy_loop_state->label());
4044 // If we have unwound to the bottom then backtrack.
4045 macro_assembler->CheckGreedyLoop(trace->backtrack());
4046 // Otherwise try the second priority at an earlier position.
4047 macro_assembler->AdvanceCurrentPosition(-text_length);
4048 macro_assembler->GoTo(&second_choice);
4049 return new_trace;
4050}
4051
4052int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4053 Trace* trace) {
4054 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4055 if (alternatives_->length() != 2) return eats_at_least;
4056
4057 GuardedAlternative alt1 = alternatives_->at(1);
4058 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4059 return eats_at_least;
4060 }
4061 RegExpNode* eats_anything_node = alt1.node();
4062 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4063 return eats_at_least;
4064 }
4065
4066 // Really we should be creating a new trace when we execute this function,
4067 // but there is no need, because the code it generates cannot backtrack, and
4068 // we always arrive here with a trivial trace (since it's the entry to a
4069 // loop. That also implies that there are no preloaded characters, which is
4070 // good, because it means we won't be violating any assumptions by
4071 // overwriting those characters with new load instructions.
4072 DCHECK(trace->is_trivial());
4073
4074 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4075 // At this point we know that we are at a non-greedy loop that will eat
4076 // any character one at a time. Any non-anchored regexp has such a
4077 // loop prepended to it in order to find where it starts. We look for
4078 // a pattern of the form ...abc... where we can look 6 characters ahead
4079 // and step forwards 3 if the character is not one of abc. Abc need
4080 // not be atoms, they can be any reasonably limited character class or
4081 // small alternation.
4082 BoyerMooreLookahead* bm = bm_info(false);
4083 if (bm == NULL) {
4084 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4085 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4086 kRecursionBudget,
4087 false));
4088 if (eats_at_least >= 1) {
4089 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4090 compiler,
4091 zone());
4092 GuardedAlternative alt0 = alternatives_->at(0);
4093 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
4094 }
4095 }
4096 if (bm != NULL) {
4097 bm->EmitSkipInstructions(macro_assembler);
4098 }
4099 return eats_at_least;
4100}
4101
4102
4103void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4104 AlternativeGenerationList* alt_gens,
4105 int first_choice,
4106 Trace* trace,
4107 PreloadState* preload) {
4108 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4109 SetUpPreLoad(compiler, trace, preload);
Steve Blocka7e24c12009-10-30 11:49:00 +00004110
4111 // For now we just call all choices one after the other. The idea ultimately
4112 // is to use the Dispatch table to try only the relevant ones.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004113 int choice_count = alternatives_->length();
4114
4115 int new_flush_budget = trace->flush_budget() / choice_count;
4116
4117 for (int i = first_choice; i < choice_count; i++) {
4118 bool is_last = i == choice_count - 1;
4119 bool fall_through_on_failure = !is_last;
Steve Blocka7e24c12009-10-30 11:49:00 +00004120 GuardedAlternative alternative = alternatives_->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004121 AlternativeGeneration* alt_gen = alt_gens->at(i);
4122 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004123 ZoneList<Guard*>* guards = alternative.guards();
4124 int guard_count = (guards == NULL) ? 0 : guards->length();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004125 Trace new_trace(*trace);
4126 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4127 preload->preload_characters_ :
Steve Blocka7e24c12009-10-30 11:49:00 +00004128 0);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004129 if (preload->preload_has_checked_bounds_) {
4130 new_trace.set_bound_checked_up_to(preload->preload_characters_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004131 }
4132 new_trace.quick_check_performed()->Clear();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004133 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4134 if (!is_last) {
4135 new_trace.set_backtrack(&alt_gen->after);
4136 }
4137 alt_gen->expects_preload = preload->preload_is_current_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004138 bool generate_full_check_inline = false;
4139 if (FLAG_regexp_optimization &&
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004140 try_to_emit_quick_check_for_alternative(i == 0) &&
Steve Blocka7e24c12009-10-30 11:49:00 +00004141 alternative.node()->EmitQuickCheck(compiler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004142 trace,
Steve Blocka7e24c12009-10-30 11:49:00 +00004143 &new_trace,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004144 preload->preload_has_checked_bounds_,
Steve Blocka7e24c12009-10-30 11:49:00 +00004145 &alt_gen->possible_success,
4146 &alt_gen->quick_check_details,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004147 fall_through_on_failure)) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004148 // Quick check was generated for this choice.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004149 preload->preload_is_current_ = true;
4150 preload->preload_has_checked_bounds_ = true;
4151 // If we generated the quick check to fall through on possible success,
4152 // we now need to generate the full check inline.
4153 if (!fall_through_on_failure) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004154 macro_assembler->Bind(&alt_gen->possible_success);
4155 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004156 new_trace.set_characters_preloaded(preload->preload_characters_);
4157 new_trace.set_bound_checked_up_to(preload->preload_characters_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004158 generate_full_check_inline = true;
4159 }
4160 } else if (alt_gen->quick_check_details.cannot_match()) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004161 if (!fall_through_on_failure) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004162 macro_assembler->GoTo(trace->backtrack());
4163 }
4164 continue;
4165 } else {
4166 // No quick check was generated. Put the full code here.
4167 // If this is not the first choice then there could be slow checks from
4168 // previous cases that go here when they fail. There's no reason to
4169 // insist that they preload characters since the slow check we are about
4170 // to generate probably can't use it.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004171 if (i != first_choice) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004172 alt_gen->expects_preload = false;
Leon Clarkee46be812010-01-19 14:06:41 +00004173 new_trace.InvalidateCurrentCharacter();
Steve Blocka7e24c12009-10-30 11:49:00 +00004174 }
Steve Blocka7e24c12009-10-30 11:49:00 +00004175 generate_full_check_inline = true;
4176 }
4177 if (generate_full_check_inline) {
4178 if (new_trace.actions() != NULL) {
4179 new_trace.set_flush_budget(new_flush_budget);
4180 }
4181 for (int j = 0; j < guard_count; j++) {
4182 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4183 }
4184 alternative.node()->Emit(compiler, &new_trace);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004185 preload->preload_is_current_ = false;
Steve Blocka7e24c12009-10-30 11:49:00 +00004186 }
4187 macro_assembler->Bind(&alt_gen->after);
4188 }
Steve Blocka7e24c12009-10-30 11:49:00 +00004189}
4190
4191
4192void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4193 Trace* trace,
4194 GuardedAlternative alternative,
4195 AlternativeGeneration* alt_gen,
4196 int preload_characters,
4197 bool next_expects_preload) {
4198 if (!alt_gen->possible_success.is_linked()) return;
4199
4200 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4201 macro_assembler->Bind(&alt_gen->possible_success);
4202 Trace out_of_line_trace(*trace);
4203 out_of_line_trace.set_characters_preloaded(preload_characters);
4204 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004205 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00004206 ZoneList<Guard*>* guards = alternative.guards();
4207 int guard_count = (guards == NULL) ? 0 : guards->length();
4208 if (next_expects_preload) {
4209 Label reload_current_char;
4210 out_of_line_trace.set_backtrack(&reload_current_char);
4211 for (int j = 0; j < guard_count; j++) {
4212 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4213 }
4214 alternative.node()->Emit(compiler, &out_of_line_trace);
4215 macro_assembler->Bind(&reload_current_char);
4216 // Reload the current character, since the next quick check expects that.
4217 // We don't need to check bounds here because we only get into this
4218 // code through a quick check which already did the checked load.
4219 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4220 NULL,
4221 false,
4222 preload_characters);
4223 macro_assembler->GoTo(&(alt_gen->after));
4224 } else {
4225 out_of_line_trace.set_backtrack(&(alt_gen->after));
4226 for (int j = 0; j < guard_count; j++) {
4227 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4228 }
4229 alternative.node()->Emit(compiler, &out_of_line_trace);
4230 }
4231}
4232
4233
4234void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4235 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4236 LimitResult limit_result = LimitVersions(compiler, trace);
4237 if (limit_result == DONE) return;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004238 DCHECK(limit_result == CONTINUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00004239
4240 RecursionCheck rc(compiler);
4241
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004242 switch (action_type_) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004243 case STORE_POSITION: {
4244 Trace::DeferredCapture
4245 new_capture(data_.u_position_register.reg,
4246 data_.u_position_register.is_capture,
4247 trace);
4248 Trace new_trace = *trace;
4249 new_trace.add_action(&new_capture);
4250 on_success()->Emit(compiler, &new_trace);
4251 break;
4252 }
4253 case INCREMENT_REGISTER: {
4254 Trace::DeferredIncrementRegister
4255 new_increment(data_.u_increment_register.reg);
4256 Trace new_trace = *trace;
4257 new_trace.add_action(&new_increment);
4258 on_success()->Emit(compiler, &new_trace);
4259 break;
4260 }
4261 case SET_REGISTER: {
4262 Trace::DeferredSetRegister
4263 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4264 Trace new_trace = *trace;
4265 new_trace.add_action(&new_set);
4266 on_success()->Emit(compiler, &new_trace);
4267 break;
4268 }
4269 case CLEAR_CAPTURES: {
4270 Trace::DeferredClearCaptures
4271 new_capture(Interval(data_.u_clear_captures.range_from,
4272 data_.u_clear_captures.range_to));
4273 Trace new_trace = *trace;
4274 new_trace.add_action(&new_capture);
4275 on_success()->Emit(compiler, &new_trace);
4276 break;
4277 }
4278 case BEGIN_SUBMATCH:
4279 if (!trace->is_trivial()) {
4280 trace->Flush(compiler, this);
4281 } else {
4282 assembler->WriteCurrentPositionToRegister(
4283 data_.u_submatch.current_position_register, 0);
4284 assembler->WriteStackPointerToRegister(
4285 data_.u_submatch.stack_pointer_register);
4286 on_success()->Emit(compiler, trace);
4287 }
4288 break;
4289 case EMPTY_MATCH_CHECK: {
4290 int start_pos_reg = data_.u_empty_match_check.start_register;
4291 int stored_pos = 0;
4292 int rep_reg = data_.u_empty_match_check.repetition_register;
4293 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4294 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4295 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4296 // If we know we haven't advanced and there is no minimum we
4297 // can just backtrack immediately.
4298 assembler->GoTo(trace->backtrack());
4299 } else if (know_dist && stored_pos < trace->cp_offset()) {
4300 // If we know we've advanced we can generate the continuation
4301 // immediately.
4302 on_success()->Emit(compiler, trace);
4303 } else if (!trace->is_trivial()) {
4304 trace->Flush(compiler, this);
4305 } else {
4306 Label skip_empty_check;
4307 // If we have a minimum number of repetitions we check the current
4308 // number first and skip the empty check if it's not enough.
4309 if (has_minimum) {
4310 int limit = data_.u_empty_match_check.repetition_limit;
4311 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4312 }
4313 // If the match is empty we bail out, otherwise we fall through
4314 // to the on-success continuation.
4315 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4316 trace->backtrack());
4317 assembler->Bind(&skip_empty_check);
4318 on_success()->Emit(compiler, trace);
4319 }
4320 break;
4321 }
4322 case POSITIVE_SUBMATCH_SUCCESS: {
4323 if (!trace->is_trivial()) {
4324 trace->Flush(compiler, this);
4325 return;
4326 }
4327 assembler->ReadCurrentPositionFromRegister(
4328 data_.u_submatch.current_position_register);
4329 assembler->ReadStackPointerFromRegister(
4330 data_.u_submatch.stack_pointer_register);
4331 int clear_register_count = data_.u_submatch.clear_register_count;
4332 if (clear_register_count == 0) {
4333 on_success()->Emit(compiler, trace);
4334 return;
4335 }
4336 int clear_registers_from = data_.u_submatch.clear_register_from;
4337 Label clear_registers_backtrack;
4338 Trace new_trace = *trace;
4339 new_trace.set_backtrack(&clear_registers_backtrack);
4340 on_success()->Emit(compiler, &new_trace);
4341
4342 assembler->Bind(&clear_registers_backtrack);
4343 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4344 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4345
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004346 DCHECK(trace->backtrack() == NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00004347 assembler->Backtrack();
4348 return;
4349 }
4350 default:
4351 UNREACHABLE();
4352 }
4353}
4354
4355
4356void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4357 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4358 if (!trace->is_trivial()) {
4359 trace->Flush(compiler, this);
4360 return;
4361 }
4362
4363 LimitResult limit_result = LimitVersions(compiler, trace);
4364 if (limit_result == DONE) return;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004365 DCHECK(limit_result == CONTINUE);
Steve Blocka7e24c12009-10-30 11:49:00 +00004366
4367 RecursionCheck rc(compiler);
4368
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004369 DCHECK_EQ(start_reg_ + 1, end_reg_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004370 if (compiler->ignore_case()) {
4371 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4372 trace->backtrack());
4373 } else {
4374 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4375 }
4376 on_success()->Emit(compiler, trace);
4377}
4378
4379
4380// -------------------------------------------------------------------
4381// Dot/dotty output
4382
4383
4384#ifdef DEBUG
4385
4386
4387class DotPrinter: public NodeVisitor {
4388 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004389 DotPrinter(OStream& os, bool ignore_case) // NOLINT
4390 : os_(os),
4391 ignore_case_(ignore_case) {}
Steve Blocka7e24c12009-10-30 11:49:00 +00004392 void PrintNode(const char* label, RegExpNode* node);
4393 void Visit(RegExpNode* node);
4394 void PrintAttributes(RegExpNode* from);
Steve Blocka7e24c12009-10-30 11:49:00 +00004395 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4396#define DECLARE_VISIT(Type) \
4397 virtual void Visit##Type(Type##Node* that);
4398FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4399#undef DECLARE_VISIT
4400 private:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004401 OStream& os_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004402 bool ignore_case_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004403};
4404
4405
4406void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004407 os_ << "digraph G {\n graph [label=\"";
Steve Blocka7e24c12009-10-30 11:49:00 +00004408 for (int i = 0; label[i]; i++) {
4409 switch (label[i]) {
4410 case '\\':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004411 os_ << "\\\\";
Steve Blocka7e24c12009-10-30 11:49:00 +00004412 break;
4413 case '"':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004414 os_ << "\"";
Steve Blocka7e24c12009-10-30 11:49:00 +00004415 break;
4416 default:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004417 os_ << label[i];
Steve Blocka7e24c12009-10-30 11:49:00 +00004418 break;
4419 }
4420 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004421 os_ << "\"];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004422 Visit(node);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004423 os_ << "}" << endl;
Steve Blocka7e24c12009-10-30 11:49:00 +00004424}
4425
4426
4427void DotPrinter::Visit(RegExpNode* node) {
4428 if (node->info()->visited) return;
4429 node->info()->visited = true;
4430 node->Accept(this);
4431}
4432
4433
4434void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004435 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004436 Visit(on_failure);
4437}
4438
4439
4440class TableEntryBodyPrinter {
4441 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004442 TableEntryBodyPrinter(OStream& os, ChoiceNode* choice) // NOLINT
4443 : os_(os),
4444 choice_(choice) {}
Steve Blocka7e24c12009-10-30 11:49:00 +00004445 void Call(uc16 from, DispatchTable::Entry entry) {
4446 OutSet* out_set = entry.out_set();
4447 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4448 if (out_set->Get(i)) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004449 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4450 << choice()->alternatives()->at(i).node() << ";\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004451 }
4452 }
4453 }
4454 private:
Steve Blocka7e24c12009-10-30 11:49:00 +00004455 ChoiceNode* choice() { return choice_; }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004456 OStream& os_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004457 ChoiceNode* choice_;
4458};
4459
4460
4461class TableEntryHeaderPrinter {
4462 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004463 explicit TableEntryHeaderPrinter(OStream& os) // NOLINT
4464 : first_(true),
4465 os_(os) {}
Steve Blocka7e24c12009-10-30 11:49:00 +00004466 void Call(uc16 from, DispatchTable::Entry entry) {
4467 if (first_) {
4468 first_ = false;
4469 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004470 os_ << "|";
Steve Blocka7e24c12009-10-30 11:49:00 +00004471 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004472 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
Steve Blocka7e24c12009-10-30 11:49:00 +00004473 OutSet* out_set = entry.out_set();
4474 int priority = 0;
4475 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4476 if (out_set->Get(i)) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004477 if (priority > 0) os_ << "|";
4478 os_ << "<s" << from << "o" << i << "> " << priority;
Steve Blocka7e24c12009-10-30 11:49:00 +00004479 priority++;
4480 }
4481 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004482 os_ << "}}";
Steve Blocka7e24c12009-10-30 11:49:00 +00004483 }
Ben Murdoch3fb3ca82011-12-02 17:19:32 +00004484
Steve Blocka7e24c12009-10-30 11:49:00 +00004485 private:
4486 bool first_;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004487 OStream& os_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004488};
4489
4490
4491class AttributePrinter {
4492 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004493 explicit AttributePrinter(OStream& os) // NOLINT
4494 : os_(os),
4495 first_(true) {}
Steve Blocka7e24c12009-10-30 11:49:00 +00004496 void PrintSeparator() {
4497 if (first_) {
4498 first_ = false;
4499 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004500 os_ << "|";
Steve Blocka7e24c12009-10-30 11:49:00 +00004501 }
4502 }
4503 void PrintBit(const char* name, bool value) {
4504 if (!value) return;
4505 PrintSeparator();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004506 os_ << "{" << name << "}";
Steve Blocka7e24c12009-10-30 11:49:00 +00004507 }
4508 void PrintPositive(const char* name, int value) {
4509 if (value < 0) return;
4510 PrintSeparator();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004511 os_ << "{" << name << "|" << value << "}";
Steve Blocka7e24c12009-10-30 11:49:00 +00004512 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004513
Steve Blocka7e24c12009-10-30 11:49:00 +00004514 private:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004515 OStream& os_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004516 bool first_;
4517};
4518
4519
4520void DotPrinter::PrintAttributes(RegExpNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004521 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4522 << "margin=0.1, fontsize=10, label=\"{";
4523 AttributePrinter printer(os_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004524 NodeInfo* info = that->info();
4525 printer.PrintBit("NI", info->follows_newline_interest);
4526 printer.PrintBit("WI", info->follows_word_interest);
4527 printer.PrintBit("SI", info->follows_start_interest);
4528 Label* label = that->label();
4529 if (label->is_bound())
4530 printer.PrintPositive("@", label->pos());
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004531 os_ << "}\"];\n"
4532 << " a" << that << " -> n" << that
4533 << " [style=dashed, color=grey, arrowhead=none];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004534}
4535
4536
4537static const bool kPrintDispatchTable = false;
4538void DotPrinter::VisitChoice(ChoiceNode* that) {
4539 if (kPrintDispatchTable) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004540 os_ << " n" << that << " [shape=Mrecord, label=\"";
4541 TableEntryHeaderPrinter header_printer(os_);
Steve Blocka7e24c12009-10-30 11:49:00 +00004542 that->GetTable(ignore_case_)->ForEach(&header_printer);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004543 os_ << "\"]\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004544 PrintAttributes(that);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004545 TableEntryBodyPrinter body_printer(os_, that);
Steve Blocka7e24c12009-10-30 11:49:00 +00004546 that->GetTable(ignore_case_)->ForEach(&body_printer);
4547 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004548 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004549 for (int i = 0; i < that->alternatives()->length(); i++) {
4550 GuardedAlternative alt = that->alternatives()->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004551 os_ << " n" << that << " -> n" << alt.node();
Steve Blocka7e24c12009-10-30 11:49:00 +00004552 }
4553 }
4554 for (int i = 0; i < that->alternatives()->length(); i++) {
4555 GuardedAlternative alt = that->alternatives()->at(i);
4556 alt.node()->Accept(this);
4557 }
4558}
4559
4560
4561void DotPrinter::VisitText(TextNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004562 Zone* zone = that->zone();
4563 os_ << " n" << that << " [label=\"";
Steve Blocka7e24c12009-10-30 11:49:00 +00004564 for (int i = 0; i < that->elements()->length(); i++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004565 if (i > 0) os_ << " ";
Steve Blocka7e24c12009-10-30 11:49:00 +00004566 TextElement elm = that->elements()->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004567 switch (elm.text_type()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004568 case TextElement::ATOM: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004569 Vector<const uc16> data = elm.atom()->data();
4570 for (int i = 0; i < data.length(); i++) {
4571 os_ << static_cast<char>(data[i]);
4572 }
Steve Blocka7e24c12009-10-30 11:49:00 +00004573 break;
4574 }
4575 case TextElement::CHAR_CLASS: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004576 RegExpCharacterClass* node = elm.char_class();
4577 os_ << "[";
4578 if (node->is_negated()) os_ << "^";
4579 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4580 CharacterRange range = node->ranges(zone)->at(j);
4581 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
Steve Blocka7e24c12009-10-30 11:49:00 +00004582 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004583 os_ << "]";
Steve Blocka7e24c12009-10-30 11:49:00 +00004584 break;
4585 }
4586 default:
4587 UNREACHABLE();
4588 }
4589 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004590 os_ << "\", shape=box, peripheries=2];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004591 PrintAttributes(that);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004592 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004593 Visit(that->on_success());
4594}
4595
4596
4597void DotPrinter::VisitBackReference(BackReferenceNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004598 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4599 << that->end_register() << "\", shape=doubleoctagon];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004600 PrintAttributes(that);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004601 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004602 Visit(that->on_success());
4603}
4604
4605
4606void DotPrinter::VisitEnd(EndNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004607 os_ << " n" << that << " [style=bold, shape=point];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004608 PrintAttributes(that);
4609}
4610
4611
4612void DotPrinter::VisitAssertion(AssertionNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004613 os_ << " n" << that << " [";
4614 switch (that->assertion_type()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004615 case AssertionNode::AT_END:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004616 os_ << "label=\"$\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004617 break;
4618 case AssertionNode::AT_START:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004619 os_ << "label=\"^\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004620 break;
4621 case AssertionNode::AT_BOUNDARY:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004622 os_ << "label=\"\\b\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004623 break;
4624 case AssertionNode::AT_NON_BOUNDARY:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004625 os_ << "label=\"\\B\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004626 break;
4627 case AssertionNode::AFTER_NEWLINE:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004628 os_ << "label=\"(?<=\\n)\", shape=septagon";
Leon Clarkee46be812010-01-19 14:06:41 +00004629 break;
Steve Blocka7e24c12009-10-30 11:49:00 +00004630 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004631 os_ << "];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004632 PrintAttributes(that);
4633 RegExpNode* successor = that->on_success();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004634 os_ << " n" << that << " -> n" << successor << ";\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004635 Visit(successor);
4636}
4637
4638
4639void DotPrinter::VisitAction(ActionNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004640 os_ << " n" << that << " [";
4641 switch (that->action_type_) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004642 case ActionNode::SET_REGISTER:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004643 os_ << "label=\"$" << that->data_.u_store_register.reg
4644 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004645 break;
4646 case ActionNode::INCREMENT_REGISTER:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004647 os_ << "label=\"$" << that->data_.u_increment_register.reg
4648 << "++\", shape=octagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004649 break;
4650 case ActionNode::STORE_POSITION:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004651 os_ << "label=\"$" << that->data_.u_position_register.reg
4652 << ":=$pos\", shape=octagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004653 break;
4654 case ActionNode::BEGIN_SUBMATCH:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004655 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4656 << ":=$pos,begin\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004657 break;
4658 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004659 os_ << "label=\"escape\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004660 break;
4661 case ActionNode::EMPTY_MATCH_CHECK:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004662 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4663 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4664 << "<" << that->data_.u_empty_match_check.repetition_limit
4665 << "?\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004666 break;
4667 case ActionNode::CLEAR_CAPTURES: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004668 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4669 << " to $" << that->data_.u_clear_captures.range_to
4670 << "\", shape=septagon";
Steve Blocka7e24c12009-10-30 11:49:00 +00004671 break;
4672 }
4673 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004674 os_ << "];\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004675 PrintAttributes(that);
4676 RegExpNode* successor = that->on_success();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004677 os_ << " n" << that << " -> n" << successor << ";\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004678 Visit(successor);
4679}
4680
4681
4682class DispatchTableDumper {
4683 public:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004684 explicit DispatchTableDumper(OStream& os) : os_(os) {}
Steve Blocka7e24c12009-10-30 11:49:00 +00004685 void Call(uc16 key, DispatchTable::Entry entry);
Steve Blocka7e24c12009-10-30 11:49:00 +00004686 private:
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004687 OStream& os_;
Steve Blocka7e24c12009-10-30 11:49:00 +00004688};
4689
4690
4691void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004692 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
Steve Blocka7e24c12009-10-30 11:49:00 +00004693 OutSet* set = entry.out_set();
4694 bool first = true;
4695 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4696 if (set->Get(i)) {
4697 if (first) {
4698 first = false;
4699 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004700 os_ << ", ";
Steve Blocka7e24c12009-10-30 11:49:00 +00004701 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004702 os_ << i;
Steve Blocka7e24c12009-10-30 11:49:00 +00004703 }
4704 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004705 os_ << "}\n";
Steve Blocka7e24c12009-10-30 11:49:00 +00004706}
4707
4708
4709void DispatchTable::Dump() {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004710 OFStream os(stderr);
4711 DispatchTableDumper dumper(os);
Steve Blocka7e24c12009-10-30 11:49:00 +00004712 tree()->ForEach(&dumper);
Steve Blocka7e24c12009-10-30 11:49:00 +00004713}
4714
4715
4716void RegExpEngine::DotPrint(const char* label,
4717 RegExpNode* node,
4718 bool ignore_case) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004719 OFStream os(stdout);
4720 DotPrinter printer(os, ignore_case);
Steve Blocka7e24c12009-10-30 11:49:00 +00004721 printer.PrintNode(label, node);
4722}
4723
4724
4725#endif // DEBUG
4726
4727
4728// -------------------------------------------------------------------
4729// Tree to graph conversion
4730
Steve Blocka7e24c12009-10-30 11:49:00 +00004731RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4732 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004733 ZoneList<TextElement>* elms =
4734 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4735 elms->Add(TextElement::Atom(this), compiler->zone());
4736 return new(compiler->zone()) TextNode(elms, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00004737}
4738
4739
4740RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4741 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004742 return new(compiler->zone()) TextNode(elements(), on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00004743}
4744
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004745
Steve Blocka7e24c12009-10-30 11:49:00 +00004746static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004747 const int* special_class,
Steve Blocka7e24c12009-10-30 11:49:00 +00004748 int length) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004749 length--; // Remove final 0x10000.
4750 DCHECK(special_class[length] == 0x10000);
4751 DCHECK(ranges->length() != 0);
4752 DCHECK(length != 0);
4753 DCHECK(special_class[0] != 0);
Steve Blocka7e24c12009-10-30 11:49:00 +00004754 if (ranges->length() != (length >> 1) + 1) {
4755 return false;
4756 }
4757 CharacterRange range = ranges->at(0);
4758 if (range.from() != 0) {
4759 return false;
4760 }
4761 for (int i = 0; i < length; i += 2) {
4762 if (special_class[i] != (range.to() + 1)) {
4763 return false;
4764 }
4765 range = ranges->at((i >> 1) + 1);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004766 if (special_class[i+1] != range.from()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004767 return false;
4768 }
4769 }
4770 if (range.to() != 0xffff) {
4771 return false;
4772 }
4773 return true;
4774}
4775
4776
4777static bool CompareRanges(ZoneList<CharacterRange>* ranges,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004778 const int* special_class,
Steve Blocka7e24c12009-10-30 11:49:00 +00004779 int length) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004780 length--; // Remove final 0x10000.
4781 DCHECK(special_class[length] == 0x10000);
Steve Blocka7e24c12009-10-30 11:49:00 +00004782 if (ranges->length() * 2 != length) {
4783 return false;
4784 }
4785 for (int i = 0; i < length; i += 2) {
4786 CharacterRange range = ranges->at(i >> 1);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004787 if (range.from() != special_class[i] ||
4788 range.to() != special_class[i + 1] - 1) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004789 return false;
4790 }
4791 }
4792 return true;
4793}
4794
4795
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004796bool RegExpCharacterClass::is_standard(Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004797 // TODO(lrn): Remove need for this function, by not throwing away information
4798 // along the way.
4799 if (is_negated_) {
4800 return false;
4801 }
4802 if (set_.is_standard()) {
4803 return true;
4804 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004805 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004806 set_.set_standard_set_type('s');
4807 return true;
4808 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004809 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
Steve Blocka7e24c12009-10-30 11:49:00 +00004810 set_.set_standard_set_type('S');
4811 return true;
4812 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004813 if (CompareInverseRanges(set_.ranges(zone),
Steve Blocka7e24c12009-10-30 11:49:00 +00004814 kLineTerminatorRanges,
4815 kLineTerminatorRangeCount)) {
4816 set_.set_standard_set_type('.');
4817 return true;
4818 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004819 if (CompareRanges(set_.ranges(zone),
Leon Clarkee46be812010-01-19 14:06:41 +00004820 kLineTerminatorRanges,
4821 kLineTerminatorRangeCount)) {
4822 set_.set_standard_set_type('n');
4823 return true;
4824 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004825 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
Leon Clarkee46be812010-01-19 14:06:41 +00004826 set_.set_standard_set_type('w');
4827 return true;
4828 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004829 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
Leon Clarkee46be812010-01-19 14:06:41 +00004830 set_.set_standard_set_type('W');
4831 return true;
4832 }
Steve Blocka7e24c12009-10-30 11:49:00 +00004833 return false;
4834}
4835
4836
4837RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4838 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004839 return new(compiler->zone()) TextNode(this, on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00004840}
4841
4842
4843RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4844 RegExpNode* on_success) {
4845 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4846 int length = alternatives->length();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004847 ChoiceNode* result =
4848 new(compiler->zone()) ChoiceNode(length, compiler->zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00004849 for (int i = 0; i < length; i++) {
4850 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4851 on_success));
4852 result->AddAlternative(alternative);
4853 }
4854 return result;
4855}
4856
4857
4858RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4859 RegExpNode* on_success) {
4860 return ToNode(min(),
4861 max(),
4862 is_greedy(),
4863 body(),
4864 compiler,
4865 on_success);
4866}
4867
4868
Ben Murdoch257744e2011-11-30 15:57:28 +00004869// Scoped object to keep track of how much we unroll quantifier loops in the
4870// regexp graph generator.
4871class RegExpExpansionLimiter {
4872 public:
4873 static const int kMaxExpansionFactor = 6;
4874 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4875 : compiler_(compiler),
4876 saved_expansion_factor_(compiler->current_expansion_factor()),
4877 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004878 DCHECK(factor > 0);
Ben Murdoch257744e2011-11-30 15:57:28 +00004879 if (ok_to_expand_) {
4880 if (factor > kMaxExpansionFactor) {
4881 // Avoid integer overflow of the current expansion factor.
4882 ok_to_expand_ = false;
4883 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4884 } else {
4885 int new_factor = saved_expansion_factor_ * factor;
4886 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4887 compiler->set_current_expansion_factor(new_factor);
4888 }
4889 }
4890 }
4891
4892 ~RegExpExpansionLimiter() {
4893 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4894 }
4895
4896 bool ok_to_expand() { return ok_to_expand_; }
4897
4898 private:
4899 RegExpCompiler* compiler_;
4900 int saved_expansion_factor_;
4901 bool ok_to_expand_;
4902
4903 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4904};
4905
4906
Steve Blocka7e24c12009-10-30 11:49:00 +00004907RegExpNode* RegExpQuantifier::ToNode(int min,
4908 int max,
4909 bool is_greedy,
4910 RegExpTree* body,
4911 RegExpCompiler* compiler,
4912 RegExpNode* on_success,
4913 bool not_at_start) {
4914 // x{f, t} becomes this:
4915 //
4916 // (r++)<-.
4917 // | `
4918 // | (x)
4919 // v ^
4920 // (r=0)-->(?)---/ [if r < t]
4921 // |
4922 // [if r >= f] \----> ...
4923 //
4924
4925 // 15.10.2.5 RepeatMatcher algorithm.
4926 // The parser has already eliminated the case where max is 0. In the case
4927 // where max_match is zero the parser has removed the quantifier if min was
4928 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4929
4930 // If we know that we cannot match zero length then things are a little
4931 // simpler since we don't need to make the special zero length match check
4932 // from step 2.1. If the min and max are small we can unroll a little in
4933 // this case.
4934 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4935 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4936 if (max == 0) return on_success; // This can happen due to recursion.
4937 bool body_can_be_empty = (body->min_match() == 0);
4938 int body_start_reg = RegExpCompiler::kNoRegister;
4939 Interval capture_registers = body->CaptureRegisters();
4940 bool needs_capture_clearing = !capture_registers.is_empty();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004941 Zone* zone = compiler->zone();
4942
Steve Blocka7e24c12009-10-30 11:49:00 +00004943 if (body_can_be_empty) {
4944 body_start_reg = compiler->AllocateRegister();
4945 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4946 // Only unroll if there are no captures and the body can't be
4947 // empty.
Ben Murdoch257744e2011-11-30 15:57:28 +00004948 {
4949 RegExpExpansionLimiter limiter(
4950 compiler, min + ((max != min) ? 1 : 0));
4951 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4952 int new_max = (max == kInfinity) ? max : max - min;
4953 // Recurse once to get the loop or optional matches after the fixed
4954 // ones.
4955 RegExpNode* answer = ToNode(
4956 0, new_max, is_greedy, body, compiler, on_success, true);
4957 // Unroll the forced matches from 0 to min. This can cause chains of
4958 // TextNodes (which the parser does not generate). These should be
4959 // combined if it turns out they hinder good code generation.
4960 for (int i = 0; i < min; i++) {
4961 answer = body->ToNode(compiler, answer);
Steve Blocka7e24c12009-10-30 11:49:00 +00004962 }
Ben Murdoch257744e2011-11-30 15:57:28 +00004963 return answer;
Steve Blocka7e24c12009-10-30 11:49:00 +00004964 }
Ben Murdoch257744e2011-11-30 15:57:28 +00004965 }
4966 if (max <= kMaxUnrolledMaxMatches && min == 0) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004967 DCHECK(max > 0); // Due to the 'if' above.
Ben Murdoch257744e2011-11-30 15:57:28 +00004968 RegExpExpansionLimiter limiter(compiler, max);
4969 if (limiter.ok_to_expand()) {
4970 // Unroll the optional matches up to max.
4971 RegExpNode* answer = on_success;
4972 for (int i = 0; i < max; i++) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004973 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
Ben Murdoch257744e2011-11-30 15:57:28 +00004974 if (is_greedy) {
4975 alternation->AddAlternative(
4976 GuardedAlternative(body->ToNode(compiler, answer)));
4977 alternation->AddAlternative(GuardedAlternative(on_success));
4978 } else {
4979 alternation->AddAlternative(GuardedAlternative(on_success));
4980 alternation->AddAlternative(
4981 GuardedAlternative(body->ToNode(compiler, answer)));
4982 }
4983 answer = alternation;
4984 if (not_at_start) alternation->set_not_at_start();
4985 }
4986 return answer;
4987 }
Steve Blocka7e24c12009-10-30 11:49:00 +00004988 }
4989 }
4990 bool has_min = min > 0;
4991 bool has_max = max < RegExpTree::kInfinity;
4992 bool needs_counter = has_min || has_max;
4993 int reg_ctr = needs_counter
4994 ? compiler->AllocateRegister()
4995 : RegExpCompiler::kNoRegister;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00004996 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4997 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00004998 if (not_at_start) center->set_not_at_start();
4999 RegExpNode* loop_return = needs_counter
5000 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5001 : static_cast<RegExpNode*>(center);
5002 if (body_can_be_empty) {
5003 // If the body can be empty we need to check if it was and then
5004 // backtrack.
5005 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5006 reg_ctr,
5007 min,
5008 loop_return);
5009 }
5010 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5011 if (body_can_be_empty) {
5012 // If the body can be empty we need to store the start position
5013 // so we can bail out if it was empty.
5014 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5015 }
5016 if (needs_capture_clearing) {
5017 // Before entering the body of this loop we need to clear captures.
5018 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5019 }
5020 GuardedAlternative body_alt(body_node);
5021 if (has_max) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005022 Guard* body_guard =
5023 new(zone) Guard(reg_ctr, Guard::LT, max);
5024 body_alt.AddGuard(body_guard, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005025 }
5026 GuardedAlternative rest_alt(on_success);
5027 if (has_min) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005028 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5029 rest_alt.AddGuard(rest_guard, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005030 }
5031 if (is_greedy) {
5032 center->AddLoopAlternative(body_alt);
5033 center->AddContinueAlternative(rest_alt);
5034 } else {
5035 center->AddContinueAlternative(rest_alt);
5036 center->AddLoopAlternative(body_alt);
5037 }
5038 if (needs_counter) {
5039 return ActionNode::SetRegister(reg_ctr, 0, center);
5040 } else {
5041 return center;
5042 }
5043}
5044
5045
5046RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5047 RegExpNode* on_success) {
5048 NodeInfo info;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005049 Zone* zone = compiler->zone();
5050
5051 switch (assertion_type()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005052 case START_OF_LINE:
5053 return AssertionNode::AfterNewline(on_success);
5054 case START_OF_INPUT:
5055 return AssertionNode::AtStart(on_success);
5056 case BOUNDARY:
5057 return AssertionNode::AtBoundary(on_success);
5058 case NON_BOUNDARY:
5059 return AssertionNode::AtNonBoundary(on_success);
5060 case END_OF_INPUT:
5061 return AssertionNode::AtEnd(on_success);
5062 case END_OF_LINE: {
5063 // Compile $ in multiline regexps as an alternation with a positive
5064 // lookahead in one side and an end-of-input on the other side.
5065 // We need two registers for the lookahead.
5066 int stack_pointer_register = compiler->AllocateRegister();
5067 int position_register = compiler->AllocateRegister();
5068 // The ChoiceNode to distinguish between a newline and end-of-input.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005069 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005070 // Create a newline atom.
5071 ZoneList<CharacterRange>* newline_ranges =
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005072 new(zone) ZoneList<CharacterRange>(3, zone);
5073 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5074 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5075 TextNode* newline_matcher = new(zone) TextNode(
Steve Blocka7e24c12009-10-30 11:49:00 +00005076 newline_atom,
5077 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5078 position_register,
5079 0, // No captures inside.
5080 -1, // Ignored if no captures.
5081 on_success));
5082 // Create an end-of-input matcher.
5083 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5084 stack_pointer_register,
5085 position_register,
5086 newline_matcher);
5087 // Add the two alternatives to the ChoiceNode.
5088 GuardedAlternative eol_alternative(end_of_line);
5089 result->AddAlternative(eol_alternative);
5090 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5091 result->AddAlternative(end_alternative);
5092 return result;
5093 }
5094 default:
5095 UNREACHABLE();
5096 }
5097 return on_success;
5098}
5099
5100
5101RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5102 RegExpNode* on_success) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005103 return new(compiler->zone())
5104 BackReferenceNode(RegExpCapture::StartRegister(index()),
5105 RegExpCapture::EndRegister(index()),
5106 on_success);
Steve Blocka7e24c12009-10-30 11:49:00 +00005107}
5108
5109
5110RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5111 RegExpNode* on_success) {
5112 return on_success;
5113}
5114
5115
5116RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5117 RegExpNode* on_success) {
5118 int stack_pointer_register = compiler->AllocateRegister();
5119 int position_register = compiler->AllocateRegister();
5120
5121 const int registers_per_capture = 2;
5122 const int register_of_first_capture = 2;
5123 int register_count = capture_count_ * registers_per_capture;
5124 int register_start =
5125 register_of_first_capture + capture_from_ * registers_per_capture;
5126
5127 RegExpNode* success;
5128 if (is_positive()) {
5129 RegExpNode* node = ActionNode::BeginSubmatch(
5130 stack_pointer_register,
5131 position_register,
5132 body()->ToNode(
5133 compiler,
5134 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5135 position_register,
5136 register_count,
5137 register_start,
5138 on_success)));
5139 return node;
5140 } else {
5141 // We use a ChoiceNode for a negative lookahead because it has most of
5142 // the characteristics we need. It has the body of the lookahead as its
5143 // first alternative and the expression after the lookahead of the second
5144 // alternative. If the first alternative succeeds then the
5145 // NegativeSubmatchSuccess will unwind the stack including everything the
5146 // choice node set up and backtrack. If the first alternative fails then
5147 // the second alternative is tried, which is exactly the desired result
5148 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5149 // ChoiceNode that knows to ignore the first exit when calculating quick
5150 // checks.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005151 Zone* zone = compiler->zone();
5152
Steve Blocka7e24c12009-10-30 11:49:00 +00005153 GuardedAlternative body_alt(
5154 body()->ToNode(
5155 compiler,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005156 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5157 position_register,
5158 register_count,
5159 register_start,
5160 zone)));
Steve Blocka7e24c12009-10-30 11:49:00 +00005161 ChoiceNode* choice_node =
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005162 new(zone) NegativeLookaheadChoiceNode(body_alt,
5163 GuardedAlternative(on_success),
5164 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005165 return ActionNode::BeginSubmatch(stack_pointer_register,
5166 position_register,
5167 choice_node);
5168 }
5169}
5170
5171
5172RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5173 RegExpNode* on_success) {
5174 return ToNode(body(), index(), compiler, on_success);
5175}
5176
5177
5178RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5179 int index,
5180 RegExpCompiler* compiler,
5181 RegExpNode* on_success) {
5182 int start_reg = RegExpCapture::StartRegister(index);
5183 int end_reg = RegExpCapture::EndRegister(index);
5184 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5185 RegExpNode* body_node = body->ToNode(compiler, store_end);
5186 return ActionNode::StorePosition(start_reg, true, body_node);
5187}
5188
5189
5190RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5191 RegExpNode* on_success) {
5192 ZoneList<RegExpTree*>* children = nodes();
5193 RegExpNode* current = on_success;
5194 for (int i = children->length() - 1; i >= 0; i--) {
5195 current = children->at(i)->ToNode(compiler, current);
5196 }
5197 return current;
5198}
5199
5200
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005201static void AddClass(const int* elmv,
Steve Blocka7e24c12009-10-30 11:49:00 +00005202 int elmc,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005203 ZoneList<CharacterRange>* ranges,
5204 Zone* zone) {
5205 elmc--;
5206 DCHECK(elmv[elmc] == 0x10000);
Steve Blocka7e24c12009-10-30 11:49:00 +00005207 for (int i = 0; i < elmc; i += 2) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005208 DCHECK(elmv[i] < elmv[i + 1]);
5209 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005210 }
5211}
5212
5213
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005214static void AddClassNegated(const int *elmv,
Steve Blocka7e24c12009-10-30 11:49:00 +00005215 int elmc,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005216 ZoneList<CharacterRange>* ranges,
5217 Zone* zone) {
5218 elmc--;
5219 DCHECK(elmv[elmc] == 0x10000);
5220 DCHECK(elmv[0] != 0x0000);
5221 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
Steve Blocka7e24c12009-10-30 11:49:00 +00005222 uc16 last = 0x0000;
5223 for (int i = 0; i < elmc; i += 2) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005224 DCHECK(last <= elmv[i] - 1);
5225 DCHECK(elmv[i] < elmv[i + 1]);
5226 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5227 last = elmv[i + 1];
Steve Blocka7e24c12009-10-30 11:49:00 +00005228 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005229 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005230}
5231
5232
5233void CharacterRange::AddClassEscape(uc16 type,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005234 ZoneList<CharacterRange>* ranges,
5235 Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005236 switch (type) {
5237 case 's':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005238 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005239 break;
5240 case 'S':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005241 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005242 break;
5243 case 'w':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005244 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005245 break;
5246 case 'W':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005247 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005248 break;
5249 case 'd':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005250 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005251 break;
5252 case 'D':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005253 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005254 break;
5255 case '.':
5256 AddClassNegated(kLineTerminatorRanges,
5257 kLineTerminatorRangeCount,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005258 ranges,
5259 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005260 break;
5261 // This is not a character range as defined by the spec but a
5262 // convenient shorthand for a character class that matches any
5263 // character.
5264 case '*':
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005265 ranges->Add(CharacterRange::Everything(), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005266 break;
5267 // This is the set of characters matched by the $ and ^ symbols
5268 // in multiline mode.
5269 case 'n':
5270 AddClass(kLineTerminatorRanges,
5271 kLineTerminatorRangeCount,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005272 ranges,
5273 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005274 break;
5275 default:
5276 UNREACHABLE();
5277 }
5278}
5279
5280
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005281Vector<const int> CharacterRange::GetWordBounds() {
5282 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
Steve Blocka7e24c12009-10-30 11:49:00 +00005283}
5284
5285
5286class CharacterRangeSplitter {
5287 public:
5288 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005289 ZoneList<CharacterRange>** excluded,
5290 Zone* zone)
Steve Blocka7e24c12009-10-30 11:49:00 +00005291 : included_(included),
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005292 excluded_(excluded),
5293 zone_(zone) { }
Steve Blocka7e24c12009-10-30 11:49:00 +00005294 void Call(uc16 from, DispatchTable::Entry entry);
5295
5296 static const int kInBase = 0;
5297 static const int kInOverlay = 1;
5298
5299 private:
5300 ZoneList<CharacterRange>** included_;
5301 ZoneList<CharacterRange>** excluded_;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005302 Zone* zone_;
Steve Blocka7e24c12009-10-30 11:49:00 +00005303};
5304
5305
5306void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5307 if (!entry.out_set()->Get(kInBase)) return;
5308 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5309 ? included_
5310 : excluded_;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005311 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5312 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
Steve Blocka7e24c12009-10-30 11:49:00 +00005313}
5314
5315
5316void CharacterRange::Split(ZoneList<CharacterRange>* base,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005317 Vector<const int> overlay,
Steve Blocka7e24c12009-10-30 11:49:00 +00005318 ZoneList<CharacterRange>** included,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005319 ZoneList<CharacterRange>** excluded,
5320 Zone* zone) {
5321 DCHECK_EQ(NULL, *included);
5322 DCHECK_EQ(NULL, *excluded);
5323 DispatchTable table(zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005324 for (int i = 0; i < base->length(); i++)
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005325 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005326 for (int i = 0; i < overlay.length(); i += 2) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005327 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5328 CharacterRangeSplitter::kInOverlay, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005329 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005330 CharacterRangeSplitter callback(included, excluded, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005331 table.ForEach(&callback);
5332}
5333
5334
Steve Blockd0582a62009-12-15 09:54:21 +00005335void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005336 bool is_one_byte, Zone* zone) {
5337 Isolate* isolate = zone->isolate();
Steve Blockd0582a62009-12-15 09:54:21 +00005338 uc16 bottom = from();
5339 uc16 top = to();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005340 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5341 if (bottom > String::kMaxOneByteCharCode) return;
5342 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Steve Blockd0582a62009-12-15 09:54:21 +00005343 }
Steve Blocka7e24c12009-10-30 11:49:00 +00005344 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Blockd0582a62009-12-15 09:54:21 +00005345 if (top == bottom) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005346 // If this is a singleton we just expand the one character.
Steve Block44f0eee2011-05-26 01:26:41 +01005347 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Steve Blocka7e24c12009-10-30 11:49:00 +00005348 for (int i = 0; i < length; i++) {
5349 uc32 chr = chars[i];
Steve Blockd0582a62009-12-15 09:54:21 +00005350 if (chr != bottom) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005351 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005352 }
5353 }
Ben Murdochbb769b22010-08-11 14:56:33 +01005354 } else {
Steve Blocka7e24c12009-10-30 11:49:00 +00005355 // If this is a range we expand the characters block by block,
5356 // expanding contiguous subranges (blocks) one at a time.
5357 // The approach is as follows. For a given start character we
Ben Murdochbb769b22010-08-11 14:56:33 +01005358 // look up the remainder of the block that contains it (represented
5359 // by the end point), for instance we find 'z' if the character
5360 // is 'c'. A block is characterized by the property
5361 // that all characters uncanonicalize in the same way, except that
5362 // each entry in the result is incremented by the distance from the first
5363 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5364 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5365 // Once we've found the end point we look up its uncanonicalization
Steve Blocka7e24c12009-10-30 11:49:00 +00005366 // and produce a range for each element. For instance for [c-f]
Ben Murdochbb769b22010-08-11 14:56:33 +01005367 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
Steve Blocka7e24c12009-10-30 11:49:00 +00005368 // add a range if it is not already contained in the input, so [c-f]
5369 // will be skipped but [C-F] will be added. If this range is not
5370 // completely contained in a block we do this for all the blocks
Ben Murdochbb769b22010-08-11 14:56:33 +01005371 // covered by the range (handling characters that is not in a block
5372 // as a "singleton block").
Steve Blocka7e24c12009-10-30 11:49:00 +00005373 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Blockd0582a62009-12-15 09:54:21 +00005374 int pos = bottom;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005375 while (pos <= top) {
Steve Block44f0eee2011-05-26 01:26:41 +01005376 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
Ben Murdochbb769b22010-08-11 14:56:33 +01005377 uc16 block_end;
Steve Blocka7e24c12009-10-30 11:49:00 +00005378 if (length == 0) {
Ben Murdochbb769b22010-08-11 14:56:33 +01005379 block_end = pos;
Steve Blocka7e24c12009-10-30 11:49:00 +00005380 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005381 DCHECK_EQ(1, length);
Ben Murdochbb769b22010-08-11 14:56:33 +01005382 block_end = range[0];
Steve Blocka7e24c12009-10-30 11:49:00 +00005383 }
Steve Blockd0582a62009-12-15 09:54:21 +00005384 int end = (block_end > top) ? top : block_end;
Steve Block44f0eee2011-05-26 01:26:41 +01005385 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
Steve Blocka7e24c12009-10-30 11:49:00 +00005386 for (int i = 0; i < length; i++) {
5387 uc32 c = range[i];
Ben Murdochbb769b22010-08-11 14:56:33 +01005388 uc16 range_from = c - (block_end - pos);
5389 uc16 range_to = c - (block_end - end);
Steve Blockd0582a62009-12-15 09:54:21 +00005390 if (!(bottom <= range_from && range_to <= top)) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005391 ranges->Add(CharacterRange(range_from, range_to), zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005392 }
5393 }
Ben Murdochbb769b22010-08-11 14:56:33 +01005394 pos = end + 1;
Steve Blocka7e24c12009-10-30 11:49:00 +00005395 }
Steve Blockd0582a62009-12-15 09:54:21 +00005396 }
5397}
5398
5399
Leon Clarkee46be812010-01-19 14:06:41 +00005400bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005401 DCHECK_NOT_NULL(ranges);
Leon Clarkee46be812010-01-19 14:06:41 +00005402 int n = ranges->length();
5403 if (n <= 1) return true;
5404 int max = ranges->at(0).to();
5405 for (int i = 1; i < n; i++) {
5406 CharacterRange next_range = ranges->at(i);
5407 if (next_range.from() <= max + 1) return false;
5408 max = next_range.to();
5409 }
5410 return true;
5411}
5412
Leon Clarkee46be812010-01-19 14:06:41 +00005413
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005414ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005415 if (ranges_ == NULL) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005416 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5417 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005418 }
5419 return ranges_;
5420}
5421
5422
Leon Clarkee46be812010-01-19 14:06:41 +00005423// Move a number of elements in a zonelist to another position
5424// in the same list. Handles overlapping source and target areas.
5425static void MoveRanges(ZoneList<CharacterRange>* list,
5426 int from,
5427 int to,
5428 int count) {
5429 // Ranges are potentially overlapping.
5430 if (from < to) {
5431 for (int i = count - 1; i >= 0; i--) {
5432 list->at(to + i) = list->at(from + i);
5433 }
5434 } else {
5435 for (int i = 0; i < count; i++) {
5436 list->at(to + i) = list->at(from + i);
5437 }
5438 }
5439}
5440
5441
5442static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5443 int count,
5444 CharacterRange insert) {
5445 // Inserts a range into list[0..count[, which must be sorted
5446 // by from value and non-overlapping and non-adjacent, using at most
5447 // list[0..count] for the result. Returns the number of resulting
5448 // canonicalized ranges. Inserting a range may collapse existing ranges into
5449 // fewer ranges, so the return value can be anything in the range 1..count+1.
5450 uc16 from = insert.from();
5451 uc16 to = insert.to();
5452 int start_pos = 0;
5453 int end_pos = count;
5454 for (int i = count - 1; i >= 0; i--) {
5455 CharacterRange current = list->at(i);
5456 if (current.from() > to + 1) {
5457 end_pos = i;
5458 } else if (current.to() + 1 < from) {
5459 start_pos = i + 1;
5460 break;
5461 }
5462 }
5463
5464 // Inserted range overlaps, or is adjacent to, ranges at positions
5465 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5466 // not affected by the insertion.
5467 // If start_pos == end_pos, the range must be inserted before start_pos.
5468 // if start_pos < end_pos, the entire range from start_pos to end_pos
5469 // must be merged with the insert range.
5470
5471 if (start_pos == end_pos) {
5472 // Insert between existing ranges at position start_pos.
5473 if (start_pos < count) {
5474 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5475 }
5476 list->at(start_pos) = insert;
5477 return count + 1;
5478 }
5479 if (start_pos + 1 == end_pos) {
5480 // Replace single existing range at position start_pos.
5481 CharacterRange to_replace = list->at(start_pos);
5482 int new_from = Min(to_replace.from(), from);
5483 int new_to = Max(to_replace.to(), to);
5484 list->at(start_pos) = CharacterRange(new_from, new_to);
5485 return count;
5486 }
5487 // Replace a number of existing ranges from start_pos to end_pos - 1.
5488 // Move the remaining ranges down.
5489
5490 int new_from = Min(list->at(start_pos).from(), from);
5491 int new_to = Max(list->at(end_pos - 1).to(), to);
5492 if (end_pos < count) {
5493 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5494 }
5495 list->at(start_pos) = CharacterRange(new_from, new_to);
5496 return count - (end_pos - start_pos) + 1;
5497}
5498
5499
5500void CharacterSet::Canonicalize() {
5501 // Special/default classes are always considered canonical. The result
5502 // of calling ranges() will be sorted.
5503 if (ranges_ == NULL) return;
5504 CharacterRange::Canonicalize(ranges_);
5505}
5506
5507
5508void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5509 if (character_ranges->length() <= 1) return;
5510 // Check whether ranges are already canonical (increasing, non-overlapping,
5511 // non-adjacent).
5512 int n = character_ranges->length();
5513 int max = character_ranges->at(0).to();
5514 int i = 1;
5515 while (i < n) {
5516 CharacterRange current = character_ranges->at(i);
5517 if (current.from() <= max + 1) {
5518 break;
5519 }
5520 max = current.to();
5521 i++;
5522 }
5523 // Canonical until the i'th range. If that's all of them, we are done.
5524 if (i == n) return;
5525
5526 // The ranges at index i and forward are not canonicalized. Make them so by
5527 // doing the equivalent of insertion sort (inserting each into the previous
5528 // list, in order).
5529 // Notice that inserting a range can reduce the number of ranges in the
5530 // result due to combining of adjacent and overlapping ranges.
5531 int read = i; // Range to insert.
5532 int num_canonical = i; // Length of canonicalized part of list.
5533 do {
5534 num_canonical = InsertRangeInCanonicalList(character_ranges,
5535 num_canonical,
5536 character_ranges->at(read));
5537 read++;
5538 } while (read < n);
5539 character_ranges->Rewind(num_canonical);
5540
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005541 DCHECK(CharacterRange::IsCanonical(character_ranges));
Leon Clarkee46be812010-01-19 14:06:41 +00005542}
5543
5544
5545void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005546 ZoneList<CharacterRange>* negated_ranges,
5547 Zone* zone) {
5548 DCHECK(CharacterRange::IsCanonical(ranges));
5549 DCHECK_EQ(0, negated_ranges->length());
Leon Clarkee46be812010-01-19 14:06:41 +00005550 int range_count = ranges->length();
5551 uc16 from = 0;
5552 int i = 0;
5553 if (range_count > 0 && ranges->at(0).from() == 0) {
5554 from = ranges->at(0).to();
5555 i = 1;
5556 }
5557 while (i < range_count) {
5558 CharacterRange range = ranges->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005559 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
Leon Clarkee46be812010-01-19 14:06:41 +00005560 from = range.to();
5561 i++;
5562 }
Ben Murdoch3ef787d2012-04-12 10:51:47 +01005563 if (from < String::kMaxUtf16CodeUnit) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005564 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5565 zone);
Leon Clarkee46be812010-01-19 14:06:41 +00005566 }
5567}
5568
5569
Steve Blocka7e24c12009-10-30 11:49:00 +00005570// -------------------------------------------------------------------
5571// Splay tree
5572
5573
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005574OutSet* OutSet::Extend(unsigned value, Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005575 if (Get(value))
5576 return this;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005577 if (successors(zone) != NULL) {
5578 for (int i = 0; i < successors(zone)->length(); i++) {
5579 OutSet* successor = successors(zone)->at(i);
Steve Blocka7e24c12009-10-30 11:49:00 +00005580 if (successor->Get(value))
5581 return successor;
5582 }
5583 } else {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005584 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005585 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005586 OutSet* result = new(zone) OutSet(first_, remaining_);
5587 result->Set(value, zone);
5588 successors(zone)->Add(result, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005589 return result;
5590}
5591
5592
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005593void OutSet::Set(unsigned value, Zone *zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005594 if (value < kFirstLimit) {
5595 first_ |= (1 << value);
5596 } else {
5597 if (remaining_ == NULL)
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005598 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005599 if (remaining_->is_empty() || !remaining_->Contains(value))
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005600 remaining_->Add(value, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005601 }
5602}
5603
5604
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005605bool OutSet::Get(unsigned value) const {
Steve Blocka7e24c12009-10-30 11:49:00 +00005606 if (value < kFirstLimit) {
5607 return (first_ & (1 << value)) != 0;
5608 } else if (remaining_ == NULL) {
5609 return false;
5610 } else {
5611 return remaining_->Contains(value);
5612 }
5613}
5614
5615
5616const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Steve Blocka7e24c12009-10-30 11:49:00 +00005617
5618
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005619void DispatchTable::AddRange(CharacterRange full_range, int value,
5620 Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005621 CharacterRange current = full_range;
5622 if (tree()->is_empty()) {
5623 // If this is the first range we just insert into the table.
5624 ZoneSplayTree<Config>::Locator loc;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005625 DCHECK_RESULT(tree()->Insert(current.from(), &loc));
5626 loc.set_value(Entry(current.from(), current.to(),
5627 empty()->Extend(value, zone)));
Steve Blocka7e24c12009-10-30 11:49:00 +00005628 return;
5629 }
5630 // First see if there is a range to the left of this one that
5631 // overlaps.
5632 ZoneSplayTree<Config>::Locator loc;
5633 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5634 Entry* entry = &loc.value();
5635 // If we've found a range that overlaps with this one, and it
5636 // starts strictly to the left of this one, we have to fix it
5637 // because the following code only handles ranges that start on
5638 // or after the start point of the range we're adding.
5639 if (entry->from() < current.from() && entry->to() >= current.from()) {
5640 // Snap the overlapping range in half around the start point of
5641 // the range we're adding.
5642 CharacterRange left(entry->from(), current.from() - 1);
5643 CharacterRange right(current.from(), entry->to());
5644 // The left part of the overlapping range doesn't overlap.
5645 // Truncate the whole entry to be just the left part.
5646 entry->set_to(left.to());
5647 // The right part is the one that overlaps. We add this part
5648 // to the map and let the next step deal with merging it with
5649 // the range we're adding.
5650 ZoneSplayTree<Config>::Locator loc;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005651 DCHECK_RESULT(tree()->Insert(right.from(), &loc));
Steve Blocka7e24c12009-10-30 11:49:00 +00005652 loc.set_value(Entry(right.from(),
5653 right.to(),
5654 entry->out_set()));
5655 }
5656 }
5657 while (current.is_valid()) {
5658 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5659 (loc.value().from() <= current.to()) &&
5660 (loc.value().to() >= current.from())) {
5661 Entry* entry = &loc.value();
5662 // We have overlap. If there is space between the start point of
5663 // the range we're adding and where the overlapping range starts
5664 // then we have to add a range covering just that space.
5665 if (current.from() < entry->from()) {
5666 ZoneSplayTree<Config>::Locator ins;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005667 DCHECK_RESULT(tree()->Insert(current.from(), &ins));
Steve Blocka7e24c12009-10-30 11:49:00 +00005668 ins.set_value(Entry(current.from(),
5669 entry->from() - 1,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005670 empty()->Extend(value, zone)));
Steve Blocka7e24c12009-10-30 11:49:00 +00005671 current.set_from(entry->from());
5672 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005673 DCHECK_EQ(current.from(), entry->from());
Steve Blocka7e24c12009-10-30 11:49:00 +00005674 // If the overlapping range extends beyond the one we want to add
5675 // we have to snap the right part off and add it separately.
5676 if (entry->to() > current.to()) {
5677 ZoneSplayTree<Config>::Locator ins;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005678 DCHECK_RESULT(tree()->Insert(current.to() + 1, &ins));
Steve Blocka7e24c12009-10-30 11:49:00 +00005679 ins.set_value(Entry(current.to() + 1,
5680 entry->to(),
5681 entry->out_set()));
5682 entry->set_to(current.to());
5683 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005684 DCHECK(entry->to() <= current.to());
Steve Blocka7e24c12009-10-30 11:49:00 +00005685 // The overlapping range is now completely contained by the range
5686 // we're adding so we can just update it and move the start point
5687 // of the range we're adding just past it.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005688 entry->AddValue(value, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00005689 // Bail out if the last interval ended at 0xFFFF since otherwise
5690 // adding 1 will wrap around to 0.
Ben Murdoch3ef787d2012-04-12 10:51:47 +01005691 if (entry->to() == String::kMaxUtf16CodeUnit)
Steve Blocka7e24c12009-10-30 11:49:00 +00005692 break;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005693 DCHECK(entry->to() + 1 > current.from());
Steve Blocka7e24c12009-10-30 11:49:00 +00005694 current.set_from(entry->to() + 1);
5695 } else {
5696 // There is no overlap so we can just add the range
5697 ZoneSplayTree<Config>::Locator ins;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005698 DCHECK_RESULT(tree()->Insert(current.from(), &ins));
Steve Blocka7e24c12009-10-30 11:49:00 +00005699 ins.set_value(Entry(current.from(),
5700 current.to(),
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005701 empty()->Extend(value, zone)));
Steve Blocka7e24c12009-10-30 11:49:00 +00005702 break;
5703 }
5704 }
5705}
5706
5707
5708OutSet* DispatchTable::Get(uc16 value) {
5709 ZoneSplayTree<Config>::Locator loc;
5710 if (!tree()->FindGreatestLessThan(value, &loc))
5711 return empty();
5712 Entry* entry = &loc.value();
5713 if (value <= entry->to())
5714 return entry->out_set();
5715 else
5716 return empty();
5717}
5718
5719
5720// -------------------------------------------------------------------
5721// Analysis
5722
5723
5724void Analysis::EnsureAnalyzed(RegExpNode* that) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005725 StackLimitCheck check(that->zone()->isolate());
Steve Blocka7e24c12009-10-30 11:49:00 +00005726 if (check.HasOverflowed()) {
5727 fail("Stack overflow");
5728 return;
5729 }
5730 if (that->info()->been_analyzed || that->info()->being_analyzed)
5731 return;
5732 that->info()->being_analyzed = true;
5733 that->Accept(this);
5734 that->info()->being_analyzed = false;
5735 that->info()->been_analyzed = true;
5736}
5737
5738
5739void Analysis::VisitEnd(EndNode* that) {
5740 // nothing to do
5741}
5742
5743
5744void TextNode::CalculateOffsets() {
5745 int element_count = elements()->length();
5746 // Set up the offsets of the elements relative to the start. This is a fixed
5747 // quantity since a TextNode can only contain fixed-width things.
5748 int cp_offset = 0;
5749 for (int i = 0; i < element_count; i++) {
5750 TextElement& elm = elements()->at(i);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005751 elm.set_cp_offset(cp_offset);
5752 cp_offset += elm.length();
Steve Blocka7e24c12009-10-30 11:49:00 +00005753 }
5754}
5755
5756
5757void Analysis::VisitText(TextNode* that) {
5758 if (ignore_case_) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005759 that->MakeCaseIndependent(is_one_byte_);
Steve Blocka7e24c12009-10-30 11:49:00 +00005760 }
5761 EnsureAnalyzed(that->on_success());
5762 if (!has_failed()) {
5763 that->CalculateOffsets();
5764 }
5765}
5766
5767
5768void Analysis::VisitAction(ActionNode* that) {
5769 RegExpNode* target = that->on_success();
5770 EnsureAnalyzed(target);
5771 if (!has_failed()) {
5772 // If the next node is interested in what it follows then this node
5773 // has to be interested too so it can pass the information on.
5774 that->info()->AddFromFollowing(target->info());
5775 }
5776}
5777
5778
5779void Analysis::VisitChoice(ChoiceNode* that) {
5780 NodeInfo* info = that->info();
5781 for (int i = 0; i < that->alternatives()->length(); i++) {
5782 RegExpNode* node = that->alternatives()->at(i).node();
5783 EnsureAnalyzed(node);
5784 if (has_failed()) return;
5785 // Anything the following nodes need to know has to be known by
5786 // this node also, so it can pass it on.
5787 info->AddFromFollowing(node->info());
5788 }
5789}
5790
5791
5792void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5793 NodeInfo* info = that->info();
5794 for (int i = 0; i < that->alternatives()->length(); i++) {
5795 RegExpNode* node = that->alternatives()->at(i).node();
5796 if (node != that->loop_node()) {
5797 EnsureAnalyzed(node);
5798 if (has_failed()) return;
5799 info->AddFromFollowing(node->info());
5800 }
5801 }
5802 // Check the loop last since it may need the value of this node
5803 // to get a correct result.
5804 EnsureAnalyzed(that->loop_node());
5805 if (!has_failed()) {
5806 info->AddFromFollowing(that->loop_node()->info());
5807 }
5808}
5809
5810
5811void Analysis::VisitBackReference(BackReferenceNode* that) {
5812 EnsureAnalyzed(that->on_success());
5813}
5814
5815
5816void Analysis::VisitAssertion(AssertionNode* that) {
5817 EnsureAnalyzed(that->on_success());
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005818}
Leon Clarkee46be812010-01-19 14:06:41 +00005819
Leon Clarkee46be812010-01-19 14:06:41 +00005820
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005821void BackReferenceNode::FillInBMInfo(int offset,
5822 int budget,
5823 BoyerMooreLookahead* bm,
5824 bool not_at_start) {
5825 // Working out the set of characters that a backreference can match is too
5826 // hard, so we just say that any character can match.
5827 bm->SetRest(offset);
5828 SaveBMInfo(bm, not_at_start, offset);
5829}
5830
5831
5832STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5833 RegExpMacroAssembler::kTableSize);
5834
5835
5836void ChoiceNode::FillInBMInfo(int offset,
5837 int budget,
5838 BoyerMooreLookahead* bm,
5839 bool not_at_start) {
5840 ZoneList<GuardedAlternative>* alts = alternatives();
5841 budget = (budget - 1) / alts->length();
5842 for (int i = 0; i < alts->length(); i++) {
5843 GuardedAlternative& alt = alts->at(i);
5844 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5845 bm->SetRest(offset); // Give up trying to fill in info.
5846 SaveBMInfo(bm, not_at_start, offset);
5847 return;
Leon Clarkee46be812010-01-19 14:06:41 +00005848 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005849 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
Leon Clarkee46be812010-01-19 14:06:41 +00005850 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005851 SaveBMInfo(bm, not_at_start, offset);
Steve Blocka7e24c12009-10-30 11:49:00 +00005852}
5853
5854
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005855void TextNode::FillInBMInfo(int initial_offset,
5856 int budget,
5857 BoyerMooreLookahead* bm,
5858 bool not_at_start) {
5859 if (initial_offset >= bm->length()) return;
5860 int offset = initial_offset;
5861 int max_char = bm->max_char();
5862 for (int i = 0; i < elements()->length(); i++) {
5863 if (offset >= bm->length()) {
5864 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5865 return;
Leon Clarkee46be812010-01-19 14:06:41 +00005866 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005867 TextElement text = elements()->at(i);
5868 if (text.text_type() == TextElement::ATOM) {
5869 RegExpAtom* atom = text.atom();
5870 for (int j = 0; j < atom->length(); j++, offset++) {
5871 if (offset >= bm->length()) {
5872 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5873 return;
Leon Clarkee46be812010-01-19 14:06:41 +00005874 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005875 uc16 character = atom->data()[j];
5876 if (bm->compiler()->ignore_case()) {
5877 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5878 int length = GetCaseIndependentLetters(
5879 Isolate::Current(),
5880 character,
5881 bm->max_char() == String::kMaxOneByteCharCode,
5882 chars);
5883 for (int j = 0; j < length; j++) {
5884 bm->Set(offset, chars[j]);
Leon Clarkee46be812010-01-19 14:06:41 +00005885 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005886 } else {
5887 if (character <= max_char) bm->Set(offset, character);
Leon Clarkee46be812010-01-19 14:06:41 +00005888 }
Leon Clarkee46be812010-01-19 14:06:41 +00005889 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005890 } else {
5891 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
5892 RegExpCharacterClass* char_class = text.char_class();
5893 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5894 if (char_class->is_negated()) {
5895 bm->SetAll(offset);
5896 } else {
5897 for (int k = 0; k < ranges->length(); k++) {
5898 CharacterRange& range = ranges->at(k);
5899 if (range.from() > max_char) continue;
5900 int to = Min(max_char, static_cast<int>(range.to()));
5901 bm->SetInterval(offset, Interval(range.from(), to));
5902 }
5903 }
5904 offset++;
Leon Clarkee46be812010-01-19 14:06:41 +00005905 }
5906 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00005907 if (offset >= bm->length()) {
5908 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5909 return;
5910 }
5911 on_success()->FillInBMInfo(offset,
5912 budget - 1,
5913 bm,
5914 true); // Not at start after a text node.
5915 if (initial_offset == 0) set_bm_info(not_at_start, bm);
Leon Clarkee46be812010-01-19 14:06:41 +00005916}
5917
5918
Steve Blocka7e24c12009-10-30 11:49:00 +00005919// -------------------------------------------------------------------
5920// Dispatch table construction
5921
5922
5923void DispatchTableConstructor::VisitEnd(EndNode* that) {
5924 AddRange(CharacterRange::Everything());
5925}
5926
5927
5928void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5929 node->set_being_calculated(true);
5930 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5931 for (int i = 0; i < alternatives->length(); i++) {
5932 set_choice_index(i);
5933 alternatives->at(i).node()->Accept(this);
5934 }
5935 node->set_being_calculated(false);
5936}
5937
5938
5939class AddDispatchRange {
5940 public:
5941 explicit AddDispatchRange(DispatchTableConstructor* constructor)
5942 : constructor_(constructor) { }
5943 void Call(uc32 from, DispatchTable::Entry entry);
5944 private:
5945 DispatchTableConstructor* constructor_;
5946};
5947
5948
5949void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5950 CharacterRange range(from, entry.to());
5951 constructor_->AddRange(range);
5952}
5953
5954
5955void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5956 if (node->being_calculated())
5957 return;
5958 DispatchTable* table = node->GetTable(ignore_case_);
5959 AddDispatchRange adder(this);
5960 table->ForEach(&adder);
5961}
5962
5963
5964void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5965 // TODO(160): Find the node that we refer back to and propagate its start
5966 // set back to here. For now we just accept anything.
5967 AddRange(CharacterRange::Everything());
5968}
5969
5970
5971void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5972 RegExpNode* target = that->on_success();
5973 target->Accept(this);
5974}
5975
5976
Steve Blocka7e24c12009-10-30 11:49:00 +00005977static int CompareRangeByFrom(const CharacterRange* a,
5978 const CharacterRange* b) {
5979 return Compare<uc16>(a->from(), b->from());
5980}
5981
5982
5983void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5984 ranges->Sort(CompareRangeByFrom);
5985 uc16 last = 0;
5986 for (int i = 0; i < ranges->length(); i++) {
5987 CharacterRange range = ranges->at(i);
5988 if (last < range.from())
5989 AddRange(CharacterRange(last, range.from() - 1));
5990 if (range.to() >= last) {
Ben Murdoch3ef787d2012-04-12 10:51:47 +01005991 if (range.to() == String::kMaxUtf16CodeUnit) {
Steve Blocka7e24c12009-10-30 11:49:00 +00005992 return;
5993 } else {
5994 last = range.to() + 1;
5995 }
5996 }
5997 }
Ben Murdoch3ef787d2012-04-12 10:51:47 +01005998 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
Steve Blocka7e24c12009-10-30 11:49:00 +00005999}
6000
6001
6002void DispatchTableConstructor::VisitText(TextNode* that) {
6003 TextElement elm = that->elements()->at(0);
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006004 switch (elm.text_type()) {
Steve Blocka7e24c12009-10-30 11:49:00 +00006005 case TextElement::ATOM: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006006 uc16 c = elm.atom()->data()[0];
Steve Blocka7e24c12009-10-30 11:49:00 +00006007 AddRange(CharacterRange(c, c));
6008 break;
6009 }
6010 case TextElement::CHAR_CLASS: {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006011 RegExpCharacterClass* tree = elm.char_class();
6012 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
Steve Blocka7e24c12009-10-30 11:49:00 +00006013 if (tree->is_negated()) {
6014 AddInverse(ranges);
6015 } else {
6016 for (int i = 0; i < ranges->length(); i++)
6017 AddRange(ranges->at(i));
6018 }
6019 break;
6020 }
6021 default: {
6022 UNIMPLEMENTED();
6023 }
6024 }
6025}
6026
6027
6028void DispatchTableConstructor::VisitAction(ActionNode* that) {
6029 RegExpNode* target = that->on_success();
6030 target->Accept(this);
6031}
6032
6033
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006034RegExpEngine::CompilationResult RegExpEngine::Compile(
6035 RegExpCompileData* data, bool ignore_case, bool is_global,
6036 bool is_multiline, bool is_sticky, Handle<String> pattern,
6037 Handle<String> sample_subject, bool is_one_byte, Zone* zone) {
Steve Blocka7e24c12009-10-30 11:49:00 +00006038 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006039 return IrregexpRegExpTooBig(zone->isolate());
Steve Blocka7e24c12009-10-30 11:49:00 +00006040 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006041 RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone);
6042
6043 // Sample some characters from the middle of the string.
6044 static const int kSampleSize = 128;
6045
6046 sample_subject = String::Flatten(sample_subject);
6047 int chars_sampled = 0;
6048 int half_way = (sample_subject->length() - kSampleSize) / 2;
6049 for (int i = Max(0, half_way);
6050 i < sample_subject->length() && chars_sampled < kSampleSize;
6051 i++, chars_sampled++) {
6052 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6053 }
6054
Steve Blocka7e24c12009-10-30 11:49:00 +00006055 // Wrap the body of the regexp in capture #0.
6056 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6057 0,
6058 &compiler,
6059 compiler.accept());
6060 RegExpNode* node = captured_body;
Ben Murdochf87a2032010-10-22 12:50:53 +01006061 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6062 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6063 int max_length = data->tree->max_match();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006064 if (!is_start_anchored && !is_sticky) {
Steve Blocka7e24c12009-10-30 11:49:00 +00006065 // Add a .*? at the beginning, outside the body capture, unless
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006066 // this expression is anchored at the beginning or sticky.
Steve Blocka7e24c12009-10-30 11:49:00 +00006067 RegExpNode* loop_node =
6068 RegExpQuantifier::ToNode(0,
6069 RegExpTree::kInfinity,
6070 false,
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006071 new(zone) RegExpCharacterClass('*'),
Steve Blocka7e24c12009-10-30 11:49:00 +00006072 &compiler,
6073 captured_body,
6074 data->contains_anchor);
6075
6076 if (data->contains_anchor) {
6077 // Unroll loop once, to take care of the case that might start
6078 // at the start of input.
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006079 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00006080 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6081 first_step_node->AddAlternative(GuardedAlternative(
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006082 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
Steve Blocka7e24c12009-10-30 11:49:00 +00006083 node = first_step_node;
6084 } else {
6085 node = loop_node;
6086 }
6087 }
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006088 if (is_one_byte) {
6089 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6090 // Do it again to propagate the new nodes to places where they were not
6091 // put because they had not been calculated yet.
6092 if (node != NULL) {
6093 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6094 }
6095 }
6096
6097 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00006098 data->node = node;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006099 Analysis analysis(ignore_case, is_one_byte);
Steve Blocka7e24c12009-10-30 11:49:00 +00006100 analysis.EnsureAnalyzed(node);
6101 if (analysis.has_failed()) {
6102 const char* error_message = analysis.error_message();
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006103 return CompilationResult(zone->isolate(), error_message);
Steve Blocka7e24c12009-10-30 11:49:00 +00006104 }
6105
Steve Blocka7e24c12009-10-30 11:49:00 +00006106 // Create the correct assembler for the architecture.
Steve Block6ded16b2010-05-10 14:33:55 +01006107#ifndef V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +00006108 // Native regexp implementation.
6109
6110 NativeRegExpMacroAssembler::Mode mode =
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006111 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6112 : NativeRegExpMacroAssembler::UC16;
Steve Blocka7e24c12009-10-30 11:49:00 +00006113
6114#if V8_TARGET_ARCH_IA32
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006115 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6116 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00006117#elif V8_TARGET_ARCH_X64
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006118 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6119 zone);
Steve Blocka7e24c12009-10-30 11:49:00 +00006120#elif V8_TARGET_ARCH_ARM
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006121 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6122 zone);
6123#elif V8_TARGET_ARCH_ARM64
6124 RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2,
6125 zone);
Steve Block44f0eee2011-05-26 01:26:41 +01006126#elif V8_TARGET_ARCH_MIPS
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006127 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6128 zone);
6129#elif V8_TARGET_ARCH_MIPS64
6130 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6131 zone);
6132#elif V8_TARGET_ARCH_X87
6133 RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2,
6134 zone);
6135#else
6136#error "Unsupported architecture"
Steve Blocka7e24c12009-10-30 11:49:00 +00006137#endif
6138
Steve Block6ded16b2010-05-10 14:33:55 +01006139#else // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +00006140 // Interpreted regexp implementation.
6141 EmbeddedVector<byte, 1024> codes;
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006142 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
Steve Block6ded16b2010-05-10 14:33:55 +01006143#endif // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +00006144
Ben Murdochf87a2032010-10-22 12:50:53 +01006145 // Inserted here, instead of in Assembler, because it depends on information
6146 // in the AST that isn't replicated in the Node structure.
6147 static const int kMaxBacksearchLimit = 1024;
6148 if (is_end_anchored &&
6149 !is_start_anchored &&
6150 max_length < kMaxBacksearchLimit) {
6151 macro_assembler.SetCurrentPositionFromEnd(max_length);
6152 }
6153
Ben Murdochb8a8cc12014-11-26 15:28:44 +00006154 if (is_global) {
6155 macro_assembler.set_global_mode(
6156 (data->tree->min_match() > 0)
6157 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6158 : RegExpMacroAssembler::GLOBAL);
6159 }
6160
Steve Blocka7e24c12009-10-30 11:49:00 +00006161 return compiler.Assemble(&macro_assembler,
6162 node,
6163 data->capture_count,
6164 pattern);
6165}
6166
Leon Clarkee46be812010-01-19 14:06:41 +00006167
Steve Blocka7e24c12009-10-30 11:49:00 +00006168}} // namespace v8::internal