blob: 5da73985dec07c23de1a9413de1997486159d92f [file] [log] [blame]
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00001// Copyright 2012 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000031#include "compiler.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000032#include "execution.h"
33#include "factory.h"
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +000034#include "jsregexp.h"
mmassi@chromium.org2f0efde2013-02-06 14:12:58 +000035#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "platform.h"
kasperl@chromium.orga5551262010-12-07 12:49:48 +000037#include "string-search.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000038#include "runtime.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000039#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000040#include "string-stream.h"
41#include "parser.h"
42#include "regexp-macro-assembler.h"
43#include "regexp-macro-assembler-tracer.h"
44#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000045#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000046
ricow@chromium.orgc9c80822010-04-21 08:22:37 +000047#ifndef V8_INTERPRETED_REGEXP
kasperl@chromium.org71affb52009-05-26 05:44:31 +000048#if V8_TARGET_ARCH_IA32
ager@chromium.org3a37e9b2009-04-27 09:26:21 +000049#include "ia32/regexp-macro-assembler-ia32.h"
ager@chromium.org9085a012009-05-11 19:22:57 +000050#elif V8_TARGET_ARCH_X64
ager@chromium.org9085a012009-05-11 19:22:57 +000051#include "x64/regexp-macro-assembler-x64.h"
52#elif V8_TARGET_ARCH_ARM
53#include "arm/regexp-macro-assembler-arm.h"
lrn@chromium.org7516f052011-03-30 08:52:27 +000054#elif V8_TARGET_ARCH_MIPS
55#include "mips/regexp-macro-assembler-mips.h"
kasperl@chromium.org2abc4502009-07-02 07:00:29 +000056#else
57#error Unsupported target architecture.
ager@chromium.orga74f0da2008-12-03 16:05:52 +000058#endif
sgjesse@chromium.org911335c2009-08-19 12:59:44 +000059#endif
ager@chromium.orga74f0da2008-12-03 16:05:52 +000060
61#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000062
ager@chromium.orga74f0da2008-12-03 16:05:52 +000063
kasperl@chromium.org71affb52009-05-26 05:44:31 +000064namespace v8 {
65namespace internal {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000066
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000067Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
68 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000069 Handle<String> flags,
70 bool* has_pending_exception) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000071 // Call the construct code with 2 arguments.
svenpanne@chromium.orga8bb4d92011-10-10 13:20:40 +000072 Handle<Object> argv[] = { pattern, flags };
73 return Execution::New(constructor, ARRAY_SIZE(argv), argv,
74 has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000075}
76
77
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000078static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
79 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000080 for (int i = 0; i < str->length(); i++) {
81 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000082 case 'i':
83 flags |= JSRegExp::IGNORE_CASE;
84 break;
85 case 'g':
86 flags |= JSRegExp::GLOBAL;
87 break;
88 case 'm':
89 flags |= JSRegExp::MULTILINE;
90 break;
91 }
92 }
93 return JSRegExp::Flags(flags);
94}
95
96
ager@chromium.orga74f0da2008-12-03 16:05:52 +000097static inline void ThrowRegExpException(Handle<JSRegExp> re,
98 Handle<String> pattern,
99 Handle<String> error_text,
100 const char* message) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000101 Isolate* isolate = re->GetIsolate();
102 Factory* factory = isolate->factory();
103 Handle<FixedArray> elements = factory->NewFixedArray(2);
karlklose@chromium.org8f806e82011-03-07 14:06:08 +0000104 elements->set(0, *pattern);
105 elements->set(1, *error_text);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000106 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
107 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
108 isolate->Throw(*regexp_err);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000109}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000110
111
erik.corry@gmail.comed49e962012-04-17 11:57:53 +0000112ContainedInLattice AddRange(ContainedInLattice containment,
113 const int* ranges,
114 int ranges_length,
115 Interval new_range) {
116 ASSERT((ranges_length & 1) == 1);
117 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
118 if (containment == kLatticeUnknown) return containment;
119 bool inside = false;
120 int last = 0;
121 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
122 // Consider the range from last to ranges[i].
123 // We haven't got to the new range yet.
124 if (ranges[i] <= new_range.from()) continue;
125 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
126 // inclusive, but the values in ranges are not.
127 if (last <= new_range.from() && new_range.to() < ranges[i]) {
128 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
129 }
130 return kLatticeUnknown;
131 }
132 return containment;
133}
134
135
// Upper bound on the lookahead used for Boyer-Moore. A larger value makes
// code generation slower; a smaller value lowers the V8 benchmark score.
const int kMaxLookaheadForBoyerMoore = 8;
// In a 3-character pattern the search can step forwards by at most 3
// characters at a time, which is not always enough to pay for the extra
// logic.
const int kPatternTooShortForBoyerMoore = 2;
141
142
143// Identifies the sort of regexps where the regexp engine is faster
144// than the code used for atom matches.
145static bool HasFewDifferentCharacters(Handle<String> pattern) {
146 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
147 if (length <= kPatternTooShortForBoyerMoore) return false;
148 const int kMod = 128;
149 bool character_found[kMod];
150 int different = 0;
151 memset(&character_found[0], 0, sizeof(character_found));
152 for (int i = 0; i < length; i++) {
153 int ch = (pattern->Get(i) & (kMod - 1));
154 if (!character_found[ch]) {
155 character_found[ch] = true;
156 different++;
157 // We declare a regexp low-alphabet if it has at least 3 times as many
158 // characters as it has different characters.
159 if (different * 3 > length) return false;
160 }
161 }
162 return true;
163}
164
165
// Generic RegExp methods. These dispatch to implementation-specific methods.
167
168
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000169Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
170 Handle<String> pattern,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000171 Handle<String> flag_str) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000172 Isolate* isolate = re->GetIsolate();
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000173 Zone zone(isolate);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000174 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000175 CompilationCache* compilation_cache = isolate->compilation_cache();
176 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000177 bool in_cache = !cached.is_null();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000178 LOG(isolate, RegExpCompileEvent(re, in_cache));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000179
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000180 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000181 if (in_cache) {
182 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000183 return re;
184 }
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000185 pattern = FlattenGetString(pattern);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000186 PostponeInterruptsScope postpone(isolate);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000187 RegExpCompileData parse_result;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000188 FlatStringReader reader(isolate, pattern);
fschneider@chromium.orge03fb642010-11-01 12:34:09 +0000189 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000190 &parse_result, &zone)) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000191 // Throw an exception if we fail to parse the pattern.
192 ThrowRegExpException(re,
193 pattern,
194 parse_result.error,
195 "malformed_regexp");
196 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000197 }
198
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000199 bool has_been_compiled = false;
200
201 if (parse_result.simple &&
202 !flags.is_ignore_case() &&
203 !HasFewDifferentCharacters(pattern)) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000204 // Parse-tree is a single atom that is equal to the pattern.
205 AtomCompile(re, pattern, flags, pattern);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000206 has_been_compiled = true;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000207 } else if (parse_result.tree->IsAtom() &&
208 !flags.is_ignore_case() &&
209 parse_result.capture_count == 0) {
210 RegExpAtom* atom = parse_result.tree->AsAtom();
211 Vector<const uc16> atom_pattern = atom->data();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000212 Handle<String> atom_string =
213 isolate->factory()->NewStringFromTwoByte(atom_pattern);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000214 if (!HasFewDifferentCharacters(atom_string)) {
215 AtomCompile(re, pattern, flags, atom_string);
216 has_been_compiled = true;
217 }
218 }
219 if (!has_been_compiled) {
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000220 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000221 }
222 ASSERT(re->data()->IsFixedArray());
223 // Compilation succeeded so the data is set on the regexp
224 // and we can store it in the cache.
225 Handle<FixedArray> data(FixedArray::cast(re->data()));
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000226 compilation_cache->PutRegExp(pattern, flags, data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000227
228 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000229}
230
231
232Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
233 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000234 int index,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000235 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000236 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000237 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000238 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000239 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000240 Handle<Object> result =
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000241 IrregexpExec(regexp, subject, index, last_match_info);
ulan@chromium.org812308e2012-02-29 15:58:45 +0000242 ASSERT(!result.is_null() ||
243 regexp->GetIsolate()->has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000244 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000245 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000246 default:
247 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000248 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000249 }
250}
251
252
// RegExp Atom implementation: simple string search using indexOf.
254
255
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000256void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
257 Handle<String> pattern,
258 JSRegExp::Flags flags,
259 Handle<String> match_pattern) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000260 re->GetIsolate()->factory()->SetRegExpAtomData(re,
261 JSRegExp::ATOM,
262 pattern,
263 flags,
264 match_pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000265}
266
267
268static void SetAtomLastCapture(FixedArray* array,
269 String* subject,
270 int from,
271 int to) {
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000272 SealHandleScope shs(array->GetIsolate());
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000273 RegExpImpl::SetLastCaptureCount(array, 2);
274 RegExpImpl::SetLastSubject(array, subject);
275 RegExpImpl::SetLastInput(array, subject);
276 RegExpImpl::SetCapture(array, 0, from);
277 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000278}
279
280
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000281int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
282 Handle<String> subject,
283 int index,
284 int32_t* output,
285 int output_size) {
286 Isolate* isolate = regexp->GetIsolate();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000287
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000288 ASSERT(0 <= index);
289 ASSERT(index <= subject->length());
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000290
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000291 if (!subject->IsFlat()) FlattenString(subject);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000292 DisallowHeapAllocation no_gc; // ensure vectors stay valid
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000293
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000294 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000295 int needle_len = needle->length();
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000296 ASSERT(needle->IsFlat());
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000297 ASSERT_LT(0, needle_len);
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000298
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000299 if (index + needle_len > subject->length()) {
300 return RegExpImpl::RE_FAILURE;
301 }
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000302
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000303 for (int i = 0; i < output_size; i += 2) {
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000304 String::FlatContent needle_content = needle->GetFlatContent();
305 String::FlatContent subject_content = subject->GetFlatContent();
306 ASSERT(needle_content.IsFlat());
307 ASSERT(subject_content.IsFlat());
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000308 // dispatch on type of strings
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000309 index = (needle_content.IsAscii()
310 ? (subject_content.IsAscii()
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000311 ? SearchString(isolate,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000312 subject_content.ToOneByteVector(),
313 needle_content.ToOneByteVector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000314 index)
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000315 : SearchString(isolate,
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000316 subject_content.ToUC16Vector(),
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000317 needle_content.ToOneByteVector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000318 index))
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000319 : (subject_content.IsAscii()
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000320 ? SearchString(isolate,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000321 subject_content.ToOneByteVector(),
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000322 needle_content.ToUC16Vector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000323 index)
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000324 : SearchString(isolate,
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000325 subject_content.ToUC16Vector(),
326 needle_content.ToUC16Vector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000327 index)));
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000328 if (index == -1) {
329 return i / 2; // Return number of matches.
330 } else {
331 output[i] = index;
332 output[i+1] = index + needle_len;
333 index += needle_len;
334 }
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000335 }
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000336 return output_size / 2;
337}
ager@chromium.org7c537e22008-10-16 08:43:32 +0000338
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000339
340Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
341 Handle<String> subject,
342 int index,
343 Handle<JSArray> last_match_info) {
344 Isolate* isolate = re->GetIsolate();
345
346 static const int kNumRegisters = 2;
347 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
348 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
349
350 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
351
352 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
353
354 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000355 SealHandleScope shs(isolate);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000356 FixedArray* array = FixedArray::cast(last_match_info->elements());
357 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000358 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000359}
360
361
// Irregexp implementation.
363
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000364// Ensures that the regexp object contains a compiled version of the
365// source for either ASCII or non-ASCII strings.
366// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000367// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000368// If compilation fails, an exception is thrown and this function
369// returns false.
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000370bool RegExpImpl::EnsureCompiledIrregexp(
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000371 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000372 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000373#ifdef V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000374 if (compiled_code->IsByteArray()) return true;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000375#else // V8_INTERPRETED_REGEXP (RegExp native code)
376 if (compiled_code->IsCode()) return true;
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000377#endif
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000378 // We could potentially have marked this as flushable, but have kept
379 // a saved version if we did not flush it yet.
380 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
381 if (saved_code->IsCode()) {
382 // Reinstate the code in the original place.
383 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
384 ASSERT(compiled_code->IsSmi());
385 return true;
386 }
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000387 return CompileIrregexp(re, sample_subject, is_ascii);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000388}
ager@chromium.org8bb60582008-12-11 12:02:20 +0000389
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000390
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000391static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
392 bool is_ascii,
393 Handle<String> error_message,
394 Isolate* isolate) {
395 Factory* factory = isolate->factory();
396 Handle<FixedArray> elements = factory->NewFixedArray(2);
397 elements->set(0, re->Pattern());
398 elements->set(1, *error_message);
399 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
400 Handle<Object> regexp_err =
401 factory->NewSyntaxError("malformed_regexp", array);
402 isolate->Throw(*regexp_err);
403 return false;
404}
405
406
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000407bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
408 Handle<String> sample_subject,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000409 bool is_ascii) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000410 // Compile the RegExp.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000411 Isolate* isolate = re->GetIsolate();
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000412 Zone zone(isolate);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000413 PostponeInterruptsScope postpone(isolate);
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000414 // If we had a compilation error the last time this is saved at the
415 // saved code index.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000416 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000417 // When arriving here entry can only be a smi, either representing an
418 // uncompiled regexp, a previous compilation error, or code that has
419 // been flushed.
420 ASSERT(entry->IsSmi());
421 int entry_value = Smi::cast(entry)->value();
422 ASSERT(entry_value == JSRegExp::kUninitializedValue ||
423 entry_value == JSRegExp::kCompilationErrorValue ||
424 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
425
426 if (entry_value == JSRegExp::kCompilationErrorValue) {
427 // A previous compilation failed and threw an error which we store in
428 // the saved code index (we store the error message, not the actual
429 // error). Recreate the error object and throw it.
430 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
431 ASSERT(error_string->IsString());
432 Handle<String> error_message(String::cast(error_string));
433 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000434 return false;
435 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000436
437 JSRegExp::Flags flags = re->GetFlags();
438
439 Handle<String> pattern(re->Pattern());
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000440 if (!pattern->IsFlat()) FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000441 RegExpCompileData compile_data;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000442 FlatStringReader reader(isolate, pattern);
fschneider@chromium.orge03fb642010-11-01 12:34:09 +0000443 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000444 &compile_data,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000445 &zone)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000446 // Throw an exception if we fail to parse the pattern.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000447 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000448 ThrowRegExpException(re,
449 pattern,
450 compile_data.error,
451 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000452 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000453 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000454 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000455 RegExpEngine::Compile(&compile_data,
456 flags.is_ignore_case(),
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +0000457 flags.is_global(),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000458 flags.is_multiline(),
459 pattern,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000460 sample_subject,
rossberg@chromium.org400388e2012-06-06 09:29:22 +0000461 is_ascii,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000462 &zone);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000463 if (result.error_message != NULL) {
464 // Unable to compile regexp.
karlklose@chromium.org8f806e82011-03-07 14:06:08 +0000465 Handle<String> error_message =
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000466 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
467 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000468 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000469 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000470
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000471 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
472 data->set(JSRegExp::code_index(is_ascii), result.code);
473 int register_max = IrregexpMaxRegisterCount(*data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000474 if (result.num_registers > register_max) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000475 SetIrregexpMaxRegisterCount(*data, result.num_registers);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000476 }
477
478 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000479}
480
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000481
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000482int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
483 return Smi::cast(
484 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000485}
486
487
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000488void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
489 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000490}
491
492
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000493int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
494 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000495}
496
497
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000498int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
499 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000500}
501
502
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000503ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000504 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000505}
506
507
508Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000509 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000510}
511
512
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000513void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
514 Handle<String> pattern,
515 JSRegExp::Flags flags,
516 int capture_count) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000517 // Initialize compiled code entries to null.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000518 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
519 JSRegExp::IRREGEXP,
520 pattern,
521 flags,
522 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523}
524
525
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000526int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000527 Handle<String> subject) {
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000528 if (!subject->IsFlat()) FlattenString(subject);
529
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000530 // Check the asciiness of the underlying storage.
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000531 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000532 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000533
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000534#ifdef V8_INTERPRETED_REGEXP
535 // Byte-code regexp needs space allocated for all its registers.
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000536 // The result captures are copied to the start of the registers array
537 // if the match succeeds. This way those registers are not clobbered
538 // when we set the last match info from last successful match.
539 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
540 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000541#else // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000542 // Native regexp only needs room to output captures. Registers are handled
543 // internally.
544 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000545#endif // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000546}
547
548
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000549int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
550 Handle<String> subject,
551 int index,
552 int32_t* output,
553 int output_size) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000554 Isolate* isolate = regexp->GetIsolate();
555
556 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000557
558 ASSERT(index >= 0);
559 ASSERT(index <= subject->length());
560 ASSERT(subject->IsFlat());
561
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000562 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000563
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000564#ifndef V8_INTERPRETED_REGEXP
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000565 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000566 do {
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000567 EnsureCompiledIrregexp(regexp, subject, is_ascii);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000568 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000569 // The stack is used to allocate registers for the compiled regexp code.
570 // This means that in case of failure, the output registers array is left
571 // untouched and contains the capture results from the previous successful
572 // match. We can use that to set the last match info lazily.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000573 NativeRegExpMacroAssembler::Result res =
574 NativeRegExpMacroAssembler::Match(code,
575 subject,
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000576 output,
577 output_size,
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000578 index,
579 isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000580 if (res != NativeRegExpMacroAssembler::RETRY) {
581 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000582 isolate->has_pending_exception());
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000583 STATIC_ASSERT(
584 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
585 STATIC_ASSERT(
586 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
587 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
588 == RE_EXCEPTION);
589 return static_cast<IrregexpResult>(res);
590 }
591 // If result is RETRY, the string has changed representation, and we
592 // must restart from scratch.
593 // In this case, it means we must make sure we are prepared to handle
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000594 // the, potentially, different subject (the string can switch between
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000595 // being internal and external, and even between being ASCII and UC16,
596 // but the characters are always the same).
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000597 IrregexpPrepare(regexp, subject);
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000598 is_ascii = subject->IsOneByteRepresentationUnderneath();
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000599 } while (true);
600 UNREACHABLE();
601 return RE_EXCEPTION;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000602#else // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000603
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000604 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp));
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000605 // We must have done EnsureCompiledIrregexp, so we can get the number of
606 // registers.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000607 int number_of_capture_registers =
608 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000609 int32_t* raw_output = &output[number_of_capture_registers];
610 // We do not touch the actual capture result registers until we know there
611 // has been a match so that we can use those capture results to set the
612 // last match info.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000613 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000614 raw_output[i] = -1;
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000615 }
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000616 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000617
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000618 IrregexpResult result = IrregexpInterpreter::Match(isolate,
619 byte_codes,
620 subject,
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000621 raw_output,
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000622 index);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000623 if (result == RE_SUCCESS) {
624 // Copy capture results to the start of the registers array.
mstarzinger@chromium.orge27d6172013-04-17 11:51:44 +0000625 OS::MemCopy(
626 output, raw_output, number_of_capture_registers * sizeof(int32_t));
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000627 }
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000628 if (result == RE_EXCEPTION) {
629 ASSERT(!isolate->has_pending_exception());
630 isolate->StackOverflow();
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000631 }
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000632 return result;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000633#endif // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000634}
635
636
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000637Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000638 Handle<String> subject,
ager@chromium.org41826e72009-03-30 13:30:57 +0000639 int previous_index,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000640 Handle<JSArray> last_match_info) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000641 Isolate* isolate = regexp->GetIsolate();
642 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000643
ager@chromium.org8bb60582008-12-11 12:02:20 +0000644 // Prepare space for the return values.
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000645#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
ager@chromium.org8bb60582008-12-11 12:02:20 +0000646 if (FLAG_trace_regexp_bytecodes) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000647 String* pattern = regexp->Pattern();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000648 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
649 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
650 }
651#endif
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000652 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000653 if (required_registers < 0) {
654 // Compiling failed with an exception.
ulan@chromium.org812308e2012-02-29 15:58:45 +0000655 ASSERT(isolate->has_pending_exception());
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000656 return Handle<Object>::null();
657 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000658
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000659 int32_t* output_registers = NULL;
660 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
661 output_registers = NewArray<int32_t>(required_registers);
662 }
663 SmartArrayPointer<int32_t> auto_release(output_registers);
664 if (output_registers == NULL) {
665 output_registers = isolate->jsregexp_static_offsets_vector();
666 }
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000667
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000668 int res = RegExpImpl::IrregexpExecRaw(
669 regexp, subject, previous_index, output_registers, required_registers);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000670 if (res == RE_SUCCESS) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000671 int capture_count =
672 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
673 return SetLastMatchInfo(
674 last_match_info, subject, capture_count, output_registers);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000675 }
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000676 if (res == RE_EXCEPTION) {
ulan@chromium.org812308e2012-02-29 15:58:45 +0000677 ASSERT(isolate->has_pending_exception());
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000678 return Handle<Object>::null();
679 }
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000680 ASSERT(res == RE_FAILURE);
ulan@chromium.org812308e2012-02-29 15:58:45 +0000681 return isolate->factory()->null_value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000682}
683
684
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000685Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
686 Handle<String> subject,
687 int capture_count,
688 int32_t* match) {
hpayer@chromium.org8432c912013-02-28 15:55:26 +0000689 ASSERT(last_match_info->HasFastObjectElements());
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000690 int capture_register_count = (capture_count + 1) * 2;
691 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000692 DisallowHeapAllocation no_allocation;
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000693 FixedArray* array = FixedArray::cast(last_match_info->elements());
694 if (match != NULL) {
695 for (int i = 0; i < capture_register_count; i += 2) {
696 SetCapture(array, i, match[i]);
697 SetCapture(array, i + 1, match[i + 1]);
698 }
699 }
700 SetLastCaptureCount(array, capture_register_count);
701 SetLastSubject(array, *subject);
702 SetLastInput(array, *subject);
703 return last_match_info;
704}
705
706
707RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
708 Handle<String> subject,
709 bool is_global,
danno@chromium.org412fa512012-09-14 13:28:26 +0000710 Isolate* isolate)
711 : register_array_(NULL),
712 register_array_size_(0),
713 regexp_(regexp),
714 subject_(subject) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000715#ifdef V8_INTERPRETED_REGEXP
716 bool interpreted = true;
717#else
718 bool interpreted = false;
719#endif // V8_INTERPRETED_REGEXP
720
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000721 if (regexp_->TypeTag() == JSRegExp::ATOM) {
722 static const int kAtomRegistersPerMatch = 2;
723 registers_per_match_ = kAtomRegistersPerMatch;
724 // There is no distinction between interpreted and native for atom regexps.
725 interpreted = false;
726 } else {
727 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
728 if (registers_per_match_ < 0) {
729 num_matches_ = -1; // Signal exception.
730 return;
731 }
732 }
733
734 if (is_global && !interpreted) {
735 register_array_size_ =
736 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
737 max_matches_ = register_array_size_ / registers_per_match_;
738 } else {
739 // Global loop in interpreted regexp is not implemented. We choose
740 // the size of the offsets vector so that it can only store one match.
741 register_array_size_ = registers_per_match_;
742 max_matches_ = 1;
743 }
744
745 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
746 register_array_ = NewArray<int32_t>(register_array_size_);
747 } else {
748 register_array_ = isolate->jsregexp_static_offsets_vector();
749 }
750
751 // Set state so that fetching the results the first time triggers a call
752 // to the compiled regexp.
753 current_match_index_ = max_matches_ - 1;
754 num_matches_ = max_matches_;
755 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture.
756 ASSERT_GE(register_array_size_, registers_per_match_);
757 int32_t* last_match =
758 &register_array_[current_match_index_ * registers_per_match_];
759 last_match[0] = -1;
760 last_match[1] = 0;
761}
762
763
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000764// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000765// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000766//
767// The Irregexp regular expression engine is intended to be a complete
768// implementation of ECMAScript regular expressions. It generates either
769// bytecodes or native code.
770
771// The Irregexp regexp engine is structured in three steps.
772// 1) The parser generates an abstract syntax tree. See ast.cc.
773// 2) From the AST a node network is created. The nodes are all
774// subclasses of RegExpNode. The nodes represent states when
775// executing a regular expression. Several optimizations are
776// performed on the node network.
777// 3) From the nodes we generate either byte codes or native code
778// that can actually execute the regular expression (perform
779// the search). The code generation step is described in more
780// detail below.
781
782// Code generation.
783//
784// The nodes are divided into four main categories.
785// * Choice nodes
786// These represent places where the regular expression can
787// match in more than one way. For example on entry to an
788// alternation (foo|bar) or a repetition (*, +, ? or {}).
789// * Action nodes
790// These represent places where some action should be
791// performed. Examples include recording the current position
792// in the input string to a register (in order to implement
793// captures) or other actions on register for example in order
794// to implement the counters needed for {} repetitions.
795// * Matching nodes
796// These attempt to match some element part of the input string.
797// Examples of elements include character classes, plain strings
798// or back references.
799// * End nodes
800// These are used to implement the actions required on finding
801// a successful match or failing to find a match.
802//
803// The code generated (whether as byte codes or native code) maintains
804// some state as it runs. This consists of the following elements:
805//
806// * The capture registers. Used for string captures.
807// * Other registers. Used for counters etc.
808// * The current position.
809// * The stack of backtracking information. Used when a matching node
810// fails to find a match and needs to try an alternative.
811//
812// Conceptual regular expression execution model:
813//
814// There is a simple conceptual model of regular expression execution
815// which will be presented first. The actual code generated is a more
816// efficient simulation of the simple conceptual model:
817//
818// * Choice nodes are implemented as follows:
819// For each choice except the last {
820// push current position
821// push backtrack code location
822// <generate code to test for choice>
823// backtrack code location:
824// pop current position
825// }
826// <generate code to test for last choice>
827//
// * Action nodes are generated as follows
829// <push affected registers on backtrack stack>
830// <generate code to perform action>
831// push backtrack code location
832// <generate code to test for following nodes>
833// backtrack code location:
834// <pop affected registers to restore their state>
835// <pop backtrack location from stack and go to it>
836//
837// * Matching nodes are generated as follows:
838// if input string matches at current position
839// update current position
840// <generate code to test for following nodes>
841// else
842// <pop backtrack location from stack and go to it>
843//
// Thus it can be seen that the current position is saved and restored
// by the choice nodes, whereas the registers are saved and restored
// by the action nodes that manipulate them.
847//
848// The other interesting aspect of this model is that nodes are generated
849// at the point where they are needed by a recursive call to Emit(). If
850// the node has already been code generated then the Emit() call will
851// generate a jump to the previously generated code instead. In order to
852// limit recursion it is possible for the Emit() function to put the node
853// on a work list for later generation and instead generate a jump. The
854// destination of the jump is resolved later when the code is generated.
855//
856// Actual regular expression code generation.
857//
858// Code generation is actually more complicated than the above. In order
859// to improve the efficiency of the generated code some optimizations are
860// performed
861//
862// * Choice nodes have 1-character lookahead.
863// A choice node looks at the following character and eliminates some of
864// the choices immediately based on that character. This is not yet
865// implemented.
866// * Simple greedy loops store reduced backtracking information.
867// A quantifier like /.*foo/m will greedily match the whole input. It will
868// then need to backtrack to a point where it can match "foo". The naive
869// implementation of this would push each character position onto the
870// backtracking stack, then pop them off one by one. This would use space
871// proportional to the length of the input string. However since the "."
872// can only match in one way and always has a constant length (in this case
873// of 1) it suffices to store the current position on the top of the stack
874// once. Matching now becomes merely incrementing the current position and
875// backtracking becomes decrementing the current position and checking the
876// result against the stored current position. This is faster and saves
877// space.
878// * The current state is virtualized.
879// This is used to defer expensive operations until it is clear that they
880// are needed and to generate code for a node more than once, allowing
// specialized and efficient versions of the code to be created. This is
882// explained in the section below.
883//
884// Execution state virtualization.
885//
886// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000887// manipulation in an object called the Trace. The Trace object can record a
888// current position offset, an optional backtrack code location on the top of
889// the virtualized backtrack stack and some register changes. When a node is
890// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000891// will emit code to bring the actual state into line with the virtual state.
ulan@chromium.org2efb9002012-01-19 15:36:35 +0000892// Avoiding flushing the state can postpone some work (e.g. updates of capture
ager@chromium.org8bb60582008-12-11 12:02:20 +0000893// registers). Postponing work can save time when executing the regular
894// expression since it may be found that the work never has to be done as a
895// failure to match can occur. In addition it is much faster to jump to a
896// known backtrack code location than it is to pop an unknown backtrack
897// location from the stack and jump there.
898//
ager@chromium.org32912102009-01-16 10:38:43 +0000899// The virtual state found in the Trace affects code generation. For example
900// the virtual state contains the difference between the actual current
901// position and the virtual current position, and matching code needs to use
902// this offset to attempt a match in the correct location of the input
903// string. Therefore code generated for a non-trivial trace is specialized
904// to that trace. The code generator therefore has the ability to generate
905// code for each node several times. In order to limit the size of the
906// generated code there is an arbitrary limit on how many specialized sets of
907// code may be generated for a given node. If the limit is reached, the
908// trace is flushed and a generic version of the code for a node is emitted.
909// This is subsequently used for that node. The code emitted for non-generic
910// trace is not recorded in the node and so it cannot currently be reused in
911// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000912
913
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000914void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000915 UNREACHABLE();
916}
917
918
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000919void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
920 text->AddElement(TextElement::Atom(this), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000921}
922
923
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000924void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
925 text->AddElement(TextElement::CharClass(this), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000926}
927
928
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000929void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000930 for (int i = 0; i < elements()->length(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000931 text->AddElement(elements()->at(i), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000932}
933
934
935TextElement TextElement::Atom(RegExpAtom* atom) {
936 TextElement result = TextElement(ATOM);
937 result.data.u_atom = atom;
938 return result;
939}
940
941
942TextElement TextElement::CharClass(
943 RegExpCharacterClass* char_class) {
944 TextElement result = TextElement(CHAR_CLASS);
945 result.data.u_char_class = char_class;
946 return result;
947}
948
949
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000950int TextElement::length() {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +0000951 if (text_type == ATOM) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000952 return data.u_atom->length();
953 } else {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +0000954 ASSERT(text_type == CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000955 return 1;
956 }
957}
958
959
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000960DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
961 if (table_ == NULL) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000962 table_ = new(zone()) DispatchTable(zone());
963 DispatchTableConstructor cons(table_, ignore_case, zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000964 cons.BuildTable(this);
965 }
966 return table_;
967}
968
969
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000970class FrequencyCollator {
971 public:
972 FrequencyCollator() : total_samples_(0) {
973 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
974 frequencies_[i] = CharacterFrequency(i);
975 }
976 }
977
978 void CountCharacter(int character) {
979 int index = (character & RegExpMacroAssembler::kTableMask);
980 frequencies_[index].Increment();
981 total_samples_++;
982 }
983
984 // Does not measure in percent, but rather per-128 (the table size from the
985 // regexp macro assembler).
986 int Frequency(int in_character) {
987 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
988 if (total_samples_ < 1) return 1; // Division by zero.
989 int freq_in_per128 =
990 (frequencies_[in_character].counter() * 128) / total_samples_;
991 return freq_in_per128;
992 }
993
994 private:
995 class CharacterFrequency {
996 public:
997 CharacterFrequency() : counter_(0), character_(-1) { }
998 explicit CharacterFrequency(int character)
999 : counter_(0), character_(character) { }
1000
1001 void Increment() { counter_++; }
1002 int counter() { return counter_; }
1003 int character() { return character_; }
1004
1005 private:
1006 int counter_;
1007 int character_;
1008 };
1009
1010
1011 private:
1012 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1013 int total_samples_;
1014};
1015
1016
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001017class RegExpCompiler {
1018 public:
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001019 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
1020 Zone* zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001021
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001022 int AllocateRegister() {
1023 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1024 reg_exp_too_big_ = true;
1025 return next_register_;
1026 }
1027 return next_register_++;
1028 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001029
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001030 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1031 RegExpNode* start,
1032 int capture_count,
1033 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001034
1035 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1036
1037 static const int kImplementationOffset = 0;
1038 static const int kNumberOfRegistersOffset = 0;
1039 static const int kCodeOffset = 1;
1040
1041 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1042 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001043
1044 static const int kMaxRecursion = 100;
1045 inline int recursion_depth() { return recursion_depth_; }
1046 inline void IncrementRecursionDepth() { recursion_depth_++; }
1047 inline void DecrementRecursionDepth() { recursion_depth_--; }
1048
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001049 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1050
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001051 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001052 inline bool ascii() { return ascii_; }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001053 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001054
whesse@chromium.org7b260152011-06-20 15:33:18 +00001055 int current_expansion_factor() { return current_expansion_factor_; }
1056 void set_current_expansion_factor(int value) {
1057 current_expansion_factor_ = value;
1058 }
1059
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001060 Zone* zone() const { return zone_; }
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001061
ager@chromium.org32912102009-01-16 10:38:43 +00001062 static const int kNoRegister = -1;
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00001063
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001064 private:
1065 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001066 int next_register_;
1067 List<RegExpNode*>* work_list_;
1068 int recursion_depth_;
1069 RegExpMacroAssembler* macro_assembler_;
1070 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001071 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001072 bool reg_exp_too_big_;
whesse@chromium.org7b260152011-06-20 15:33:18 +00001073 int current_expansion_factor_;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001074 FrequencyCollator frequency_collator_;
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001075 Zone* zone_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001076};
1077
1078
1079class RecursionCheck {
1080 public:
1081 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1082 compiler->IncrementRecursionDepth();
1083 }
1084 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1085 private:
1086 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001087};
1088
1089
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001090static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
1091 return RegExpEngine::CompilationResult("RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001092}
1093
1094
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001095// Attempts to compile the regexp using an Irregexp code generator. Returns
1096// a fixed array or a null handle depending on whether it succeeded.
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001097RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
1098 Zone* zone)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001099 : next_register_(2 * (capture_count + 1)),
1100 work_list_(NULL),
1101 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001102 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001103 ascii_(ascii),
whesse@chromium.org7b260152011-06-20 15:33:18 +00001104 reg_exp_too_big_(false),
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001105 current_expansion_factor_(1),
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001106 frequency_collator_(),
1107 zone_(zone) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001108 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001109 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001110}
1111
1112
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001113RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001114 RegExpMacroAssembler* macro_assembler,
1115 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001116 int capture_count,
1117 Handle<String> pattern) {
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001118 Heap* heap = pattern->GetHeap();
1119
1120 bool use_slow_safe_regexp_compiler = false;
1121 if (heap->total_regexp_code_generated() >
1122 RegExpImpl::kRegWxpCompiledLimit &&
1123 heap->isolate()->memory_allocator()->SizeExecutable() >
1124 RegExpImpl::kRegExpExecutableMemoryLimit) {
1125 use_slow_safe_regexp_compiler = true;
1126 }
1127
1128 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1129
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001130#ifdef DEBUG
1131 if (FLAG_trace_regexp_assembler)
1132 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1133 else
1134#endif
1135 macro_assembler_ = macro_assembler;
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001136
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001137 List <RegExpNode*> work_list(0);
1138 work_list_ = &work_list;
1139 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001140 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001141 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001142 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001143 macro_assembler_->Bind(&fail);
1144 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001145 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001146 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001147 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001148 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
1149
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001150 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1151 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001152 work_list_ = NULL;
1153#ifdef DEBUG
danno@chromium.org4d3fe4e2011-03-10 10:14:28 +00001154 if (FLAG_print_code) {
1155 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
1156 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001157 if (FLAG_trace_regexp_assembler) {
1158 delete macro_assembler_;
1159 }
1160#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001161 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001162}
1163
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001164
ager@chromium.org32912102009-01-16 10:38:43 +00001165bool Trace::DeferredAction::Mentions(int that) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001166 if (action_type() == ActionNode::CLEAR_CAPTURES) {
ager@chromium.org32912102009-01-16 10:38:43 +00001167 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1168 return range.Contains(that);
1169 } else {
1170 return reg() == that;
1171 }
1172}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001173
ager@chromium.org32912102009-01-16 10:38:43 +00001174
1175bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001176 for (DeferredAction* action = actions_;
1177 action != NULL;
1178 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001179 if (action->Mentions(reg))
1180 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001181 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001182 return false;
1183}
1184
1185
ager@chromium.org32912102009-01-16 10:38:43 +00001186bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1187 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001188 for (DeferredAction* action = actions_;
1189 action != NULL;
1190 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001191 if (action->Mentions(reg)) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001192 if (action->action_type() == ActionNode::STORE_POSITION) {
ager@chromium.org32912102009-01-16 10:38:43 +00001193 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1194 return true;
1195 } else {
1196 return false;
1197 }
1198 }
1199 }
1200 return false;
1201}
1202
1203
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001204int Trace::FindAffectedRegisters(OutSet* affected_registers,
1205 Zone* zone) {
ager@chromium.org32912102009-01-16 10:38:43 +00001206 int max_register = RegExpCompiler::kNoRegister;
1207 for (DeferredAction* action = actions_;
1208 action != NULL;
1209 action = action->next()) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001210 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
ager@chromium.org32912102009-01-16 10:38:43 +00001211 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1212 for (int i = range.from(); i <= range.to(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001213 affected_registers->Set(i, zone);
ager@chromium.org32912102009-01-16 10:38:43 +00001214 if (range.to() > max_register) max_register = range.to();
1215 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001216 affected_registers->Set(action->reg(), zone);
ager@chromium.org32912102009-01-16 10:38:43 +00001217 if (action->reg() > max_register) max_register = action->reg();
1218 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001219 }
1220 return max_register;
1221}
1222
1223
ager@chromium.org32912102009-01-16 10:38:43 +00001224void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1225 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001226 OutSet& registers_to_pop,
1227 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001228 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001229 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1230 else if (registers_to_clear.Get(reg)) {
1231 int clear_to = reg;
1232 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1233 reg--;
1234 }
1235 assembler->ClearRegisters(reg, clear_to);
1236 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001237 }
1238}
1239
1240
ager@chromium.org32912102009-01-16 10:38:43 +00001241void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1242 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001243 OutSet& affected_registers,
1244 OutSet* registers_to_pop,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001245 OutSet* registers_to_clear,
1246 Zone* zone) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001247 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1248 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1249
ager@chromium.org5aa501c2009-06-23 07:57:28 +00001250 // Count pushes performed to force a stack limit check occasionally.
1251 int pushes = 0;
1252
ager@chromium.org8bb60582008-12-11 12:02:20 +00001253 for (int reg = 0; reg <= max_register; reg++) {
1254 if (!affected_registers.Get(reg)) {
1255 continue;
1256 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001257
1258 // The chronologically first deferred action in the trace
1259 // is used to infer the action needed to restore a register
1260 // to its previous state (or not, if it's safe to ignore it).
1261 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1262 DeferredActionUndoType undo_action = IGNORE;
1263
ager@chromium.org8bb60582008-12-11 12:02:20 +00001264 int value = 0;
1265 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001266 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001267 int store_position = -1;
1268 // This is a little tricky because we are scanning the actions in reverse
1269 // historical order (newest first).
1270 for (DeferredAction* action = actions_;
1271 action != NULL;
1272 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001273 if (action->Mentions(reg)) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001274 switch (action->action_type()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001275 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001276 Trace::DeferredSetRegister* psr =
1277 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001278 if (!absolute) {
1279 value += psr->value();
1280 absolute = true;
1281 }
1282 // SET_REGISTER is currently only used for newly introduced loop
1283 // counters. They can have a significant previous value if they
1284 // occour in a loop. TODO(lrn): Propagate this information, so
1285 // we can set undo_action to IGNORE if we know there is no value to
1286 // restore.
1287 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001288 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001289 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001290 break;
1291 }
1292 case ActionNode::INCREMENT_REGISTER:
1293 if (!absolute) {
1294 value++;
1295 }
1296 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001297 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001298 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001299 break;
1300 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001301 Trace::DeferredCapture* pc =
1302 static_cast<Trace::DeferredCapture*>(action);
1303 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001304 store_position = pc->cp_offset();
1305 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001306
1307 // For captures we know that stores and clears alternate.
1308 // Other register, are never cleared, and if the occur
1309 // inside a loop, they might be assigned more than once.
1310 if (reg <= 1) {
1311 // Registers zero and one, aka "capture zero", is
1312 // always set correctly if we succeed. There is no
1313 // need to undo a setting on backtrack, because we
1314 // will set it again or fail.
1315 undo_action = IGNORE;
1316 } else {
1317 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1318 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001319 ASSERT(!absolute);
1320 ASSERT_EQ(value, 0);
1321 break;
1322 }
ager@chromium.org32912102009-01-16 10:38:43 +00001323 case ActionNode::CLEAR_CAPTURES: {
1324 // Since we're scanning in reverse order, if we've already
1325 // set the position we have to ignore historically earlier
1326 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001327 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001328 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001329 }
1330 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001331 ASSERT(!absolute);
1332 ASSERT_EQ(value, 0);
1333 break;
1334 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001335 default:
1336 UNREACHABLE();
1337 break;
1338 }
1339 }
1340 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001341 // Prepare for the undo-action (e.g., push if it's going to be popped).
1342 if (undo_action == RESTORE) {
1343 pushes++;
1344 RegExpMacroAssembler::StackCheckFlag stack_check =
1345 RegExpMacroAssembler::kNoStackLimitCheck;
1346 if (pushes == push_limit) {
1347 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1348 pushes = 0;
1349 }
1350
1351 assembler->PushRegister(reg, stack_check);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001352 registers_to_pop->Set(reg, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001353 } else if (undo_action == CLEAR) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001354 registers_to_clear->Set(reg, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001355 }
1356 // Perform the chronologically last action (or accumulated increment)
1357 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001358 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001359 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001360 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001361 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001362 } else if (absolute) {
1363 assembler->SetRegister(reg, value);
1364 } else if (value != 0) {
1365 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001366 }
1367 }
1368}
1369
1370
ager@chromium.org8bb60582008-12-11 12:02:20 +00001371// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001372// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001373// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001374void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001375 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001376
iposva@chromium.org245aa852009-02-10 00:49:54 +00001377 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001378
1379 if (actions_ == NULL && backtrack() == NULL) {
1380 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001381 // a normal situation. We may also have to forget some information gained
1382 // through a quick check that was already performed.
1383 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001384 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001385 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001386 successor->Emit(compiler, &new_state);
1387 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001388 }
1389
1390 // Generate deferred actions here along with code to undo them again.
1391 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001392
ager@chromium.org381abbb2009-02-25 13:23:22 +00001393 if (backtrack() != NULL) {
1394 // Here we have a concrete backtrack location. These are set up by choice
1395 // nodes and so they indicate that we have a deferred save of the current
1396 // position which we may need to emit here.
1397 assembler->PushCurrentPosition();
1398 }
1399
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001400 int max_register = FindAffectedRegisters(&affected_registers,
1401 compiler->zone());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001402 OutSet registers_to_pop;
1403 OutSet registers_to_clear;
1404 PerformDeferredActions(assembler,
1405 max_register,
1406 affected_registers,
1407 &registers_to_pop,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001408 &registers_to_clear,
1409 compiler->zone());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001410 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001411 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001412 }
1413
1414 // Create a new trivial state and generate the node with that.
1415 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001416 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001417 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001418 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001419
1420 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001421 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001422 RestoreAffectedRegisters(assembler,
1423 max_register,
1424 registers_to_pop,
1425 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001426 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001427 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001428 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001429 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001430 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001431 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001432}
1433
1434
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001435void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001436 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001437
1438 // Omit flushing the trace. We discard the entire stack frame anyway.
1439
ager@chromium.org8bb60582008-12-11 12:02:20 +00001440 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001441 // We are completely independent of the trace, since we ignore it,
1442 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001443 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001444 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001445
1446 // Throw away everything on the backtrack stack since the start
1447 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001448 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1449 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001450 if (clear_capture_count_ > 0) {
1451 // Clear any captures that might have been performed during the success
1452 // of the body of the negative look-ahead.
1453 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1454 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1455 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001456 // Now that we have unwound the stack we find at the top of the stack the
1457 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001458 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001459}
1460
1461
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001462void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001463 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001464 trace->Flush(compiler, this);
1465 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001466 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001467 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001468 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001469 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001470 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001471 switch (action_) {
1472 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001473 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001474 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001475 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001476 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001477 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001478 case NEGATIVE_SUBMATCH_SUCCESS:
1479 // This case is handled in a different virtual method.
1480 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001481 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001482 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001483}
1484
1485
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001486void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001487 if (guards_ == NULL)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001488 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1489 guards_->Add(guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001490}
1491
1492
ager@chromium.org8bb60582008-12-11 12:02:20 +00001493ActionNode* ActionNode::SetRegister(int reg,
1494 int val,
1495 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001496 ActionNode* result =
1497 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001498 result->data_.u_store_register.reg = reg;
1499 result->data_.u_store_register.value = val;
1500 return result;
1501}
1502
1503
1504ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001505 ActionNode* result =
1506 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001507 result->data_.u_increment_register.reg = reg;
1508 return result;
1509}
1510
1511
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001512ActionNode* ActionNode::StorePosition(int reg,
1513 bool is_capture,
1514 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001515 ActionNode* result =
1516 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001517 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001518 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001519 return result;
1520}
1521
1522
ager@chromium.org32912102009-01-16 10:38:43 +00001523ActionNode* ActionNode::ClearCaptures(Interval range,
1524 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001525 ActionNode* result =
1526 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
ager@chromium.org32912102009-01-16 10:38:43 +00001527 result->data_.u_clear_captures.range_from = range.from();
1528 result->data_.u_clear_captures.range_to = range.to();
1529 return result;
1530}
1531
1532
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001533ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1534 int position_reg,
1535 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001536 ActionNode* result =
1537 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001538 result->data_.u_submatch.stack_pointer_register = stack_reg;
1539 result->data_.u_submatch.current_position_register = position_reg;
1540 return result;
1541}
1542
1543
ager@chromium.org8bb60582008-12-11 12:02:20 +00001544ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1545 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001546 int clear_register_count,
1547 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001548 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001549 ActionNode* result =
1550 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001551 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001552 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001553 result->data_.u_submatch.clear_register_count = clear_register_count;
1554 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001555 return result;
1556}
1557
1558
ager@chromium.org32912102009-01-16 10:38:43 +00001559ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1560 int repetition_register,
1561 int repetition_limit,
1562 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001563 ActionNode* result =
1564 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
ager@chromium.org32912102009-01-16 10:38:43 +00001565 result->data_.u_empty_match_check.start_register = start_register;
1566 result->data_.u_empty_match_check.repetition_register = repetition_register;
1567 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1568 return result;
1569}
1570
1571
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001572#define DEFINE_ACCEPT(Type) \
1573 void Type##Node::Accept(NodeVisitor* visitor) { \
1574 visitor->Visit##Type(this); \
1575 }
1576FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1577#undef DEFINE_ACCEPT
1578
1579
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001580void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1581 visitor->VisitLoopChoice(this);
1582}
1583
1584
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001585// -------------------------------------------------------------------
1586// Emit code.
1587
1588
1589void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1590 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001591 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001592 switch (guard->op()) {
1593 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001594 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001595 macro_assembler->IfRegisterGE(guard->reg(),
1596 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001597 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001598 break;
1599 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001600 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001601 macro_assembler->IfRegisterLT(guard->reg(),
1602 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001603 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001604 break;
1605 }
1606}
1607
1608
ager@chromium.org381abbb2009-02-25 13:23:22 +00001609// Returns the number of characters in the equivalence class, omitting those
1610// that cannot occur in the source string because it is ASCII.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001611static int GetCaseIndependentLetters(Isolate* isolate,
1612 uc16 character,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001613 bool ascii_subject,
1614 unibrow::uchar* letters) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001615 int length =
1616 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00001617 // Unibrow returns 0 or 1 for characters where case independence is
ager@chromium.org381abbb2009-02-25 13:23:22 +00001618 // trivial.
1619 if (length == 0) {
1620 letters[0] = character;
1621 length = 1;
1622 }
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001623 if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001624 return length;
1625 }
1626 // The standard requires that non-ASCII characters cannot have ASCII
1627 // character codes in their equivalence class.
1628 return 0;
1629}
1630
1631
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001632static inline bool EmitSimpleCharacter(Isolate* isolate,
1633 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001634 uc16 c,
1635 Label* on_failure,
1636 int cp_offset,
1637 bool check,
1638 bool preloaded) {
1639 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1640 bool bound_checked = false;
1641 if (!preloaded) {
1642 assembler->LoadCurrentCharacter(
1643 cp_offset,
1644 on_failure,
1645 check);
1646 bound_checked = true;
1647 }
1648 assembler->CheckNotCharacter(c, on_failure);
1649 return bound_checked;
1650}
1651
1652
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001653// Only emits non-letters (things that don't have case). Only used for case
1654// independent matches.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001655static inline bool EmitAtomNonLetter(Isolate* isolate,
1656 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001657 uc16 c,
1658 Label* on_failure,
1659 int cp_offset,
1660 bool check,
1661 bool preloaded) {
1662 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1663 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001665 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
ager@chromium.org381abbb2009-02-25 13:23:22 +00001666 if (length < 1) {
1667 // This can't match. Must be an ASCII subject and a non-ASCII character.
1668 // We do not need to do anything since the ASCII pass already handled this.
1669 return false; // Bounds not checked.
1670 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001671 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001672 // We handle the length > 1 case in a later pass.
1673 if (length == 1) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001674 if (ascii && c > String::kMaxOneByteCharCodeU) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001675 // Can't match - see above.
1676 return false; // Bounds not checked.
1677 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001678 if (!preloaded) {
1679 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1680 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001681 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001682 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001683 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001684 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001685}
1686
1687
1688static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001689 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001690 uc16 c1,
1691 uc16 c2,
1692 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001693 uc16 char_mask;
1694 if (ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001695 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001696 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00001697 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001698 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001699 uc16 exor = c1 ^ c2;
1700 // Check whether exor has only one bit set.
1701 if (((exor - 1) & exor) == 0) {
1702 // If c1 and c2 differ only by one bit.
1703 // Ecma262UnCanonicalize always gives the highest number last.
1704 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001705 uc16 mask = char_mask ^ exor;
1706 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001707 return true;
1708 }
1709 ASSERT(c2 > c1);
1710 uc16 diff = c2 - c1;
1711 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1712 // If the characters differ by 2^n but don't differ by one bit then
1713 // subtract the difference from the found character, then do the or
1714 // trick. We avoid the theoretical case where negative numbers are
1715 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001716 uc16 mask = char_mask ^ diff;
1717 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1718 diff,
1719 mask,
1720 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001721 return true;
1722 }
1723 return false;
1724}
1725
1726
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001727typedef bool EmitCharacterFunction(Isolate* isolate,
1728 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001729 uc16 c,
1730 Label* on_failure,
1731 int cp_offset,
1732 bool check,
1733 bool preloaded);
1734
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001735// Only emits letters (things that have case). Only used for case independent
1736// matches.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001737static inline bool EmitAtomLetter(Isolate* isolate,
1738 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001739 uc16 c,
1740 Label* on_failure,
1741 int cp_offset,
1742 bool check,
1743 bool preloaded) {
1744 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1745 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001746 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001747 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001748 if (length <= 1) return false;
1749 // We may not need to check against the end of the input string
1750 // if this character lies before a character that matched.
1751 if (!preloaded) {
1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001753 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001754 Label ok;
1755 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1756 switch (length) {
1757 case 2: {
1758 if (ShortCutEmitCharacterPair(macro_assembler,
1759 ascii,
1760 chars[0],
1761 chars[1],
1762 on_failure)) {
1763 } else {
1764 macro_assembler->CheckCharacter(chars[0], &ok);
1765 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1766 macro_assembler->Bind(&ok);
1767 }
1768 break;
1769 }
1770 case 4:
1771 macro_assembler->CheckCharacter(chars[3], &ok);
1772 // Fall through!
1773 case 3:
1774 macro_assembler->CheckCharacter(chars[0], &ok);
1775 macro_assembler->CheckCharacter(chars[1], &ok);
1776 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1777 macro_assembler->Bind(&ok);
1778 break;
1779 default:
1780 UNREACHABLE();
1781 break;
1782 }
1783 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001784}
1785
1786
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001787static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1788 int border,
1789 Label* fall_through,
1790 Label* above_or_equal,
1791 Label* below) {
1792 if (below != fall_through) {
1793 masm->CheckCharacterLT(border, below);
1794 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1795 } else {
1796 masm->CheckCharacterGT(border - 1, above_or_equal);
1797 }
1798}
1799
1800
1801static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1802 int first,
1803 int last,
1804 Label* fall_through,
1805 Label* in_range,
1806 Label* out_of_range) {
1807 if (in_range == fall_through) {
1808 if (first == last) {
1809 masm->CheckNotCharacter(first, out_of_range);
1810 } else {
1811 masm->CheckCharacterNotInRange(first, last, out_of_range);
1812 }
1813 } else {
1814 if (first == last) {
1815 masm->CheckCharacter(first, in_range);
1816 } else {
1817 masm->CheckCharacterInRange(first, last, in_range);
1818 }
1819 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1820 }
1821}
1822
1823
1824// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1825// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1826static void EmitUseLookupTable(
1827 RegExpMacroAssembler* masm,
1828 ZoneList<int>* ranges,
1829 int start_index,
1830 int end_index,
1831 int min_char,
1832 Label* fall_through,
1833 Label* even_label,
1834 Label* odd_label) {
1835 static const int kSize = RegExpMacroAssembler::kTableSize;
1836 static const int kMask = RegExpMacroAssembler::kTableMask;
1837
1838 int base = (min_char & ~kMask);
1839 USE(base);
1840
1841 // Assert that everything is on one kTableSize page.
1842 for (int i = start_index; i <= end_index; i++) {
1843 ASSERT_EQ(ranges->at(i) & ~kMask, base);
1844 }
1845 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1846
1847 char templ[kSize];
1848 Label* on_bit_set;
1849 Label* on_bit_clear;
1850 int bit;
1851 if (even_label == fall_through) {
1852 on_bit_set = odd_label;
1853 on_bit_clear = even_label;
1854 bit = 1;
1855 } else {
1856 on_bit_set = even_label;
1857 on_bit_clear = odd_label;
1858 bit = 0;
1859 }
1860 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1861 templ[i] = bit;
1862 }
1863 int j = 0;
1864 bit ^= 1;
1865 for (int i = start_index; i < end_index; i++) {
1866 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1867 templ[j] = bit;
1868 }
1869 bit ^= 1;
1870 }
1871 for (int i = j; i < kSize; i++) {
1872 templ[i] = bit;
1873 }
verwaest@chromium.orgd4be0f02013-06-05 13:39:03 +00001874 Factory* factory = Isolate::Current()->factory();
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001875 // TODO(erikcorry): Cache these.
verwaest@chromium.orgd4be0f02013-06-05 13:39:03 +00001876 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001877 for (int i = 0; i < kSize; i++) {
1878 ba->set(i, templ[i]);
1879 }
1880 masm->CheckBitInTable(ba, on_bit_set);
1881 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1882}
1883
1884
1885static void CutOutRange(RegExpMacroAssembler* masm,
1886 ZoneList<int>* ranges,
1887 int start_index,
1888 int end_index,
1889 int cut_index,
1890 Label* even_label,
1891 Label* odd_label) {
1892 bool odd = (((cut_index - start_index) & 1) == 1);
1893 Label* in_range_label = odd ? odd_label : even_label;
1894 Label dummy;
1895 EmitDoubleBoundaryTest(masm,
1896 ranges->at(cut_index),
1897 ranges->at(cut_index + 1) - 1,
1898 &dummy,
1899 in_range_label,
1900 &dummy);
1901 ASSERT(!dummy.is_linked());
1902 // Cut out the single range by rewriting the array. This creates a new
1903 // range that is a merger of the two ranges on either side of the one we
1904 // are cutting out. The oddity of the labels is preserved.
1905 for (int j = cut_index; j > start_index; j--) {
1906 ranges->at(j) = ranges->at(j - 1);
1907 }
1908 for (int j = cut_index + 1; j < end_index; j++) {
1909 ranges->at(j) = ranges->at(j + 1);
1910 }
1911}
1912
1913
1914// Unicode case. Split the search space into kSize spaces that are handled
1915// with recursion.
1916static void SplitSearchSpace(ZoneList<int>* ranges,
1917 int start_index,
1918 int end_index,
1919 int* new_start_index,
1920 int* new_end_index,
1921 int* border) {
1922 static const int kSize = RegExpMacroAssembler::kTableSize;
1923 static const int kMask = RegExpMacroAssembler::kTableMask;
1924
1925 int first = ranges->at(start_index);
1926 int last = ranges->at(end_index) - 1;
1927
1928 *new_start_index = start_index;
1929 *border = (ranges->at(start_index) & ~kMask) + kSize;
1930 while (*new_start_index < end_index) {
1931 if (ranges->at(*new_start_index) > *border) break;
1932 (*new_start_index)++;
1933 }
1934 // new_start_index is the index of the first edge that is beyond the
1935 // current kSize space.
1936
1937 // For very large search spaces we do a binary chop search of the non-ASCII
1938 // space instead of just going to the end of the current kSize space. The
1939 // heuristics are complicated a little by the fact that any 128-character
1940 // encoding space can be quickly tested with a table lookup, so we don't
1941 // wish to do binary chop search at a smaller granularity than that. A
1942 // 128-character space can take up a lot of space in the ranges array if,
1943 // for example, we only want to match every second character (eg. the lower
1944 // case characters on some Unicode pages).
1945 int binary_chop_index = (end_index + start_index) / 2;
1946 // The first test ensures that we get to the code that handles the ASCII
1947 // range with a single not-taken branch, speeding up this important
1948 // character range (even non-ASCII charset-based text has spaces and
1949 // punctuation).
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001950 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001951 end_index - start_index > (*new_start_index - start_index) * 2 &&
1952 last - first > kSize * 2 &&
1953 binary_chop_index > *new_start_index &&
1954 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1955 int scan_forward_for_section_border = binary_chop_index;;
1956 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1957
1958 while (scan_forward_for_section_border < end_index) {
1959 if (ranges->at(scan_forward_for_section_border) > new_border) {
1960 *new_start_index = scan_forward_for_section_border;
1961 *border = new_border;
1962 break;
1963 }
1964 scan_forward_for_section_border++;
1965 }
1966 }
1967
1968 ASSERT(*new_start_index > start_index);
1969 *new_end_index = *new_start_index - 1;
1970 if (ranges->at(*new_end_index) == *border) {
1971 (*new_end_index)--;
1972 }
1973 if (*border >= ranges->at(end_index)) {
1974 *border = ranges->at(end_index);
1975 *new_start_index = end_index; // Won't be used.
1976 *new_end_index = end_index - 1;
1977 }
1978}
1979
1980
1981// Gets a series of segment boundaries representing a character class. If the
1982// character is in the range between an even and an odd boundary (counting from
1983// start_index) then go to even_label, otherwise go to odd_label. We already
1984// know that the character is in the range of min_char to max_char inclusive.
1985// Either label can be NULL indicating backtracking. Either label can also be
1986// equal to the fall_through label.
1987static void GenerateBranches(RegExpMacroAssembler* masm,
1988 ZoneList<int>* ranges,
1989 int start_index,
1990 int end_index,
1991 uc16 min_char,
1992 uc16 max_char,
1993 Label* fall_through,
1994 Label* even_label,
1995 Label* odd_label) {
1996 int first = ranges->at(start_index);
1997 int last = ranges->at(end_index) - 1;
1998
1999 ASSERT_LT(min_char, first);
2000
2001 // Just need to test if the character is before or on-or-after
2002 // a particular character.
2003 if (start_index == end_index) {
2004 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2005 return;
2006 }
2007
2008 // Another almost trivial case: There is one interval in the middle that is
2009 // different from the end intervals.
2010 if (start_index + 1 == end_index) {
2011 EmitDoubleBoundaryTest(
2012 masm, first, last, fall_through, even_label, odd_label);
2013 return;
2014 }
2015
2016 // It's not worth using table lookup if there are very few intervals in the
2017 // character class.
2018 if (end_index - start_index <= 6) {
2019 // It is faster to test for individual characters, so we look for those
2020 // first, then try arbitrary ranges in the second round.
2021 static int kNoCutIndex = -1;
2022 int cut = kNoCutIndex;
2023 for (int i = start_index; i < end_index; i++) {
2024 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2025 cut = i;
2026 break;
2027 }
2028 }
2029 if (cut == kNoCutIndex) cut = start_index;
2030 CutOutRange(
2031 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2032 ASSERT_GE(end_index - start_index, 2);
2033 GenerateBranches(masm,
2034 ranges,
2035 start_index + 1,
2036 end_index - 1,
2037 min_char,
2038 max_char,
2039 fall_through,
2040 even_label,
2041 odd_label);
2042 return;
2043 }
2044
2045 // If there are a lot of intervals in the regexp, then we will use tables to
2046 // determine whether the character is inside or outside the character class.
2047 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2048
2049 if ((max_char >> kBits) == (min_char >> kBits)) {
2050 EmitUseLookupTable(masm,
2051 ranges,
2052 start_index,
2053 end_index,
2054 min_char,
2055 fall_through,
2056 even_label,
2057 odd_label);
2058 return;
2059 }
2060
2061 if ((min_char >> kBits) != (first >> kBits)) {
2062 masm->CheckCharacterLT(first, odd_label);
2063 GenerateBranches(masm,
2064 ranges,
2065 start_index + 1,
2066 end_index,
2067 first,
2068 max_char,
2069 fall_through,
2070 odd_label,
2071 even_label);
2072 return;
2073 }
2074
2075 int new_start_index = 0;
2076 int new_end_index = 0;
2077 int border = 0;
2078
2079 SplitSearchSpace(ranges,
2080 start_index,
2081 end_index,
2082 &new_start_index,
2083 &new_end_index,
2084 &border);
2085
2086 Label handle_rest;
2087 Label* above = &handle_rest;
2088 if (border == last + 1) {
2089 // We didn't find any section that started after the limit, so everything
2090 // above the border is one of the terminal labels.
2091 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2092 ASSERT(new_end_index == end_index - 1);
2093 }
2094
2095 ASSERT_LE(start_index, new_end_index);
2096 ASSERT_LE(new_start_index, end_index);
2097 ASSERT_LT(start_index, new_start_index);
2098 ASSERT_LT(new_end_index, end_index);
2099 ASSERT(new_end_index + 1 == new_start_index ||
2100 (new_end_index + 2 == new_start_index &&
2101 border == ranges->at(new_end_index + 1)));
2102 ASSERT_LT(min_char, border - 1);
2103 ASSERT_LT(border, max_char);
2104 ASSERT_LT(ranges->at(new_end_index), border);
2105 ASSERT(border < ranges->at(new_start_index) ||
2106 (border == ranges->at(new_start_index) &&
2107 new_start_index == end_index &&
2108 new_end_index == end_index - 1 &&
2109 border == last + 1));
2110 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2111
2112 masm->CheckCharacterGT(border - 1, above);
2113 Label dummy;
2114 GenerateBranches(masm,
2115 ranges,
2116 start_index,
2117 new_end_index,
2118 min_char,
2119 border - 1,
2120 &dummy,
2121 even_label,
2122 odd_label);
2123 if (handle_rest.is_linked()) {
2124 masm->Bind(&handle_rest);
2125 bool flip = (new_start_index & 1) != (start_index & 1);
2126 GenerateBranches(masm,
2127 ranges,
2128 new_start_index,
2129 end_index,
2130 border,
2131 max_char,
2132 &dummy,
2133 flip ? odd_label : even_label,
2134 flip ? even_label : odd_label);
2135 }
2136}
2137
2138
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002139static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2140 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002141 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002142 Label* on_failure,
2143 int cp_offset,
2144 bool check_offset,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002145 bool preloaded,
2146 Zone* zone) {
2147 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002148 if (!CharacterRange::IsCanonical(ranges)) {
2149 CharacterRange::Canonicalize(ranges);
2150 }
2151
ager@chromium.org8bb60582008-12-11 12:02:20 +00002152 int max_char;
2153 if (ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002154 max_char = String::kMaxOneByteCharCode;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002155 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002156 max_char = String::kMaxUtf16CodeUnit;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002157 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002158
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002159 int range_count = ranges->length();
2160
ager@chromium.org8bb60582008-12-11 12:02:20 +00002161 int last_valid_range = range_count - 1;
2162 while (last_valid_range >= 0) {
2163 CharacterRange& range = ranges->at(last_valid_range);
2164 if (range.from() <= max_char) {
2165 break;
2166 }
2167 last_valid_range--;
2168 }
2169
2170 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002171 if (!cc->is_negated()) {
2172 macro_assembler->GoTo(on_failure);
2173 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002174 if (check_offset) {
2175 macro_assembler->CheckPosition(cp_offset, on_failure);
2176 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002177 return;
2178 }
2179
ager@chromium.org8bb60582008-12-11 12:02:20 +00002180 if (last_valid_range == 0 &&
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002181 ranges->at(0).IsEverything(max_char)) {
2182 if (cc->is_negated()) {
2183 macro_assembler->GoTo(on_failure);
2184 } else {
2185 // This is a common case hit by non-anchored expressions.
2186 if (check_offset) {
2187 macro_assembler->CheckPosition(cp_offset, on_failure);
2188 }
2189 }
2190 return;
2191 }
2192 if (last_valid_range == 0 &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002193 !cc->is_negated() &&
2194 ranges->at(0).IsEverything(max_char)) {
2195 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00002196 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002197 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002198 }
2199 return;
2200 }
2201
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002202 if (!preloaded) {
2203 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002204 }
2205
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002206 if (cc->is_standard(zone) &&
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00002207 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2208 on_failure)) {
2209 return;
2210 }
2211
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002212
2213 // A new list with ascending entries. Each entry is a code unit
2214 // where there is a boundary between code units that are part of
2215 // the class and code units that are not. Normally we insert an
2216 // entry at zero which goes to the failure label, but if there
2217 // was already one there we fall through for success on that entry.
2218 // Subsequent entries have alternating meaning (success/failure).
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002219 ZoneList<int>* range_boundaries =
2220 new(zone) ZoneList<int>(last_valid_range, zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002221
2222 bool zeroth_entry_is_failure = !cc->is_negated();
2223
2224 for (int i = 0; i <= last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002225 CharacterRange& range = ranges->at(i);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002226 if (range.from() == 0) {
2227 ASSERT_EQ(i, 0);
2228 zeroth_entry_is_failure = !zeroth_entry_is_failure;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002229 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002230 range_boundaries->Add(range.from(), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002231 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002232 range_boundaries->Add(range.to() + 1, zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002233 }
2234 int end_index = range_boundaries->length() - 1;
2235 if (range_boundaries->at(end_index) > max_char) {
2236 end_index--;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002237 }
2238
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002239 Label fall_through;
2240 GenerateBranches(macro_assembler,
2241 range_boundaries,
2242 0, // start_index.
2243 end_index,
2244 0, // min_char.
2245 max_char,
2246 &fall_through,
2247 zeroth_entry_is_failure ? &fall_through : on_failure,
2248 zeroth_entry_is_failure ? on_failure : &fall_through);
2249 macro_assembler->Bind(&fall_through);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002250}
2251
2252
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002253RegExpNode::~RegExpNode() {
2254}
2255
2256
ager@chromium.org8bb60582008-12-11 12:02:20 +00002257RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002258 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002259 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00002260 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002261 return CONTINUE;
2262 }
2263
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002264 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002265 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002266 if (label_.is_bound()) {
2267 // We are being asked to generate a generic version, but that's already
2268 // been done so just go to it.
2269 macro_assembler->GoTo(&label_);
2270 return DONE;
2271 }
2272 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2273 // To avoid too deep recursion we push the node to the work queue and just
2274 // generate a goto here.
2275 compiler->AddWork(this);
2276 macro_assembler->GoTo(&label_);
2277 return DONE;
2278 }
2279 // Generate generic version of the node and bind the label for later use.
2280 macro_assembler->Bind(&label_);
2281 return CONTINUE;
2282 }
2283
2284 // We are being asked to make a non-generic version. Keep track of how many
2285 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00002286 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002287 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002288 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002289 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2290 return CONTINUE;
2291 }
2292
ager@chromium.org32912102009-01-16 10:38:43 +00002293 // If we get here code has been generated for this node too many times or
2294 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00002295 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002296 trace->Flush(compiler, this);
2297 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002298}
2299
2300
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002301int ActionNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002302 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002303 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002304 if (budget <= 0) return 0;
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002305 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002306 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002307 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002308 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002309}
2310
2311
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002312void ActionNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002313 int budget,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002314 BoyerMooreLookahead* bm,
2315 bool not_at_start) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002316 if (action_type_ == BEGIN_SUBMATCH) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002317 bm->SetRest(offset);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002318 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002320 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002321 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002322}
2323
2324
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002325int AssertionNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002326 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002327 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002328 if (budget <= 0) return 0;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002329 // If we know we are not at the start and we are asked "how many characters
2330 // will you match if you succeed?" then we can answer anything since false
2331 // implies false. So lets just return the max answer (still_to_find) since
2332 // that won't prevent us from preloading a lot of characters for the other
2333 // branches in the node graph.
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002334 if (assertion_type() == AT_START && not_at_start) return still_to_find;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002335 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002336 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002337 not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002338}
2339
2340
verwaest@chromium.org37141392012-05-31 13:27:02 +00002341void AssertionNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002342 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00002343 BoyerMooreLookahead* bm,
2344 bool not_at_start) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002345 // Match the behaviour of EatsAtLeast on this node.
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002346 if (assertion_type() == AT_START && not_at_start) return;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002347 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002348 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002349}
2350
2351
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002352int BackReferenceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002353 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002354 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002355 if (budget <= 0) return 0;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002356 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002357 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002358 not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002359}
2360
2361
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002362int TextNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002363 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002364 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002365 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002366 if (answer >= still_to_find) return answer;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002367 if (budget <= 0) return answer;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002368 // We are not at start after this node so we set the last argument to 'true'.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002369 return answer + on_success()->EatsAtLeast(still_to_find - answer,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002370 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002371 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002372}
2373
2374
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00002375int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002376 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002377 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002378 if (budget <= 0) return 0;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002379 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2380 // afterwards.
2381 RegExpNode* node = alternatives_->at(1).node();
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002382 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002383}
2384
2385
2386void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2387 QuickCheckDetails* details,
2388 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002389 int filled_in,
2390 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002391 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2392 // afterwards.
2393 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002394 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002395}
2396
2397
2398int ChoiceNode::EatsAtLeastHelper(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002399 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002400 RegExpNode* ignore_this_node,
2401 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002402 if (budget <= 0) return 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002403 int min = 100;
2404 int choice_count = alternatives_->length();
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002405 budget = (budget - 1) / choice_count;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002406 for (int i = 0; i < choice_count; i++) {
2407 RegExpNode* node = alternatives_->at(i).node();
2408 if (node == ignore_this_node) continue;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002409 int node_eats_at_least =
2410 node->EatsAtLeast(still_to_find, budget, not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002411 if (node_eats_at_least < min) min = node_eats_at_least;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002412 if (min == 0) return 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002413 }
2414 return min;
2415}
2416
2417
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002418int LoopChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002419 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002420 bool not_at_start) {
2421 return EatsAtLeastHelper(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002422 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002423 loop_node_,
2424 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002425}
2426
2427
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002428int ChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002429 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002430 bool not_at_start) {
2431 return EatsAtLeastHelper(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002432 budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002433 NULL,
2434 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002435}
2436
2437
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
// For example 0x100 becomes 0x1ff; zero stays zero.
static inline uint32_t SmearBitsRight(uint32_t v) {
  // OR the value with itself shifted by 1, 2, 4, 8 and 16 bits in turn.
  for (int shift = 1; shift <= 16; shift *= 2) {
    v |= v >> shift;
  }
  return v;
}
2447
2448
2449bool QuickCheckDetails::Rationalize(bool asc) {
2450 bool found_useful_op = false;
2451 uint32_t char_mask;
2452 if (asc) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002453 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002454 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002455 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002456 }
2457 mask_ = 0;
2458 value_ = 0;
2459 int char_shift = 0;
2460 for (int i = 0; i < characters_; i++) {
2461 Position* pos = &positions_[i];
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002462 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002463 found_useful_op = true;
2464 }
2465 mask_ |= (pos->mask & char_mask) << char_shift;
2466 value_ |= (pos->value & char_mask) << char_shift;
2467 char_shift += asc ? 8 : 16;
2468 }
2469 return found_useful_op;
2470}
2471
2472
2473bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002474 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002475 bool preload_has_checked_bounds,
2476 Label* on_possible_success,
2477 QuickCheckDetails* details,
2478 bool fall_through_on_failure) {
2479 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002480 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
2481 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002482 if (!details->Rationalize(compiler->ascii())) return false;
ager@chromium.org18ad94b2009-09-02 08:22:29 +00002483 ASSERT(details->characters() == 1 ||
2484 compiler->macro_assembler()->CanReadUnaligned());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002485 uint32_t mask = details->mask();
2486 uint32_t value = details->value();
2487
2488 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2489
ager@chromium.org32912102009-01-16 10:38:43 +00002490 if (trace->characters_preloaded() != details->characters()) {
2491 assembler->LoadCurrentCharacter(trace->cp_offset(),
2492 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002493 !preload_has_checked_bounds,
2494 details->characters());
2495 }
2496
2497
2498 bool need_mask = true;
2499
2500 if (details->characters() == 1) {
2501 // If number of characters preloaded is 1 then we used a byte or 16 bit
2502 // load so the value is already masked down.
2503 uint32_t char_mask;
2504 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002505 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002506 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002507 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002508 }
2509 if ((mask & char_mask) == char_mask) need_mask = false;
2510 mask &= char_mask;
2511 } else {
ricow@chromium.org5ad5ace2010-06-23 09:06:43 +00002512 // For 2-character preloads in ASCII mode or 1-character preloads in
2513 // TWO_BYTE mode we also use a 16 bit load with zero extend.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002514 if (details->characters() == 2 && compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002515 if ((mask & 0xffff) == 0xffff) need_mask = false;
ricow@chromium.org5ad5ace2010-06-23 09:06:43 +00002516 } else if (details->characters() == 1 && !compiler->ascii()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002517 if ((mask & 0xffff) == 0xffff) need_mask = false;
2518 } else {
2519 if (mask == 0xffffffff) need_mask = false;
2520 }
2521 }
2522
2523 if (fall_through_on_failure) {
2524 if (need_mask) {
2525 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2526 } else {
2527 assembler->CheckCharacter(value, on_possible_success);
2528 }
2529 } else {
2530 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002531 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002532 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002533 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002534 }
2535 }
2536 return true;
2537}
2538
2539
2540// Here is the meat of GetQuickCheckDetails (see also the comment on the
2541// super-class in the .h file).
2542//
2543// We iterate along the text object, building up for each character a
2544// mask and value that can be used to test for a quick failure to match.
2545// The masks and values for the positions will be combined into a single
2546// machine word for the current character width in order to be used in
2547// generating a quick check.
2548void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2549 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002550 int characters_filled_in,
2551 bool not_at_start) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00002552 Isolate* isolate = Isolate::Current();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002553 ASSERT(characters_filled_in < details->characters());
2554 int characters = details->characters();
2555 int char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002556 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002557 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002558 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002559 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002560 }
2561 for (int k = 0; k < elms_->length(); k++) {
2562 TextElement elm = elms_->at(k);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002563 if (elm.text_type == TextElement::ATOM) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002564 Vector<const uc16> quarks = elm.data.u_atom->data();
2565 for (int i = 0; i < characters && i < quarks.length(); i++) {
2566 QuickCheckDetails::Position* pos =
2567 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00002568 uc16 c = quarks[i];
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002569 if (c > char_mask) {
2570 // If we expect a non-ASCII character from an ASCII string,
2571 // there is no way we can match. Not even case independent
2572 // matching can turn an ASCII character into non-ASCII or
2573 // vice versa.
2574 details->set_cannot_match();
2575 pos->determines_perfectly = false;
2576 return;
2577 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002578 if (compiler->ignore_case()) {
2579 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00002580 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2581 chars);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002582 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2583 if (length == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002584 // This letter has no case equivalents, so it's nice and simple
2585 // and the mask-compare will determine definitely whether we have
2586 // a match at this character position.
2587 pos->mask = char_mask;
2588 pos->value = c;
2589 pos->determines_perfectly = true;
2590 } else {
2591 uint32_t common_bits = char_mask;
2592 uint32_t bits = chars[0];
2593 for (int j = 1; j < length; j++) {
2594 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2595 common_bits ^= differing_bits;
2596 bits &= common_bits;
2597 }
2598 // If length is 2 and common bits has only one zero in it then
2599 // our mask and compare instruction will determine definitely
2600 // whether we have a match at this character position. Otherwise
2601 // it can only be an approximate check.
2602 uint32_t one_zero = (common_bits | ~char_mask);
2603 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2604 pos->determines_perfectly = true;
2605 }
2606 pos->mask = common_bits;
2607 pos->value = bits;
2608 }
2609 } else {
2610 // Don't ignore case. Nice simple case where the mask-compare will
2611 // determine definitely whether we have a match at this character
2612 // position.
2613 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00002614 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002615 pos->determines_perfectly = true;
2616 }
2617 characters_filled_in++;
2618 ASSERT(characters_filled_in <= details->characters());
2619 if (characters_filled_in == details->characters()) {
2620 return;
2621 }
2622 }
2623 } else {
2624 QuickCheckDetails::Position* pos =
2625 details->positions(characters_filled_in);
2626 RegExpCharacterClass* tree = elm.data.u_char_class;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002627 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002628 if (tree->is_negated()) {
2629 // A quick check uses multi-character mask and compare. There is no
2630 // useful way to incorporate a negative char class into this scheme
2631 // so we just conservatively create a mask and value that will always
2632 // succeed.
2633 pos->mask = 0;
2634 pos->value = 0;
2635 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002636 int first_range = 0;
2637 while (ranges->at(first_range).from() > char_mask) {
2638 first_range++;
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002639 if (first_range == ranges->length()) {
2640 details->set_cannot_match();
2641 pos->determines_perfectly = false;
2642 return;
2643 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002644 }
2645 CharacterRange range = ranges->at(first_range);
2646 uc16 from = range.from();
2647 uc16 to = range.to();
2648 if (to > char_mask) {
2649 to = char_mask;
2650 }
2651 uint32_t differing_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002652 // A mask and compare is only perfect if the differing bits form a
2653 // number like 00011111 with one single block of trailing 1s.
ager@chromium.org5aa501c2009-06-23 07:57:28 +00002654 if ((differing_bits & (differing_bits + 1)) == 0 &&
2655 from + differing_bits == to) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002656 pos->determines_perfectly = true;
2657 }
2658 uint32_t common_bits = ~SmearBitsRight(differing_bits);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002659 uint32_t bits = (from & common_bits);
2660 for (int i = first_range + 1; i < ranges->length(); i++) {
2661 CharacterRange range = ranges->at(i);
2662 uc16 from = range.from();
2663 uc16 to = range.to();
2664 if (from > char_mask) continue;
2665 if (to > char_mask) to = char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002666 // Here we are combining more ranges into the mask and compare
2667 // value. With each new range the mask becomes more sparse and
2668 // so the chances of a false positive rise. A character class
2669 // with multiple ranges is assumed never to be equivalent to a
2670 // mask and compare operation.
2671 pos->determines_perfectly = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002672 uint32_t new_common_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002673 new_common_bits = ~SmearBitsRight(new_common_bits);
2674 common_bits &= new_common_bits;
2675 bits &= new_common_bits;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002676 uint32_t differing_bits = (from & common_bits) ^ bits;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002677 common_bits ^= differing_bits;
2678 bits &= common_bits;
2679 }
2680 pos->mask = common_bits;
2681 pos->value = bits;
2682 }
2683 characters_filled_in++;
2684 ASSERT(characters_filled_in <= details->characters());
2685 if (characters_filled_in == details->characters()) {
2686 return;
2687 }
2688 }
2689 }
2690 ASSERT(characters_filled_in != details->characters());
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002691 if (!details->cannot_match()) {
2692 on_success()-> GetQuickCheckDetails(details,
2693 compiler,
2694 characters_filled_in,
2695 true);
2696 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002697}
2698
2699
2700void QuickCheckDetails::Clear() {
2701 for (int i = 0; i < characters_; i++) {
2702 positions_[i].mask = 0;
2703 positions_[i].value = 0;
2704 positions_[i].determines_perfectly = false;
2705 }
2706 characters_ = 0;
2707}
2708
2709
2710void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002711 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002712 if (by >= characters_) {
2713 Clear();
2714 return;
2715 }
2716 for (int i = 0; i < characters_ - by; i++) {
2717 positions_[i] = positions_[by + i];
2718 }
2719 for (int i = characters_ - by; i < characters_; i++) {
2720 positions_[i].mask = 0;
2721 positions_[i].value = 0;
2722 positions_[i].determines_perfectly = false;
2723 }
2724 characters_ -= by;
2725 // We could change mask_ and value_ here but we would never advance unless
2726 // they had already been used in a check and they won't be used again because
2727 // it would gain us nothing. So there's no point.
2728}
2729
2730
2731void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2732 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002733 if (other->cannot_match_) {
2734 return;
2735 }
2736 if (cannot_match_) {
2737 *this = *other;
2738 return;
2739 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002740 for (int i = from_index; i < characters_; i++) {
2741 QuickCheckDetails::Position* pos = positions(i);
2742 QuickCheckDetails::Position* other_pos = other->positions(i);
2743 if (pos->mask != other_pos->mask ||
2744 pos->value != other_pos->value ||
2745 !other_pos->determines_perfectly) {
2746 // Our mask-compare operation will be approximate unless we have the
2747 // exact same operation on both sides of the alternation.
2748 pos->determines_perfectly = false;
2749 }
2750 pos->mask &= other_pos->mask;
2751 pos->value &= pos->mask;
2752 other_pos->value &= pos->mask;
2753 uc16 differing_bits = (pos->value ^ other_pos->value);
2754 pos->mask &= ~differing_bits;
2755 pos->value &= pos->mask;
2756 }
2757}
2758
2759
ager@chromium.org32912102009-01-16 10:38:43 +00002760class VisitMarker {
2761 public:
2762 explicit VisitMarker(NodeInfo* info) : info_(info) {
2763 ASSERT(!info->visited);
2764 info->visited = true;
2765 }
2766 ~VisitMarker() {
2767 info_->visited = false;
2768 }
2769 private:
2770 NodeInfo* info_;
2771};
2772
2773
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002774RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002775 if (info()->replacement_calculated) return replacement();
2776 if (depth < 0) return this;
2777 ASSERT(!info()->visited);
2778 VisitMarker marker(info());
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002779 return FilterSuccessor(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002780}
2781
2782
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002783RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2784 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002785 if (next == NULL) return set_replacement(NULL);
2786 on_success_ = next;
2787 return set_replacement(this);
2788}
2789
2790
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002791// We need to check for the following characters: 0x39c 0x3bc 0x178.
2792static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002793 // TODO(dcarney): this could be a lot more efficient.
2794 return range.Contains(0x39c) ||
2795 range.Contains(0x3bc) || range.Contains(0x178);
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002796}
2797
2798
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002799static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2800 for (int i = 0; i < ranges->length(); i++) {
2801 // TODO(dcarney): this could be a lot more efficient.
2802 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2803 }
2804 return false;
2805}
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002806
2807
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002808RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002809 if (info()->replacement_calculated) return replacement();
2810 if (depth < 0) return this;
2811 ASSERT(!info()->visited);
2812 VisitMarker marker(info());
2813 int element_count = elms_->length();
2814 for (int i = 0; i < element_count; i++) {
2815 TextElement elm = elms_->at(i);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002816 if (elm.text_type == TextElement::ATOM) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002817 Vector<const uc16> quarks = elm.data.u_atom->data();
2818 for (int j = 0; j < quarks.length(); j++) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002819 uint16_t c = quarks[j];
2820 if (c <= String::kMaxOneByteCharCode) continue;
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002821 if (!ignore_case) return set_replacement(NULL);
2822 // Here, we need to check for characters whose upper and lower cases
2823 // are outside the Latin-1 range.
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002824 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2825 // Character is outside Latin-1 completely
2826 if (converted == 0) return set_replacement(NULL);
2827 // Convert quark to Latin-1 in place.
2828 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2829 copy[j] = converted;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002830 }
2831 } else {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002832 ASSERT(elm.text_type == TextElement::CHAR_CLASS);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002833 RegExpCharacterClass* cc = elm.data.u_char_class;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002834 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002835 if (!CharacterRange::IsCanonical(ranges)) {
2836 CharacterRange::Canonicalize(ranges);
2837 }
2838 // Now they are in order so we only need to look at the first.
2839 int range_count = ranges->length();
2840 if (cc->is_negated()) {
2841 if (range_count != 0 &&
2842 ranges->at(0).from() == 0 &&
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002843 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002844 // This will be handled in a later filter.
2845 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002846 return set_replacement(NULL);
2847 }
2848 } else {
2849 if (range_count == 0 ||
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002850 ranges->at(0).from() > String::kMaxOneByteCharCode) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002851 // This will be handled in a later filter.
2852 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002853 return set_replacement(NULL);
2854 }
2855 }
2856 }
2857 }
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002858 return FilterSuccessor(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002859}
2860
2861
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002862RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002863 if (info()->replacement_calculated) return replacement();
2864 if (depth < 0) return this;
2865 if (info()->visited) return this;
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002866 {
2867 VisitMarker marker(info());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002868
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002869 RegExpNode* continue_replacement =
2870 continue_node_->FilterASCII(depth - 1, ignore_case);
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002871 // If we can't continue after the loop then there is no sense in doing the
2872 // loop.
2873 if (continue_replacement == NULL) return set_replacement(NULL);
2874 }
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002875
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002876 return ChoiceNode::FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002877}
2878
2879
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002880RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002881 if (info()->replacement_calculated) return replacement();
2882 if (depth < 0) return this;
2883 if (info()->visited) return this;
2884 VisitMarker marker(info());
2885 int choice_count = alternatives_->length();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002886
2887 for (int i = 0; i < choice_count; i++) {
2888 GuardedAlternative alternative = alternatives_->at(i);
2889 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2890 set_replacement(this);
2891 return this;
2892 }
2893 }
2894
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002895 int surviving = 0;
2896 RegExpNode* survivor = NULL;
2897 for (int i = 0; i < choice_count; i++) {
2898 GuardedAlternative alternative = alternatives_->at(i);
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002899 RegExpNode* replacement =
2900 alternative.node()->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002901 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002902 if (replacement != NULL) {
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002903 alternatives_->at(i).set_node(replacement);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002904 surviving++;
2905 survivor = replacement;
2906 }
2907 }
2908 if (surviving < 2) return set_replacement(survivor);
2909
2910 set_replacement(this);
2911 if (surviving == choice_count) {
2912 return this;
2913 }
2914 // Only some of the nodes survived the filtering. We need to rebuild the
2915 // alternatives list.
2916 ZoneList<GuardedAlternative>* new_alternatives =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002917 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002918 for (int i = 0; i < choice_count; i++) {
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002919 RegExpNode* replacement =
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002920 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002921 if (replacement != NULL) {
2922 alternatives_->at(i).set_node(replacement);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002923 new_alternatives->Add(alternatives_->at(i), zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002924 }
2925 }
2926 alternatives_ = new_alternatives;
2927 return this;
2928}
2929
2930
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002931RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
2932 bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002933 if (info()->replacement_calculated) return replacement();
2934 if (depth < 0) return this;
2935 if (info()->visited) return this;
2936 VisitMarker marker(info());
2937 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2938 // afterwards.
2939 RegExpNode* node = alternatives_->at(1).node();
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002940 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002941 if (replacement == NULL) return set_replacement(NULL);
2942 alternatives_->at(1).set_node(replacement);
2943
2944 RegExpNode* neg_node = alternatives_->at(0).node();
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002945 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002946 // If the negative lookahead is always going to fail then
2947 // we don't need to check it.
2948 if (neg_replacement == NULL) return set_replacement(replacement);
2949 alternatives_->at(0).set_node(neg_replacement);
2950 return set_replacement(this);
2951}
2952
2953
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002954void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2955 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002956 int characters_filled_in,
2957 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002958 if (body_can_be_zero_length_ || info()->visited) return;
2959 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002960 return ChoiceNode::GetQuickCheckDetails(details,
2961 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002962 characters_filled_in,
2963 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002964}
2965
2966
verwaest@chromium.org37141392012-05-31 13:27:02 +00002967void LoopChoiceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002968 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00002969 BoyerMooreLookahead* bm,
2970 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002971 if (body_can_be_zero_length_ || budget <= 0) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002972 bm->SetRest(offset);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002973 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002974 return;
2975 }
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002976 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002977 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002978}
2979
2980
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002981void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2982 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002983 int characters_filled_in,
2984 bool not_at_start) {
2985 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002986 int choice_count = alternatives_->length();
2987 ASSERT(choice_count > 0);
2988 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2989 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002990 characters_filled_in,
2991 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002992 for (int i = 1; i < choice_count; i++) {
2993 QuickCheckDetails new_details(details->characters());
2994 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002995 node->GetQuickCheckDetails(&new_details, compiler,
2996 characters_filled_in,
2997 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002998 // Here we merge the quick match details of the two branches.
2999 details->Merge(&new_details, characters_filled_in);
3000 }
3001}
3002
3003
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003004// Check for [0-9A-Z_a-z].
3005static void EmitWordCheck(RegExpMacroAssembler* assembler,
3006 Label* word,
3007 Label* non_word,
3008 bool fall_through_on_word) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003009 if (assembler->CheckSpecialCharacterClass(
3010 fall_through_on_word ? 'w' : 'W',
3011 fall_through_on_word ? non_word : word)) {
3012 // Optimized implementation available.
3013 return;
3014 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003015 assembler->CheckCharacterGT('z', non_word);
3016 assembler->CheckCharacterLT('0', non_word);
3017 assembler->CheckCharacterGT('a' - 1, word);
3018 assembler->CheckCharacterLT('9' + 1, word);
3019 assembler->CheckCharacterLT('A', non_word);
3020 assembler->CheckCharacterLT('Z' + 1, word);
3021 if (fall_through_on_word) {
3022 assembler->CheckNotCharacter('_', non_word);
3023 } else {
3024 assembler->CheckCharacter('_', word);
3025 }
3026}
3027
3028
3029// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3030// that matches newline or the start of input).
3031static void EmitHat(RegExpCompiler* compiler,
3032 RegExpNode* on_success,
3033 Trace* trace) {
3034 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3035 // We will be loading the previous character into the current character
3036 // register.
3037 Trace new_trace(*trace);
3038 new_trace.InvalidateCurrentCharacter();
3039
3040 Label ok;
3041 if (new_trace.cp_offset() == 0) {
3042 // The start of input counts as a newline in this context, so skip to
3043 // ok if we are at the start.
3044 assembler->CheckAtStart(&ok);
3045 }
3046 // We already checked that we are not at the start of input so it must be
3047 // OK to load the previous character.
3048 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3049 new_trace.backtrack(),
3050 false);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003051 if (!assembler->CheckSpecialCharacterClass('n',
3052 new_trace.backtrack())) {
3053 // Newline means \n, \r, 0x2028 or 0x2029.
3054 if (!compiler->ascii()) {
3055 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3056 }
3057 assembler->CheckCharacter('\n', &ok);
3058 assembler->CheckNotCharacter('\r', new_trace.backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003059 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003060 assembler->Bind(&ok);
3061 on_success->Emit(compiler, &new_trace);
3062}
3063
3064
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003065// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3066void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003067 RegExpMacroAssembler* assembler = compiler->macro_assembler();
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003068 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3069 bool not_at_start = (trace->at_start() == Trace::FALSE);
3070 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3071 if (lookahead == NULL) {
3072 int eats_at_least =
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00003073 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3074 kRecursionBudget,
3075 not_at_start));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003076 if (eats_at_least >= 1) {
3077 BoyerMooreLookahead* bm =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003078 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00003079 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003080 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
3081 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE;
3082 }
3083 } else {
3084 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
3085 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003086 }
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003087 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003088 if (next_is_word_character == Trace::UNKNOWN) {
3089 Label before_non_word;
3090 Label before_word;
3091 if (trace->characters_preloaded() != 1) {
3092 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3093 }
3094 // Fall through on non-word.
3095 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3096 // Next character is not a word character.
3097 assembler->Bind(&before_non_word);
3098 Label ok;
3099 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3100 assembler->GoTo(&ok);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003101
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003102 assembler->Bind(&before_word);
3103 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3104 assembler->Bind(&ok);
3105 } else if (next_is_word_character == Trace::TRUE) {
3106 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3107 } else {
3108 ASSERT(next_is_word_character == Trace::FALSE);
3109 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3110 }
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003111}
3112
3113
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003114void AssertionNode::BacktrackIfPrevious(
3115 RegExpCompiler* compiler,
3116 Trace* trace,
3117 AssertionNode::IfPrevious backtrack_if_previous) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003118 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003119 Trace new_trace(*trace);
3120 new_trace.InvalidateCurrentCharacter();
3121
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003122 Label fall_through, dummy;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003123
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003124 Label* non_word = backtrack_if_previous == kIsNonWord ?
3125 new_trace.backtrack() :
3126 &fall_through;
3127 Label* word = backtrack_if_previous == kIsNonWord ?
3128 &fall_through :
3129 new_trace.backtrack();
3130
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003131 if (new_trace.cp_offset() == 0) {
3132 // The start of input counts as a non-word character, so the question is
3133 // decided if we are at the start.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003134 assembler->CheckAtStart(non_word);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003135 }
3136 // We already checked that we are not at the start of input so it must be
3137 // OK to load the previous character.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003138 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3139 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003140
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003141 assembler->Bind(&fall_through);
3142 on_success()->Emit(compiler, &new_trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003143}
3144
3145
iposva@chromium.org245aa852009-02-10 00:49:54 +00003146void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3147 RegExpCompiler* compiler,
3148 int filled_in,
3149 bool not_at_start) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003150 if (assertion_type_ == AT_START && not_at_start) {
iposva@chromium.org245aa852009-02-10 00:49:54 +00003151 details->set_cannot_match();
3152 return;
3153 }
3154 return on_success()->GetQuickCheckDetails(details,
3155 compiler,
3156 filled_in,
3157 not_at_start);
3158}
3159
3160
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003161void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3162 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003163 switch (assertion_type_) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003164 case AT_END: {
3165 Label ok;
3166 assembler->CheckPosition(trace->cp_offset(), &ok);
3167 assembler->GoTo(trace->backtrack());
3168 assembler->Bind(&ok);
3169 break;
3170 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00003171 case AT_START: {
3172 if (trace->at_start() == Trace::FALSE) {
3173 assembler->GoTo(trace->backtrack());
3174 return;
3175 }
3176 if (trace->at_start() == Trace::UNKNOWN) {
3177 assembler->CheckNotAtStart(trace->backtrack());
3178 Trace at_start_trace = *trace;
3179 at_start_trace.set_at_start(true);
3180 on_success()->Emit(compiler, &at_start_trace);
3181 return;
3182 }
3183 }
3184 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003185 case AFTER_NEWLINE:
3186 EmitHat(compiler, on_success(), trace);
3187 return;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003188 case AT_BOUNDARY:
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003189 case AT_NON_BOUNDARY: {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003190 EmitBoundaryCheck(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003191 return;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003192 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003193 }
3194 on_success()->Emit(compiler, trace);
3195}
3196
3197
ager@chromium.org381abbb2009-02-25 13:23:22 +00003198static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3199 if (quick_check == NULL) return false;
3200 if (offset >= quick_check->characters()) return false;
3201 return quick_check->positions(offset)->determines_perfectly;
3202}
3203
3204
// Raises *checked_up_to to |index| if |index| is larger; never lowers it.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
  if (*checked_up_to < index) *checked_up_to = index;
}
3210
3211
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003212// We call this repeatedly to generate code for each pass over the text node.
3213// The passes are in increasing order of difficulty because we hope one
3214// of the first passes will fail in which case we are saved the work of the
// later passes.  For example, for the case-independent regexp /%[asdfghjkl]a/
3216// we will check the '%' in the first pass, the case independent 'a' in the
3217// second pass and the character class in the last pass.
3218//
3219// The passes are done from right to left, so for example to test for /bar/
3220// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3221// and then a 'b' with offset 0. This means we can avoid the end-of-input
3222// bounds check most of the time. In the example we only need to check for
3223// end-of-input when loading the putative 'r'.
3224//
3225// A slight complication involves the fact that the first character may already
3226// be fetched into a register by the previous node. In this case we want to
3227// do the test for that character first. We do this in separate passes. The
3228// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3229// pass has been performed then subsequent passes will have true in
3230// first_element_checked to indicate that that character does not need to be
3231// checked again.
3232//
ager@chromium.org32912102009-01-16 10:38:43 +00003233// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003234// contain an AlternativeGeneration object. In this AlternativeGeneration
3235// object we can see details of any quick check that was already passed in
3236// order to get to the code we are now generating. The quick check can involve
3237// loading characters, which means we do not need to recheck the bounds
3238// up to the limit the quick check already checked. In addition the quick
3239// check can have involved a mask and compare operation which may simplify
3240// or obviate the need for further checks at some character positions.
3241void TextNode::TextEmitPass(RegExpCompiler* compiler,
3242 TextEmitPassType pass,
3243 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00003244 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003245 bool first_element_checked,
3246 int* checked_up_to) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00003247 Isolate* isolate = Isolate::Current();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003248 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3249 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00003250 Label* backtrack = trace->backtrack();
3251 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003252 int element_count = elms_->length();
3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3254 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00003255 int cp_offset = trace->cp_offset() + elm.cp_offset;
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003256 if (elm.text_type == TextElement::ATOM) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00003257 Vector<const uc16> quarks = elm.data.u_atom->data();
3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3259 if (first_element_checked && i == 0 && j == 0) continue;
3260 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
3261 EmitCharacterFunction* emit_function = NULL;
3262 switch (pass) {
3263 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003264 ASSERT(ascii);
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003265 if (quarks[j] > String::kMaxOneByteCharCode) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003266 assembler->GoTo(backtrack);
3267 return;
3268 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00003269 break;
3270 case NON_LETTER_CHARACTER_MATCH:
3271 emit_function = &EmitAtomNonLetter;
3272 break;
3273 case SIMPLE_CHARACTER_MATCH:
3274 emit_function = &EmitSimpleCharacter;
3275 break;
3276 case CASE_CHARACTER_MATCH:
3277 emit_function = &EmitAtomLetter;
3278 break;
3279 default:
3280 break;
3281 }
3282 if (emit_function != NULL) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00003283 bool bound_checked = emit_function(isolate,
3284 compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00003285 quarks[j],
3286 backtrack,
3287 cp_offset + j,
3288 *checked_up_to < cp_offset + j,
3289 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00003290 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003291 }
3292 }
3293 } else {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003294 ASSERT_EQ(elm.text_type, TextElement::CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003295 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00003296 if (first_element_checked && i == 0) continue;
3297 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003298 RegExpCharacterClass* cc = elm.data.u_char_class;
3299 EmitCharClass(assembler,
3300 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003301 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00003302 backtrack,
3303 cp_offset,
3304 *checked_up_to < cp_offset,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003305 preloaded,
3306 zone());
ager@chromium.org381abbb2009-02-25 13:23:22 +00003307 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003308 }
3309 }
3310 }
3311}
3312
3313
3314int TextNode::Length() {
3315 TextElement elm = elms_->last();
3316 ASSERT(elm.cp_offset >= 0);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003317 if (elm.text_type == TextElement::ATOM) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003318 return elm.cp_offset + elm.data.u_atom->data().length();
3319 } else {
3320 return elm.cp_offset + 1;
3321 }
3322}
3323
3324
ager@chromium.org381abbb2009-02-25 13:23:22 +00003325bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3326 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3327 if (ignore_case) {
3328 return pass == SIMPLE_CHARACTER_MATCH;
3329 } else {
3330 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3331 }
3332}
3333
3334
ager@chromium.org8bb60582008-12-11 12:02:20 +00003335// This generates the code to match a text node. A text node can contain
3336// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003337// way) and character classes. For efficiency we do not do this in a single
3338// pass from left to right. Instead we pass over the text node several times,
3339// emitting code for some character positions every time. See the comment on
3340// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003341void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00003342 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003343 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003344 ASSERT(limit_result == CONTINUE);
3345
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003346 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3347 compiler->SetRegExpTooBig();
3348 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003349 }
3350
3351 if (compiler->ascii()) {
3352 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00003353 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003354 }
3355
3356 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00003357 int bound_checked_to = trace->cp_offset() - 1;
3358 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003359
3360 // If a character is preloaded into the current character register then
3361 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00003362 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00003363 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3364 if (!SkipPass(pass, compiler->ignore_case())) {
3365 TextEmitPass(compiler,
3366 static_cast<TextEmitPassType>(pass),
3367 true,
3368 trace,
3369 false,
3370 &bound_checked_to);
3371 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003372 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003373 first_elt_done = true;
3374 }
3375
ager@chromium.org381abbb2009-02-25 13:23:22 +00003376 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3377 if (!SkipPass(pass, compiler->ignore_case())) {
3378 TextEmitPass(compiler,
3379 static_cast<TextEmitPassType>(pass),
3380 false,
3381 trace,
3382 first_elt_done,
3383 &bound_checked_to);
3384 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003385 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003386
ager@chromium.org32912102009-01-16 10:38:43 +00003387 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003388 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003389 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003390 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003391 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003392}
3393
3394
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003395void Trace::InvalidateCurrentCharacter() {
3396 characters_preloaded_ = 0;
3397}
3398
3399
3400void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003401 ASSERT(by > 0);
3402 // We don't have an instruction for shifting the current character register
3403 // down or for using a shifted value for anything so lets just forget that
3404 // we preloaded any characters into it.
3405 characters_preloaded_ = 0;
3406 // Adjust the offsets of the quick check performed information. This
3407 // information is used to find out what we already determined about the
3408 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003409 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003410 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003411 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3412 compiler->SetRegExpTooBig();
3413 cp_offset_ = 0;
3414 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003415 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003416}
3417
3418
ager@chromium.org38e4c712009-11-11 09:11:58 +00003419void TextNode::MakeCaseIndependent(bool is_ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003420 int element_count = elms_->length();
3421 for (int i = 0; i < element_count; i++) {
3422 TextElement elm = elms_->at(i);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003423 if (elm.text_type == TextElement::CHAR_CLASS) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003424 RegExpCharacterClass* cc = elm.data.u_char_class;
erik.corry@gmail.comf2038fb2012-01-16 11:42:08 +00003425 // None of the standard character classes is different in the case
ager@chromium.org38e4c712009-11-11 09:11:58 +00003426 // independent case and it slows us down if we don't know that.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003427 if (cc->is_standard(zone())) continue;
3428 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003429 int range_count = ranges->length();
ager@chromium.org38e4c712009-11-11 09:11:58 +00003430 for (int j = 0; j < range_count; j++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003431 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003432 }
3433 }
3434 }
3435}
3436
3437
ager@chromium.org8bb60582008-12-11 12:02:20 +00003438int TextNode::GreedyLoopTextLength() {
3439 TextElement elm = elms_->at(elms_->length() - 1);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003440 if (elm.text_type == TextElement::CHAR_CLASS) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003441 return elm.cp_offset + 1;
3442 } else {
3443 return elm.cp_offset + elm.data.u_atom->data().length();
3444 }
3445}
3446
3447
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003448RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3449 RegExpCompiler* compiler) {
3450 if (elms_->length() != 1) return NULL;
3451 TextElement elm = elms_->at(0);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003452 if (elm.text_type != TextElement::CHAR_CLASS) return NULL;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003453 RegExpCharacterClass* node = elm.data.u_char_class;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003454 ZoneList<CharacterRange>* ranges = node->ranges(zone());
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003455 if (!CharacterRange::IsCanonical(ranges)) {
3456 CharacterRange::Canonicalize(ranges);
3457 }
3458 if (node->is_negated()) {
3459 return ranges->length() == 0 ? on_success() : NULL;
3460 }
3461 if (ranges->length() != 1) return NULL;
3462 uint32_t max_char;
3463 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003464 max_char = String::kMaxOneByteCharCode;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003465 } else {
3466 max_char = String::kMaxUtf16CodeUnit;
3467 }
3468 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3469}
3470
3471
ager@chromium.org8bb60582008-12-11 12:02:20 +00003472// Finds the fixed match length of a sequence of nodes that goes from
3473// this alternative and back to this choice node. If there are variable
3474// length nodes or other complications in the way then return a sentinel
3475// value indicating that a greedy loop cannot be constructed.
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003476int ChoiceNode::GreedyLoopTextLengthForAlternative(
3477 GuardedAlternative* alternative) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003478 int length = 0;
3479 RegExpNode* node = alternative->node();
3480 // Later we will generate code for all these text nodes using recursion
3481 // so we have to limit the max number.
3482 int recursion_depth = 0;
3483 while (node != this) {
3484 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3485 return kNodeIsTooComplexForGreedyLoops;
3486 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003487 int node_length = node->GreedyLoopTextLength();
3488 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3489 return kNodeIsTooComplexForGreedyLoops;
3490 }
3491 length += node_length;
3492 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3493 node = seq_node->on_success();
3494 }
3495 return length;
3496}
3497
3498
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003499void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3500 ASSERT_EQ(loop_node_, NULL);
3501 AddAlternative(alt);
3502 loop_node_ = alt.node();
3503}
3504
3505
3506void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3507 ASSERT_EQ(continue_node_, NULL);
3508 AddAlternative(alt);
3509 continue_node_ = alt.node();
3510}
3511
3512
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003513void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003514 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003515 if (trace->stop_node() == this) {
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003516 int text_length =
3517 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003518 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
3519 // Update the counter-based backtracking info on the stack. This is an
3520 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00003521 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003522 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00003523 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003524 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003525 }
ager@chromium.org32912102009-01-16 10:38:43 +00003526 ASSERT(trace->stop_node() == NULL);
3527 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003528 trace->Flush(compiler, this);
3529 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003530 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003531 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003532}
3533
3534
kasperl@chromium.orga5551262010-12-07 12:49:48 +00003535int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003536 int eats_at_least) {
3537 int preload_characters = Min(4, eats_at_least);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00003538 if (compiler->macro_assembler()->CanReadUnaligned()) {
3539 bool ascii = compiler->ascii();
3540 if (ascii) {
3541 if (preload_characters > 4) preload_characters = 4;
3542 // We can't preload 3 characters because there is no machine instruction
3543 // to do that. We can't just load 4 because we could be reading
3544 // beyond the end of the string, which could cause a memory fault.
3545 if (preload_characters == 3) preload_characters = 2;
3546 } else {
3547 if (preload_characters > 2) preload_characters = 2;
3548 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003549 } else {
ager@chromium.org18ad94b2009-09-02 08:22:29 +00003550 if (preload_characters > 1) preload_characters = 1;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003551 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003552 return preload_characters;
3553}
3554
3555
3556// This class is used when generating the alternatives in a choice node. It
3557// records the way the alternative is being code generated.
3558class AlternativeGeneration: public Malloced {
3559 public:
3560 AlternativeGeneration()
3561 : possible_success(),
3562 expects_preload(false),
3563 after(),
3564 quick_check_details() { }
3565 Label possible_success;
3566 bool expects_preload;
3567 Label after;
3568 QuickCheckDetails quick_check_details;
3569};
3570
3571
3572// Creates a list of AlternativeGenerations. If the list has a reasonable
3573// size then it is on the stack, otherwise the excess is on the heap.
3574class AlternativeGenerationList {
3575 public:
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003576 AlternativeGenerationList(int count, Zone* zone)
3577 : alt_gens_(count, zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003578 for (int i = 0; i < count && i < kAFew; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003579 alt_gens_.Add(a_few_alt_gens_ + i, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003580 }
3581 for (int i = kAFew; i < count; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003582 alt_gens_.Add(new AlternativeGeneration(), zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003583 }
3584 }
3585 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003586 for (int i = kAFew; i < alt_gens_.length(); i++) {
3587 delete alt_gens_[i];
3588 alt_gens_[i] = NULL;
3589 }
3590 }
3591
3592 AlternativeGeneration* at(int i) {
3593 return alt_gens_[i];
3594 }
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00003595
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003596 private:
3597 static const int kAFew = 10;
3598 ZoneList<AlternativeGeneration*> alt_gens_;
3599 AlternativeGeneration a_few_alt_gens_[kAFew];
3600};
3601
3602
// The '2' variant has inclusive from and exclusive to.
3604static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
3605 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3606 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3607static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3608
3609static const int kWordRanges[] = {
3610 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3611static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3612static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3613static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3614static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3615static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3616static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3617 0x2028, 0x202A, 0x10000 };
3618static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3619
3620
3621void BoyerMoorePositionInfo::Set(int character) {
3622 SetInterval(Interval(character, character));
3623}
3624
3625
3626void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3627 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3628 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3629 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3630 surrogate_ =
3631 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3632 if (interval.to() - interval.from() >= kMapSize - 1) {
3633 if (map_count_ != kMapSize) {
3634 map_count_ = kMapSize;
3635 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3636 }
3637 return;
3638 }
3639 for (int i = interval.from(); i <= interval.to(); i++) {
3640 int mod_character = (i & kMask);
3641 if (!map_->at(mod_character)) {
3642 map_count_++;
3643 map_->at(mod_character) = true;
3644 }
3645 if (map_count_ == kMapSize) return;
3646 }
3647}
3648
3649
3650void BoyerMoorePositionInfo::SetAll() {
3651 s_ = w_ = d_ = kLatticeUnknown;
3652 if (map_count_ != kMapSize) {
3653 map_count_ = kMapSize;
3654 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3655 }
3656}
3657
3658
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003659BoyerMooreLookahead::BoyerMooreLookahead(
rossberg@chromium.org400388e2012-06-06 09:29:22 +00003660 int length, RegExpCompiler* compiler, Zone* zone)
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003661 : length_(length),
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003662 compiler_(compiler) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003663 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003664 max_char_ = String::kMaxOneByteCharCode;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003665 } else {
3666 max_char_ = String::kMaxUtf16CodeUnit;
3667 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003668 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003669 for (int i = 0; i < length; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003670 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003671 }
3672}
3673
3674
3675// Find the longest range of lookahead that has the fewest number of different
3676// characters that can occur at a given position. Since we are optimizing two
3677// different parameters at once this is a tradeoff.
3678bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3679 int biggest_points = 0;
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003680 // If more than 32 characters out of 128 can occur it is unlikely that we can
3681 // be lucky enough to step forwards much of the time.
3682 const int kMaxMax = 32;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003683 for (int max_number_of_chars = 4;
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003684 max_number_of_chars < kMaxMax;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003685 max_number_of_chars *= 2) {
3686 biggest_points =
3687 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3688 }
3689 if (biggest_points == 0) return false;
3690 return true;
3691}
3692
3693
3694// Find the highest-points range between 0 and length_ where the character
3695// information is not too vague. 'Too vague' means that there are more than
3696// max_number_of_chars that can occur at this position. Calculates the number
3697// of points as the product of width-of-the-range and
3698// probability-of-finding-one-of-the-characters, where the probability is
3699// calculated using the frequency distribution of the sample subject string.
3700int BoyerMooreLookahead::FindBestInterval(
3701 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3702 int biggest_points = old_biggest_points;
3703 static const int kSize = RegExpMacroAssembler::kTableSize;
3704 for (int i = 0; i < length_; ) {
3705 while (i < length_ && Count(i) > max_number_of_chars) i++;
3706 if (i == length_) break;
3707 int remembered_from = i;
3708 bool union_map[kSize];
3709 for (int j = 0; j < kSize; j++) union_map[j] = false;
3710 while (i < length_ && Count(i) <= max_number_of_chars) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003711 BoyerMoorePositionInfo* map = bitmaps_->at(i);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003712 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3713 i++;
3714 }
3715 int frequency = 0;
3716 for (int j = 0; j < kSize; j++) {
3717 if (union_map[j]) {
3718 // Add 1 to the frequency to give a small per-character boost for
3719 // the cases where our sampling is not good enough and many
3720 // characters have a frequency of zero. This means the frequency
3721 // can theoretically be up to 2*kSize though we treat it mostly as
3722 // a fraction of kSize.
3723 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3724 }
3725 }
3726 // We use the probability of skipping times the distance we are skipping to
3727 // judge the effectiveness of this. Actually we have a cut-off: By
3728 // dividing by 2 we switch off the skipping if the probability of skipping
3729 // is less than 50%. This is because the multibyte mask-and-compare
3730 // skipping in quickcheck is more likely to do well on this case.
3731 bool in_quickcheck_range = ((i - remembered_from < 4) ||
3732 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3733 // Called 'probability' but it is only a rough estimate and can actually
3734 // be outside the 0-kSize range.
3735 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3736 int points = (i - remembered_from) * probability;
3737 if (points > biggest_points) {
3738 *from = remembered_from;
3739 *to = i - 1;
3740 biggest_points = points;
3741 }
3742 }
3743 return biggest_points;
3744}
3745
3746
3747// Take all the characters that will not prevent a successful match if they
3748// occur in the subject string in the range between min_lookahead and
3749// max_lookahead (inclusive) measured from the current position. If the
3750// character at max_lookahead offset is not one of these characters, then we
3751// can safely skip forwards by the number of characters in the range.
3752int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3753 int max_lookahead,
3754 Handle<ByteArray> boolean_skip_table) {
3755 const int kSize = RegExpMacroAssembler::kTableSize;
3756
3757 const int kSkipArrayEntry = 0;
3758 const int kDontSkipArrayEntry = 1;
3759
3760 for (int i = 0; i < kSize; i++) {
3761 boolean_skip_table->set(i, kSkipArrayEntry);
3762 }
3763 int skip = max_lookahead + 1 - min_lookahead;
3764
3765 for (int i = max_lookahead; i >= min_lookahead; i--) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003766 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3767 for (int j = 0; j < kSize; j++) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003768 if (map->at(j)) {
3769 boolean_skip_table->set(j, kDontSkipArrayEntry);
3770 }
3771 }
3772 }
3773
3774 return skip;
3775}
3776
3777
3778// See comment above on the implementation of GetSkipTable.
3779bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003780 const int kSize = RegExpMacroAssembler::kTableSize;
3781
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003782 int min_lookahead = 0;
3783 int max_lookahead = 0;
3784
3785 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3786
3787 bool found_single_character = false;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003788 int single_character = 0;
3789 for (int i = max_lookahead; i >= min_lookahead; i--) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003790 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3791 if (map->map_count() > 1 ||
3792 (found_single_character && map->map_count() != 0)) {
3793 found_single_character = false;
3794 break;
3795 }
3796 for (int j = 0; j < kSize; j++) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003797 if (map->at(j)) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003798 found_single_character = true;
3799 single_character = j;
3800 break;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003801 }
3802 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003803 }
3804
3805 int lookahead_width = max_lookahead + 1 - min_lookahead;
3806
3807 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3808 // The mask-compare can probably handle this better.
3809 return false;
3810 }
3811
3812 if (found_single_character) {
3813 Label cont, again;
3814 masm->Bind(&again);
3815 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003816 if (max_char_ > kSize) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003817 masm->CheckCharacterAfterAnd(single_character,
3818 RegExpMacroAssembler::kTableMask,
3819 &cont);
3820 } else {
3821 masm->CheckCharacter(single_character, &cont);
3822 }
3823 masm->AdvanceCurrentPosition(lookahead_width);
3824 masm->GoTo(&again);
3825 masm->Bind(&cont);
3826 return true;
3827 }
3828
verwaest@chromium.orgd4be0f02013-06-05 13:39:03 +00003829 Factory* factory = Isolate::Current()->factory();
3830 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003831 int skip_distance = GetSkipTable(
3832 min_lookahead, max_lookahead, boolean_skip_table);
3833 ASSERT(skip_distance != 0);
3834
3835 Label cont, again;
3836 masm->Bind(&again);
3837 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3838 masm->CheckBitInTable(boolean_skip_table, &cont);
3839 masm->AdvanceCurrentPosition(skip_distance);
3840 masm->GoTo(&again);
3841 masm->Bind(&cont);
3842
3843 return true;
3844}
3845
3846
/* Code generation for choice nodes.
 *
 * We generate quick checks that do a mask and compare to eliminate a
 * choice.  If the quick check succeeds then it jumps to the continuation to
 * do slow checks and check subsequent nodes.  If it fails (the common case)
 * it falls through to the next choice.
 *
 * Here is the desired flow graph.  Nodes directly below each other imply
 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
 * 3 doesn't have a quick check so we have to call the slow check.
 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
 * regexp continuation is generated directly after the Sn node, up to the
 * next GoTo if we decide to reuse some already generated code.  Some
 * nodes expect preload_characters to be preloaded into the current
 * character register.  R nodes do this preloading.  Vertices are marked
 * F for failures and S for success (possible success in the case of quick
 * nodes).  L, V, < and > are used as arrow heads.
 *
 * ----------> R
 *             |
 *             V
 *            Q1 -----> S1
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            Q2 -----> S2
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            S3
 *             |
 *            F|
 *             |
 *             R
 *             |
 * backtrack   V
 * <----------Q4
 *   \    F    |
 *    \        |S
 *     \   F   V
 *      \-----S4
 *
 * For greedy loops we reverse our expectation and expect to match rather
 * than fail.  Therefore we want the loop code to look like this (U is the
 * unwind code that steps back in the greedy loop).  The following alternatives
 * look the same as above.
 *              _____
 *             /     \
 *             V     |
 * ----------> S1    |
 *            /|     |
 *           / |S    |
 *         F/  \_____/
 *         /
 *        |<-----------
 *        |            \
 *        V             \
 *        Q2 ---> S2     \
 *        |  S   /        |
 *       F|     /         |
 *        |   F/          |
 *        |   /           |
 *        |  R            |
 *        | /             |
 *   F    VL              |
 * <------U               |
 * back   |S              |
 *        \______________/
 */
3924
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003925void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003926 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3927 int choice_count = alternatives_->length();
3928#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003929 for (int i = 0; i < choice_count - 1; i++) {
3930 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003931 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003932 int guard_count = (guards == NULL) ? 0 : guards->length();
3933 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003934 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003935 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003936 }
3937#endif
3938
ager@chromium.org32912102009-01-16 10:38:43 +00003939 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003940 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003941 ASSERT(limit_result == CONTINUE);
3942
ager@chromium.org381abbb2009-02-25 13:23:22 +00003943 int new_flush_budget = trace->flush_budget() / choice_count;
3944 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3945 trace->Flush(compiler, this);
3946 return;
3947 }
3948
ager@chromium.org8bb60582008-12-11 12:02:20 +00003949 RecursionCheck rc(compiler);
3950
ager@chromium.org32912102009-01-16 10:38:43 +00003951 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003952
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003953 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003954 bool greedy_loop = false;
3955 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00003956 Trace counter_backtrack_trace;
3957 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003958 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3959
ager@chromium.org8bb60582008-12-11 12:02:20 +00003960 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3961 // Here we have special handling for greedy loops containing only text nodes
3962 // and other simple nodes. These are handled by pushing the current
3963 // position on the stack and then incrementing the current position each
3964 // time around the switch. On backtrack we decrement the current position
3965 // and check it against the pushed value. This avoids pushing backtrack
3966 // information for each iteration of the loop, which could take up a lot of
3967 // space.
3968 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00003969 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003970 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00003971 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003972 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00003973 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003974 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00003975 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003976 Label loop_label;
3977 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00003978 greedy_match_trace.set_stop_node(this);
3979 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003980 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003981 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003982 }
3983
3984 Label second_choice; // For use in greedy matches.
3985 macro_assembler->Bind(&second_choice);
3986
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003987 int first_normal_choice = greedy_loop ? 1 : 0;
3988
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003989 bool not_at_start = current_trace->at_start() == Trace::FALSE;
3990 const int kEatsAtLeastNotYetInitialized = -1;
3991 int eats_at_least = kEatsAtLeastNotYetInitialized;
3992
3993 bool skip_was_emitted = false;
3994
3995 if (!greedy_loop && choice_count == 2) {
3996 GuardedAlternative alt1 = alternatives_->at(1);
3997 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
3998 RegExpNode* eats_anything_node = alt1.node();
3999 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
4000 this) {
4001 // At this point we know that we are at a non-greedy loop that will eat
4002 // any character one at a time. Any non-anchored regexp has such a
4003 // loop prepended to it in order to find where it starts. We look for
4004 // a pattern of the form ...abc... where we can look 6 characters ahead
4005 // and step forwards 3 if the character is not one of abc. Abc need
4006 // not be atoms, they can be any reasonably limited character class or
4007 // small alternation.
4008 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004009 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4010 if (lookahead == NULL) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004011 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4012 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4013 kRecursionBudget,
4014 not_at_start));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004015 if (eats_at_least >= 1) {
4016 BoyerMooreLookahead* bm =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004017 new(zone()) BoyerMooreLookahead(eats_at_least,
4018 compiler,
4019 zone());
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004020 GuardedAlternative alt0 = alternatives_->at(0);
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004021 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004022 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4023 }
4024 } else {
4025 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00004026 }
4027 }
4028 }
4029 }
4030
4031 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4032 // Save some time by looking at most one machine word ahead.
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004033 eats_at_least =
4034 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00004035 }
4036 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4037
4038 bool preload_is_current = !skip_was_emitted &&
ager@chromium.org32912102009-01-16 10:38:43 +00004039 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004040 bool preload_has_checked_bounds = preload_is_current;
4041
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004042 AlternativeGenerationList alt_gens(choice_count, zone());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004043
ager@chromium.org8bb60582008-12-11 12:02:20 +00004044 // For now we just call all choices one after the other. The idea ultimately
4045 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004046 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004047 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00004048 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004049 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004050 ZoneList<Guard*>* guards = alternative.guards();
4051 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00004052 Trace new_trace(*current_trace);
4053 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004054 preload_characters :
4055 0);
4056 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00004057 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004058 }
ager@chromium.org32912102009-01-16 10:38:43 +00004059 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00004060 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004061 alt_gen->expects_preload = preload_is_current;
4062 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00004063 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00004064 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004065 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00004066 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004067 preload_has_checked_bounds,
4068 &alt_gen->possible_success,
4069 &alt_gen->quick_check_details,
4070 i < choice_count - 1)) {
4071 // Quick check was generated for this choice.
4072 preload_is_current = true;
4073 preload_has_checked_bounds = true;
4074 // On the last choice in the ChoiceNode we generated the quick
4075 // check to fall through on possible success. So now we need to
4076 // generate the full check inline.
4077 if (i == choice_count - 1) {
4078 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00004079 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4080 new_trace.set_characters_preloaded(preload_characters);
4081 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004082 generate_full_check_inline = true;
4083 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00004084 } else if (alt_gen->quick_check_details.cannot_match()) {
4085 if (i == choice_count - 1 && !greedy_loop) {
4086 macro_assembler->GoTo(trace->backtrack());
4087 }
4088 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004089 } else {
4090 // No quick check was generated. Put the full code here.
4091 // If this is not the first choice then there could be slow checks from
4092 // previous cases that go here when they fail. There's no reason to
4093 // insist that they preload characters since the slow check we are about
4094 // to generate probably can't use it.
4095 if (i != first_normal_choice) {
4096 alt_gen->expects_preload = false;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004097 new_trace.InvalidateCurrentCharacter();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004098 }
4099 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00004100 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004101 }
4102 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004103 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004104 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00004105 if (new_trace.actions() != NULL) {
4106 new_trace.set_flush_budget(new_flush_budget);
4107 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004108 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004109 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004110 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004111 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004112 preload_is_current = false;
4113 }
4114 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004115 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004116 if (greedy_loop) {
4117 macro_assembler->Bind(&greedy_loop_label);
4118 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00004119 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004120 // Otherwise try the second priority at an earlier position.
4121 macro_assembler->AdvanceCurrentPosition(-text_length);
4122 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004123 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00004124
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004125 // At this point we need to generate slow checks for the alternatives where
4126 // the quick check was inlined. We can recognize these because the associated
4127 // label was bound.
4128 for (int i = first_normal_choice; i < choice_count - 1; i++) {
4129 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00004130 Trace new_trace(*current_trace);
4131 // If there are actions to be flushed we have to limit how many times
4132 // they are flushed. Take the budget of the parent trace and distribute
4133 // it fairly amongst the children.
4134 if (new_trace.actions() != NULL) {
4135 new_trace.set_flush_budget(new_flush_budget);
4136 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004137 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00004138 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004139 alternatives_->at(i),
4140 alt_gen,
4141 preload_characters,
4142 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004143 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004144}
4145
4146
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004147void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00004148 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004149 GuardedAlternative alternative,
4150 AlternativeGeneration* alt_gen,
4151 int preload_characters,
4152 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004153 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004154
4155 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4156 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00004157 Trace out_of_line_trace(*trace);
4158 out_of_line_trace.set_characters_preloaded(preload_characters);
4159 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00004160 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004161 ZoneList<Guard*>* guards = alternative.guards();
4162 int guard_count = (guards == NULL) ? 0 : guards->length();
4163 if (next_expects_preload) {
4164 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00004165 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004166 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004167 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004168 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004169 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004170 macro_assembler->Bind(&reload_current_char);
4171 // Reload the current character, since the next quick check expects that.
4172 // We don't need to check bounds here because we only get into this
4173 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00004174 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004175 NULL,
4176 false,
4177 preload_characters);
4178 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004179 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00004180 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004181 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004182 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004183 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004184 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004185 }
4186}
4187
4188
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004189void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004190 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00004191 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004192 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004193 ASSERT(limit_result == CONTINUE);
4194
4195 RecursionCheck rc(compiler);
4196
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004197 switch (action_type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004198 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00004199 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004200 new_capture(data_.u_position_register.reg,
4201 data_.u_position_register.is_capture,
4202 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00004203 Trace new_trace = *trace;
4204 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004205 on_success()->Emit(compiler, &new_trace);
4206 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004207 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004208 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00004209 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00004210 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00004211 Trace new_trace = *trace;
4212 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004213 on_success()->Emit(compiler, &new_trace);
4214 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004215 }
4216 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00004217 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00004218 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00004219 Trace new_trace = *trace;
4220 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004221 on_success()->Emit(compiler, &new_trace);
4222 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004223 }
4224 case CLEAR_CAPTURES: {
4225 Trace::DeferredClearCaptures
4226 new_capture(Interval(data_.u_clear_captures.range_from,
4227 data_.u_clear_captures.range_to));
4228 Trace new_trace = *trace;
4229 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004230 on_success()->Emit(compiler, &new_trace);
4231 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004232 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004233 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004234 if (!trace->is_trivial()) {
4235 trace->Flush(compiler, this);
4236 } else {
4237 assembler->WriteCurrentPositionToRegister(
4238 data_.u_submatch.current_position_register, 0);
4239 assembler->WriteStackPointerToRegister(
4240 data_.u_submatch.stack_pointer_register);
4241 on_success()->Emit(compiler, trace);
4242 }
4243 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004244 case EMPTY_MATCH_CHECK: {
4245 int start_pos_reg = data_.u_empty_match_check.start_register;
4246 int stored_pos = 0;
4247 int rep_reg = data_.u_empty_match_check.repetition_register;
4248 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4249 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4250 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4251 // If we know we haven't advanced and there is no minimum we
4252 // can just backtrack immediately.
4253 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00004254 } else if (know_dist && stored_pos < trace->cp_offset()) {
4255 // If we know we've advanced we can generate the continuation
4256 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004257 on_success()->Emit(compiler, trace);
4258 } else if (!trace->is_trivial()) {
4259 trace->Flush(compiler, this);
4260 } else {
4261 Label skip_empty_check;
4262 // If we have a minimum number of repetitions we check the current
4263 // number first and skip the empty check if it's not enough.
4264 if (has_minimum) {
4265 int limit = data_.u_empty_match_check.repetition_limit;
4266 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4267 }
4268 // If the match is empty we bail out, otherwise we fall through
4269 // to the on-success continuation.
4270 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4271 trace->backtrack());
4272 assembler->Bind(&skip_empty_check);
4273 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00004274 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004275 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004276 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004277 case POSITIVE_SUBMATCH_SUCCESS: {
4278 if (!trace->is_trivial()) {
4279 trace->Flush(compiler, this);
4280 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004281 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004282 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00004283 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004284 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004285 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004286 int clear_register_count = data_.u_submatch.clear_register_count;
4287 if (clear_register_count == 0) {
4288 on_success()->Emit(compiler, trace);
4289 return;
4290 }
4291 int clear_registers_from = data_.u_submatch.clear_register_from;
4292 Label clear_registers_backtrack;
4293 Trace new_trace = *trace;
4294 new_trace.set_backtrack(&clear_registers_backtrack);
4295 on_success()->Emit(compiler, &new_trace);
4296
4297 assembler->Bind(&clear_registers_backtrack);
4298 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4299 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4300
4301 ASSERT(trace->backtrack() == NULL);
4302 assembler->Backtrack();
4303 return;
4304 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004305 default:
4306 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004307 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004308}
4309
4310
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004311void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004312 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00004313 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004314 trace->Flush(compiler, this);
4315 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004316 }
4317
ager@chromium.org32912102009-01-16 10:38:43 +00004318 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004319 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004320 ASSERT(limit_result == CONTINUE);
4321
4322 RecursionCheck rc(compiler);
4323
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004324 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004325 if (compiler->ignore_case()) {
4326 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4327 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004328 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004329 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004330 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004331 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004332}
4333
4334
4335// -------------------------------------------------------------------
4336// Dot/dotty output
4337
4338
4339#ifdef DEBUG
4340
4341
4342class DotPrinter: public NodeVisitor {
4343 public:
4344 explicit DotPrinter(bool ignore_case)
4345 : ignore_case_(ignore_case),
4346 stream_(&alloc_) { }
4347 void PrintNode(const char* label, RegExpNode* node);
4348 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004349 void PrintAttributes(RegExpNode* from);
4350 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004351 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004352#define DECLARE_VISIT(Type) \
4353 virtual void Visit##Type(Type##Node* that);
4354FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4355#undef DECLARE_VISIT
4356 private:
4357 bool ignore_case_;
4358 HeapStringAllocator alloc_;
4359 StringStream stream_;
4360};
4361
4362
4363void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4364 stream()->Add("digraph G {\n graph [label=\"");
4365 for (int i = 0; label[i]; i++) {
4366 switch (label[i]) {
4367 case '\\':
4368 stream()->Add("\\\\");
4369 break;
4370 case '"':
4371 stream()->Add("\"");
4372 break;
4373 default:
4374 stream()->Put(label[i]);
4375 break;
4376 }
4377 }
4378 stream()->Add("\"];\n");
4379 Visit(node);
4380 stream()->Add("}\n");
4381 printf("%s", *(stream()->ToCString()));
4382}
4383
4384
4385void DotPrinter::Visit(RegExpNode* node) {
4386 if (node->info()->visited) return;
4387 node->info()->visited = true;
4388 node->Accept(this);
4389}
4390
4391
4392void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004393 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
4394 Visit(on_failure);
4395}
4396
4397
4398class TableEntryBodyPrinter {
4399 public:
4400 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
4401 : stream_(stream), choice_(choice) { }
4402 void Call(uc16 from, DispatchTable::Entry entry) {
4403 OutSet* out_set = entry.out_set();
4404 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4405 if (out_set->Get(i)) {
4406 stream()->Add(" n%p:s%io%i -> n%p;\n",
4407 choice(),
4408 from,
4409 i,
4410 choice()->alternatives()->at(i).node());
4411 }
4412 }
4413 }
4414 private:
4415 StringStream* stream() { return stream_; }
4416 ChoiceNode* choice() { return choice_; }
4417 StringStream* stream_;
4418 ChoiceNode* choice_;
4419};
4420
4421
4422class TableEntryHeaderPrinter {
4423 public:
4424 explicit TableEntryHeaderPrinter(StringStream* stream)
4425 : first_(true), stream_(stream) { }
4426 void Call(uc16 from, DispatchTable::Entry entry) {
4427 if (first_) {
4428 first_ = false;
4429 } else {
4430 stream()->Add("|");
4431 }
4432 stream()->Add("{\\%k-\\%k|{", from, entry.to());
4433 OutSet* out_set = entry.out_set();
4434 int priority = 0;
4435 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4436 if (out_set->Get(i)) {
4437 if (priority > 0) stream()->Add("|");
4438 stream()->Add("<s%io%i> %i", from, i, priority);
4439 priority++;
4440 }
4441 }
4442 stream()->Add("}}");
4443 }
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00004444
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004445 private:
4446 bool first_;
4447 StringStream* stream() { return stream_; }
4448 StringStream* stream_;
4449};
4450
4451
4452class AttributePrinter {
4453 public:
4454 explicit AttributePrinter(DotPrinter* out)
4455 : out_(out), first_(true) { }
4456 void PrintSeparator() {
4457 if (first_) {
4458 first_ = false;
4459 } else {
4460 out_->stream()->Add("|");
4461 }
4462 }
4463 void PrintBit(const char* name, bool value) {
4464 if (!value) return;
4465 PrintSeparator();
4466 out_->stream()->Add("{%s}", name);
4467 }
4468 void PrintPositive(const char* name, int value) {
4469 if (value < 0) return;
4470 PrintSeparator();
4471 out_->stream()->Add("{%s|%x}", name, value);
4472 }
4473 private:
4474 DotPrinter* out_;
4475 bool first_;
4476};
4477
4478
4479void DotPrinter::PrintAttributes(RegExpNode* that) {
4480 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4481 "margin=0.1, fontsize=10, label=\"{",
4482 that);
4483 AttributePrinter printer(this);
4484 NodeInfo* info = that->info();
4485 printer.PrintBit("NI", info->follows_newline_interest);
4486 printer.PrintBit("WI", info->follows_word_interest);
4487 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004488 Label* label = that->label();
4489 if (label->is_bound())
4490 printer.PrintPositive("@", label->pos());
4491 stream()->Add("}\"];\n");
4492 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
4493 "arrowhead=none];\n", that, that);
4494}
4495
4496
4497static const bool kPrintDispatchTable = false;
4498void DotPrinter::VisitChoice(ChoiceNode* that) {
4499 if (kPrintDispatchTable) {
4500 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
4501 TableEntryHeaderPrinter header_printer(stream());
4502 that->GetTable(ignore_case_)->ForEach(&header_printer);
4503 stream()->Add("\"]\n", that);
4504 PrintAttributes(that);
4505 TableEntryBodyPrinter body_printer(stream(), that);
4506 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004507 } else {
4508 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
4509 for (int i = 0; i < that->alternatives()->length(); i++) {
4510 GuardedAlternative alt = that->alternatives()->at(i);
4511 stream()->Add(" n%p -> n%p;\n", that, alt.node());
4512 }
4513 }
4514 for (int i = 0; i < that->alternatives()->length(); i++) {
4515 GuardedAlternative alt = that->alternatives()->at(i);
4516 alt.node()->Accept(this);
4517 }
4518}
4519
4520
4521void DotPrinter::VisitText(TextNode* that) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004522 Zone* zone = that->zone();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004523 stream()->Add(" n%p [label=\"", that);
4524 for (int i = 0; i < that->elements()->length(); i++) {
4525 if (i > 0) stream()->Add(" ");
4526 TextElement elm = that->elements()->at(i);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004527 switch (elm.text_type) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004528 case TextElement::ATOM: {
4529 stream()->Add("'%w'", elm.data.u_atom->data());
4530 break;
4531 }
4532 case TextElement::CHAR_CLASS: {
4533 RegExpCharacterClass* node = elm.data.u_char_class;
4534 stream()->Add("[");
4535 if (node->is_negated())
4536 stream()->Add("^");
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004537 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4538 CharacterRange range = node->ranges(zone)->at(j);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004539 stream()->Add("%k-%k", range.from(), range.to());
4540 }
4541 stream()->Add("]");
4542 break;
4543 }
4544 default:
4545 UNREACHABLE();
4546 }
4547 }
4548 stream()->Add("\", shape=box, peripheries=2];\n");
4549 PrintAttributes(that);
4550 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4551 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004552}
4553
4554
4555void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4556 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
4557 that,
4558 that->start_register(),
4559 that->end_register());
4560 PrintAttributes(that);
4561 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4562 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004563}
4564
4565
4566void DotPrinter::VisitEnd(EndNode* that) {
4567 stream()->Add(" n%p [style=bold, shape=point];\n", that);
4568 PrintAttributes(that);
4569}
4570
4571
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004572void DotPrinter::VisitAssertion(AssertionNode* that) {
4573 stream()->Add(" n%p [", that);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004574 switch (that->assertion_type()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004575 case AssertionNode::AT_END:
4576 stream()->Add("label=\"$\", shape=septagon");
4577 break;
4578 case AssertionNode::AT_START:
4579 stream()->Add("label=\"^\", shape=septagon");
4580 break;
4581 case AssertionNode::AT_BOUNDARY:
4582 stream()->Add("label=\"\\b\", shape=septagon");
4583 break;
4584 case AssertionNode::AT_NON_BOUNDARY:
4585 stream()->Add("label=\"\\B\", shape=septagon");
4586 break;
4587 case AssertionNode::AFTER_NEWLINE:
4588 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4589 break;
4590 }
4591 stream()->Add("];\n");
4592 PrintAttributes(that);
4593 RegExpNode* successor = that->on_success();
4594 stream()->Add(" n%p -> n%p;\n", that, successor);
4595 Visit(successor);
4596}
4597
4598
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004599void DotPrinter::VisitAction(ActionNode* that) {
4600 stream()->Add(" n%p [", that);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004601 switch (that->action_type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004602 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004603 stream()->Add("label=\"$%i:=%i\", shape=octagon",
4604 that->data_.u_store_register.reg,
4605 that->data_.u_store_register.value);
4606 break;
4607 case ActionNode::INCREMENT_REGISTER:
4608 stream()->Add("label=\"$%i++\", shape=octagon",
4609 that->data_.u_increment_register.reg);
4610 break;
4611 case ActionNode::STORE_POSITION:
4612 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
4613 that->data_.u_position_register.reg);
4614 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004615 case ActionNode::BEGIN_SUBMATCH:
4616 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
4617 that->data_.u_submatch.current_position_register);
4618 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004619 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004620 stream()->Add("label=\"escape\", shape=septagon");
4621 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004622 case ActionNode::EMPTY_MATCH_CHECK:
4623 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
4624 that->data_.u_empty_match_check.start_register,
4625 that->data_.u_empty_match_check.repetition_register,
4626 that->data_.u_empty_match_check.repetition_limit);
4627 break;
4628 case ActionNode::CLEAR_CAPTURES: {
4629 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
4630 that->data_.u_clear_captures.range_from,
4631 that->data_.u_clear_captures.range_to);
4632 break;
4633 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004634 }
4635 stream()->Add("];\n");
4636 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004637 RegExpNode* successor = that->on_success();
4638 stream()->Add(" n%p -> n%p;\n", that, successor);
4639 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004640}
4641
4642
4643class DispatchTableDumper {
4644 public:
4645 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
4646 void Call(uc16 key, DispatchTable::Entry entry);
4647 StringStream* stream() { return stream_; }
4648 private:
4649 StringStream* stream_;
4650};
4651
4652
4653void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4654 stream()->Add("[%k-%k]: {", key, entry.to());
4655 OutSet* set = entry.out_set();
4656 bool first = true;
4657 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4658 if (set->Get(i)) {
4659 if (first) {
4660 first = false;
4661 } else {
4662 stream()->Add(", ");
4663 }
4664 stream()->Add("%i", i);
4665 }
4666 }
4667 stream()->Add("}\n");
4668}
4669
4670
4671void DispatchTable::Dump() {
4672 HeapStringAllocator alloc;
4673 StringStream stream(&alloc);
4674 DispatchTableDumper dumper(&stream);
4675 tree()->ForEach(&dumper);
4676 OS::PrintError("%s", *stream.ToCString());
4677}
4678
4679
4680void RegExpEngine::DotPrint(const char* label,
4681 RegExpNode* node,
4682 bool ignore_case) {
4683 DotPrinter printer(ignore_case);
4684 printer.PrintNode(label, node);
4685}
4686
4687
4688#endif // DEBUG
4689
4690
4691// -------------------------------------------------------------------
4692// Tree to graph conversion
4693
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004694RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004695 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004696 ZoneList<TextElement>* elms =
4697 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4698 elms->Add(TextElement::Atom(this), compiler->zone());
4699 return new(compiler->zone()) TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004700}
4701
4702
4703RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004704 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004705 return new(compiler->zone()) TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004706}
4707
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004708
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004709static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004710 const int* special_class,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004711 int length) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004712 length--; // Remove final 0x10000.
4713 ASSERT(special_class[length] == 0x10000);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004714 ASSERT(ranges->length() != 0);
4715 ASSERT(length != 0);
4716 ASSERT(special_class[0] != 0);
4717 if (ranges->length() != (length >> 1) + 1) {
4718 return false;
4719 }
4720 CharacterRange range = ranges->at(0);
4721 if (range.from() != 0) {
4722 return false;
4723 }
4724 for (int i = 0; i < length; i += 2) {
4725 if (special_class[i] != (range.to() + 1)) {
4726 return false;
4727 }
4728 range = ranges->at((i >> 1) + 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004729 if (special_class[i+1] != range.from()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004730 return false;
4731 }
4732 }
4733 if (range.to() != 0xffff) {
4734 return false;
4735 }
4736 return true;
4737}
4738
4739
4740static bool CompareRanges(ZoneList<CharacterRange>* ranges,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004741 const int* special_class,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004742 int length) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004743 length--; // Remove final 0x10000.
4744 ASSERT(special_class[length] == 0x10000);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004745 if (ranges->length() * 2 != length) {
4746 return false;
4747 }
4748 for (int i = 0; i < length; i += 2) {
4749 CharacterRange range = ranges->at(i >> 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004750 if (range.from() != special_class[i] ||
4751 range.to() != special_class[i + 1] - 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004752 return false;
4753 }
4754 }
4755 return true;
4756}
4757
4758
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004759bool RegExpCharacterClass::is_standard(Zone* zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004760 // TODO(lrn): Remove need for this function, by not throwing away information
4761 // along the way.
4762 if (is_negated_) {
4763 return false;
4764 }
4765 if (set_.is_standard()) {
4766 return true;
4767 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004768 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004769 set_.set_standard_set_type('s');
4770 return true;
4771 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004772 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004773 set_.set_standard_set_type('S');
4774 return true;
4775 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004776 if (CompareInverseRanges(set_.ranges(zone),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004777 kLineTerminatorRanges,
4778 kLineTerminatorRangeCount)) {
4779 set_.set_standard_set_type('.');
4780 return true;
4781 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004782 if (CompareRanges(set_.ranges(zone),
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004783 kLineTerminatorRanges,
4784 kLineTerminatorRangeCount)) {
4785 set_.set_standard_set_type('n');
4786 return true;
4787 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004788 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004789 set_.set_standard_set_type('w');
4790 return true;
4791 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004792 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004793 set_.set_standard_set_type('W');
4794 return true;
4795 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004796 return false;
4797}
4798
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004799
4800RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004801 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004802 return new(compiler->zone()) TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004803}
4804
4805
4806RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004807 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004808 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4809 int length = alternatives->length();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004810 ChoiceNode* result =
4811 new(compiler->zone()) ChoiceNode(length, compiler->zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004812 for (int i = 0; i < length; i++) {
4813 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004814 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004815 result->AddAlternative(alternative);
4816 }
4817 return result;
4818}
4819
4820
4821RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004822 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004823 return ToNode(min(),
4824 max(),
4825 is_greedy(),
4826 body(),
4827 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004828 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004829}
4830
4831
whesse@chromium.org7b260152011-06-20 15:33:18 +00004832// Scoped object to keep track of how much we unroll quantifier loops in the
4833// regexp graph generator.
4834class RegExpExpansionLimiter {
4835 public:
4836 static const int kMaxExpansionFactor = 6;
4837 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4838 : compiler_(compiler),
4839 saved_expansion_factor_(compiler->current_expansion_factor()),
4840 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4841 ASSERT(factor > 0);
4842 if (ok_to_expand_) {
4843 if (factor > kMaxExpansionFactor) {
4844 // Avoid integer overflow of the current expansion factor.
4845 ok_to_expand_ = false;
4846 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4847 } else {
4848 int new_factor = saved_expansion_factor_ * factor;
4849 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4850 compiler->set_current_expansion_factor(new_factor);
4851 }
4852 }
4853 }
4854
4855 ~RegExpExpansionLimiter() {
4856 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4857 }
4858
4859 bool ok_to_expand() { return ok_to_expand_; }
4860
4861 private:
4862 RegExpCompiler* compiler_;
4863 int saved_expansion_factor_;
4864 bool ok_to_expand_;
4865
4866 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4867};
4868
4869
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004870RegExpNode* RegExpQuantifier::ToNode(int min,
4871 int max,
4872 bool is_greedy,
4873 RegExpTree* body,
4874 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00004875 RegExpNode* on_success,
4876 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004877 // x{f, t} becomes this:
4878 //
4879 // (r++)<-.
4880 // | `
4881 // | (x)
4882 // v ^
4883 // (r=0)-->(?)---/ [if r < t]
4884 // |
4885 // [if r >= f] \----> ...
4886 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004887
4888 // 15.10.2.5 RepeatMatcher algorithm.
4889 // The parser has already eliminated the case where max is 0. In the case
4890 // where max_match is zero the parser has removed the quantifier if min was
4891 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4892
4893 // If we know that we cannot match zero length then things are a little
4894 // simpler since we don't need to make the special zero length match check
4895 // from step 2.1. If the min and max are small we can unroll a little in
4896 // this case.
4897 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4898 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4899 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00004900 bool body_can_be_empty = (body->min_match() == 0);
4901 int body_start_reg = RegExpCompiler::kNoRegister;
4902 Interval capture_registers = body->CaptureRegisters();
4903 bool needs_capture_clearing = !capture_registers.is_empty();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004904 Zone* zone = compiler->zone();
4905
ager@chromium.org32912102009-01-16 10:38:43 +00004906 if (body_can_be_empty) {
4907 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00004908 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00004909 // Only unroll if there are no captures and the body can't be
4910 // empty.
whesse@chromium.org7b260152011-06-20 15:33:18 +00004911 {
4912 RegExpExpansionLimiter limiter(
4913 compiler, min + ((max != min) ? 1 : 0));
4914 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4915 int new_max = (max == kInfinity) ? max : max - min;
4916 // Recurse once to get the loop or optional matches after the fixed
4917 // ones.
4918 RegExpNode* answer = ToNode(
4919 0, new_max, is_greedy, body, compiler, on_success, true);
4920 // Unroll the forced matches from 0 to min. This can cause chains of
4921 // TextNodes (which the parser does not generate). These should be
4922 // combined if it turns out they hinder good code generation.
4923 for (int i = 0; i < min; i++) {
4924 answer = body->ToNode(compiler, answer);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004925 }
whesse@chromium.org7b260152011-06-20 15:33:18 +00004926 return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004927 }
whesse@chromium.org7b260152011-06-20 15:33:18 +00004928 }
4929 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4930 ASSERT(max > 0); // Due to the 'if' above.
4931 RegExpExpansionLimiter limiter(compiler, max);
4932 if (limiter.ok_to_expand()) {
4933 // Unroll the optional matches up to max.
4934 RegExpNode* answer = on_success;
4935 for (int i = 0; i < max; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004936 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
whesse@chromium.org7b260152011-06-20 15:33:18 +00004937 if (is_greedy) {
4938 alternation->AddAlternative(
4939 GuardedAlternative(body->ToNode(compiler, answer)));
4940 alternation->AddAlternative(GuardedAlternative(on_success));
4941 } else {
4942 alternation->AddAlternative(GuardedAlternative(on_success));
4943 alternation->AddAlternative(
4944 GuardedAlternative(body->ToNode(compiler, answer)));
4945 }
4946 answer = alternation;
4947 if (not_at_start) alternation->set_not_at_start();
4948 }
4949 return answer;
4950 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004951 }
4952 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004953 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004954 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004955 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00004956 int reg_ctr = needs_counter
4957 ? compiler->AllocateRegister()
4958 : RegExpCompiler::kNoRegister;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004959 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4960 zone);
iposva@chromium.org245aa852009-02-10 00:49:54 +00004961 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004962 RegExpNode* loop_return = needs_counter
4963 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4964 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00004965 if (body_can_be_empty) {
4966 // If the body can be empty we need to check if it was and then
4967 // backtrack.
4968 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4969 reg_ctr,
4970 min,
4971 loop_return);
4972 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004973 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00004974 if (body_can_be_empty) {
4975 // If the body can be empty we need to store the start position
4976 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004977 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00004978 }
4979 if (needs_capture_clearing) {
4980 // Before entering the body of this loop we need to clear captures.
4981 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4982 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004983 GuardedAlternative body_alt(body_node);
4984 if (has_max) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004985 Guard* body_guard =
4986 new(zone) Guard(reg_ctr, Guard::LT, max);
4987 body_alt.AddGuard(body_guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004988 }
4989 GuardedAlternative rest_alt(on_success);
4990 if (has_min) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004991 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
4992 rest_alt.AddGuard(rest_guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004993 }
4994 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004995 center->AddLoopAlternative(body_alt);
4996 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004997 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004998 center->AddContinueAlternative(rest_alt);
4999 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005000 }
5001 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005002 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005003 } else {
5004 return center;
5005 }
5006}
5007
5008
5009RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005010 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005011 NodeInfo info;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005012 Zone* zone = compiler->zone();
5013
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005014 switch (assertion_type()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005015 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005016 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005017 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005018 return AssertionNode::AtStart(on_success);
5019 case BOUNDARY:
5020 return AssertionNode::AtBoundary(on_success);
5021 case NON_BOUNDARY:
5022 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005023 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005024 return AssertionNode::AtEnd(on_success);
5025 case END_OF_LINE: {
5026 // Compile $ in multiline regexps as an alternation with a positive
5027 // lookahead in one side and an end-of-input on the other side.
5028 // We need two registers for the lookahead.
5029 int stack_pointer_register = compiler->AllocateRegister();
5030 int position_register = compiler->AllocateRegister();
5031 // The ChoiceNode to distinguish between a newline and end-of-input.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005032 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005033 // Create a newline atom.
5034 ZoneList<CharacterRange>* newline_ranges =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005035 new(zone) ZoneList<CharacterRange>(3, zone);
5036 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5037 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5038 TextNode* newline_matcher = new(zone) TextNode(
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005039 newline_atom,
5040 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5041 position_register,
5042 0, // No captures inside.
5043 -1, // Ignored if no captures.
5044 on_success));
5045 // Create an end-of-input matcher.
5046 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5047 stack_pointer_register,
5048 position_register,
5049 newline_matcher);
5050 // Add the two alternatives to the ChoiceNode.
5051 GuardedAlternative eol_alternative(end_of_line);
5052 result->AddAlternative(eol_alternative);
5053 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5054 result->AddAlternative(end_alternative);
5055 return result;
5056 }
5057 default:
5058 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005059 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005060 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005061}
5062
5063
5064RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005065 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005066 return new(compiler->zone())
5067 BackReferenceNode(RegExpCapture::StartRegister(index()),
5068 RegExpCapture::EndRegister(index()),
5069 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005070}
5071
5072
5073RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005074 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005075 return on_success;
5076}
5077
5078
5079RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005080 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005081 int stack_pointer_register = compiler->AllocateRegister();
5082 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005083
5084 const int registers_per_capture = 2;
5085 const int register_of_first_capture = 2;
5086 int register_count = capture_count_ * registers_per_capture;
5087 int register_start =
5088 register_of_first_capture + capture_from_ * registers_per_capture;
5089
ager@chromium.org8bb60582008-12-11 12:02:20 +00005090 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005091 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005092 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005093 stack_pointer_register,
5094 position_register,
5095 body()->ToNode(
5096 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005097 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5098 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005099 register_count,
5100 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005101 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005102 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005103 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005104 // We use a ChoiceNode for a negative lookahead because it has most of
5105 // the characteristics we need. It has the body of the lookahead as its
5106 // first alternative and the expression after the lookahead of the second
5107 // alternative. If the first alternative succeeds then the
5108 // NegativeSubmatchSuccess will unwind the stack including everything the
5109 // choice node set up and backtrack. If the first alternative fails then
5110 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005111 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5112 // ChoiceNode that knows to ignore the first exit when calculating quick
5113 // checks.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005114 Zone* zone = compiler->zone();
5115
ager@chromium.org8bb60582008-12-11 12:02:20 +00005116 GuardedAlternative body_alt(
5117 body()->ToNode(
5118 compiler,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005119 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5120 position_register,
5121 register_count,
5122 register_start,
5123 zone)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005124 ChoiceNode* choice_node =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005125 new(zone) NegativeLookaheadChoiceNode(body_alt,
5126 GuardedAlternative(on_success),
5127 zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005128 return ActionNode::BeginSubmatch(stack_pointer_register,
5129 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005130 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005131 }
5132}
5133
5134
5135RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005136 RegExpNode* on_success) {
5137 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005138}
5139
5140
5141RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5142 int index,
5143 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005144 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005145 int start_reg = RegExpCapture::StartRegister(index);
5146 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005147 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00005148 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005149 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005150}
5151
5152
5153RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005154 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005155 ZoneList<RegExpTree*>* children = nodes();
5156 RegExpNode* current = on_success;
5157 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005158 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005159 }
5160 return current;
5161}
5162
5163
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005164static void AddClass(const int* elmv,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005165 int elmc,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005166 ZoneList<CharacterRange>* ranges,
5167 Zone* zone) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005168 elmc--;
5169 ASSERT(elmv[elmc] == 0x10000);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005170 for (int i = 0; i < elmc; i += 2) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005171 ASSERT(elmv[i] < elmv[i + 1]);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005172 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005173 }
5174}
5175
5176
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005177static void AddClassNegated(const int *elmv,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005178 int elmc,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005179 ZoneList<CharacterRange>* ranges,
5180 Zone* zone) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005181 elmc--;
5182 ASSERT(elmv[elmc] == 0x10000);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005183 ASSERT(elmv[0] != 0x0000);
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005184 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005185 uc16 last = 0x0000;
5186 for (int i = 0; i < elmc; i += 2) {
5187 ASSERT(last <= elmv[i] - 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005188 ASSERT(elmv[i] < elmv[i + 1]);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005189 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005190 last = elmv[i + 1];
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005191 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005192 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005193}
5194
5195
5196void CharacterRange::AddClassEscape(uc16 type,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005197 ZoneList<CharacterRange>* ranges,
5198 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005199 switch (type) {
5200 case 's':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005201 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005202 break;
5203 case 'S':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005204 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005205 break;
5206 case 'w':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005207 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005208 break;
5209 case 'W':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005210 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005211 break;
5212 case 'd':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005213 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005214 break;
5215 case 'D':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005216 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005217 break;
5218 case '.':
5219 AddClassNegated(kLineTerminatorRanges,
5220 kLineTerminatorRangeCount,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005221 ranges,
5222 zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005223 break;
5224 // This is not a character range as defined by the spec but a
5225 // convenient shorthand for a character class that matches any
5226 // character.
5227 case '*':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005228 ranges->Add(CharacterRange::Everything(), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005229 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005230 // This is the set of characters matched by the $ and ^ symbols
5231 // in multiline mode.
5232 case 'n':
5233 AddClass(kLineTerminatorRanges,
5234 kLineTerminatorRangeCount,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005235 ranges,
5236 zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005237 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005238 default:
5239 UNREACHABLE();
5240 }
5241}
5242
5243
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005244Vector<const int> CharacterRange::GetWordBounds() {
5245 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005246}
5247
5248
5249class CharacterRangeSplitter {
5250 public:
5251 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005252 ZoneList<CharacterRange>** excluded,
5253 Zone* zone)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005254 : included_(included),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005255 excluded_(excluded),
5256 zone_(zone) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005257 void Call(uc16 from, DispatchTable::Entry entry);
5258
5259 static const int kInBase = 0;
5260 static const int kInOverlay = 1;
5261
5262 private:
5263 ZoneList<CharacterRange>** included_;
5264 ZoneList<CharacterRange>** excluded_;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005265 Zone* zone_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005266};
5267
5268
5269void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5270 if (!entry.out_set()->Get(kInBase)) return;
5271 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5272 ? included_
5273 : excluded_;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005274 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5275 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005276}
5277
5278
5279void CharacterRange::Split(ZoneList<CharacterRange>* base,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005280 Vector<const int> overlay,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005281 ZoneList<CharacterRange>** included,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005282 ZoneList<CharacterRange>** excluded,
5283 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005284 ASSERT_EQ(NULL, *included);
5285 ASSERT_EQ(NULL, *excluded);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005286 DispatchTable table(zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005287 for (int i = 0; i < base->length(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005288 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005289 for (int i = 0; i < overlay.length(); i += 2) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005290 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005291 CharacterRangeSplitter::kInOverlay, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005292 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005293 CharacterRangeSplitter callback(included, excluded, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005294 table.ForEach(&callback);
5295}
5296
5297
ager@chromium.org38e4c712009-11-11 09:11:58 +00005298void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005299 bool is_ascii,
5300 Zone* zone) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005301 Isolate* isolate = Isolate::Current();
ager@chromium.org38e4c712009-11-11 09:11:58 +00005302 uc16 bottom = from();
5303 uc16 top = to();
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00005304 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00005305 if (bottom > String::kMaxOneByteCharCode) return;
5306 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
ager@chromium.org38e4c712009-11-11 09:11:58 +00005307 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005308 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005309 if (top == bottom) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005310 // If this is a singleton we just expand the one character.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005311 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005312 for (int i = 0; i < length; i++) {
5313 uc32 chr = chars[i];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005314 if (chr != bottom) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005315 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005316 }
5317 }
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005318 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005319 // If this is a range we expand the characters block by block,
5320 // expanding contiguous subranges (blocks) one at a time.
5321 // The approach is as follows. For a given start character we
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005322 // look up the remainder of the block that contains it (represented
5323 // by the end point), for instance we find 'z' if the character
5324 // is 'c'. A block is characterized by the property
5325 // that all characters uncanonicalize in the same way, except that
5326 // each entry in the result is incremented by the distance from the first
5327 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5328 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5329 // Once we've found the end point we look up its uncanonicalization
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005330 // and produce a range for each element. For instance for [c-f]
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005331 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005332 // add a range if it is not already contained in the input, so [c-f]
5333 // will be skipped but [C-F] will be added. If this range is not
5334 // completely contained in a block we do this for all the blocks
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005335 // covered by the range (handling characters that is not in a block
5336 // as a "singleton block").
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005337 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005338 int pos = bottom;
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00005339 while (pos <= top) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005340 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005341 uc16 block_end;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005342 if (length == 0) {
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005343 block_end = pos;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005344 } else {
5345 ASSERT_EQ(1, length);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005346 block_end = range[0];
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005347 }
ager@chromium.org38e4c712009-11-11 09:11:58 +00005348 int end = (block_end > top) ? top : block_end;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005349 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005350 for (int i = 0; i < length; i++) {
5351 uc32 c = range[i];
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005352 uc16 range_from = c - (block_end - pos);
5353 uc16 range_to = c - (block_end - end);
ager@chromium.org38e4c712009-11-11 09:11:58 +00005354 if (!(bottom <= range_from && range_to <= top)) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005355 ranges->Add(CharacterRange(range_from, range_to), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005356 }
5357 }
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005358 pos = end + 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005359 }
ager@chromium.org38e4c712009-11-11 09:11:58 +00005360 }
5361}
5362
5363
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005364bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5365 ASSERT_NOT_NULL(ranges);
5366 int n = ranges->length();
5367 if (n <= 1) return true;
5368 int max = ranges->at(0).to();
5369 for (int i = 1; i < n; i++) {
5370 CharacterRange next_range = ranges->at(i);
5371 if (next_range.from() <= max + 1) return false;
5372 max = next_range.to();
5373 }
5374 return true;
5375}
5376
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005377
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005378ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005379 if (ranges_ == NULL) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005380 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5381 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005382 }
5383 return ranges_;
5384}
5385
5386
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005387// Move a number of elements in a zonelist to another position
5388// in the same list. Handles overlapping source and target areas.
5389static void MoveRanges(ZoneList<CharacterRange>* list,
5390 int from,
5391 int to,
5392 int count) {
5393 // Ranges are potentially overlapping.
5394 if (from < to) {
5395 for (int i = count - 1; i >= 0; i--) {
5396 list->at(to + i) = list->at(from + i);
5397 }
5398 } else {
5399 for (int i = 0; i < count; i++) {
5400 list->at(to + i) = list->at(from + i);
5401 }
5402 }
5403}
5404
5405
5406static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5407 int count,
5408 CharacterRange insert) {
5409 // Inserts a range into list[0..count[, which must be sorted
5410 // by from value and non-overlapping and non-adjacent, using at most
5411 // list[0..count] for the result. Returns the number of resulting
5412 // canonicalized ranges. Inserting a range may collapse existing ranges into
5413 // fewer ranges, so the return value can be anything in the range 1..count+1.
5414 uc16 from = insert.from();
5415 uc16 to = insert.to();
5416 int start_pos = 0;
5417 int end_pos = count;
5418 for (int i = count - 1; i >= 0; i--) {
5419 CharacterRange current = list->at(i);
5420 if (current.from() > to + 1) {
5421 end_pos = i;
5422 } else if (current.to() + 1 < from) {
5423 start_pos = i + 1;
5424 break;
5425 }
5426 }
5427
5428 // Inserted range overlaps, or is adjacent to, ranges at positions
5429 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5430 // not affected by the insertion.
5431 // If start_pos == end_pos, the range must be inserted before start_pos.
5432 // if start_pos < end_pos, the entire range from start_pos to end_pos
5433 // must be merged with the insert range.
5434
5435 if (start_pos == end_pos) {
5436 // Insert between existing ranges at position start_pos.
5437 if (start_pos < count) {
5438 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5439 }
5440 list->at(start_pos) = insert;
5441 return count + 1;
5442 }
5443 if (start_pos + 1 == end_pos) {
5444 // Replace single existing range at position start_pos.
5445 CharacterRange to_replace = list->at(start_pos);
5446 int new_from = Min(to_replace.from(), from);
5447 int new_to = Max(to_replace.to(), to);
5448 list->at(start_pos) = CharacterRange(new_from, new_to);
5449 return count;
5450 }
5451 // Replace a number of existing ranges from start_pos to end_pos - 1.
5452 // Move the remaining ranges down.
5453
5454 int new_from = Min(list->at(start_pos).from(), from);
5455 int new_to = Max(list->at(end_pos - 1).to(), to);
5456 if (end_pos < count) {
5457 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5458 }
5459 list->at(start_pos) = CharacterRange(new_from, new_to);
5460 return count - (end_pos - start_pos) + 1;
5461}
5462
5463
5464void CharacterSet::Canonicalize() {
5465 // Special/default classes are always considered canonical. The result
5466 // of calling ranges() will be sorted.
5467 if (ranges_ == NULL) return;
5468 CharacterRange::Canonicalize(ranges_);
5469}
5470
5471
5472void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5473 if (character_ranges->length() <= 1) return;
5474 // Check whether ranges are already canonical (increasing, non-overlapping,
5475 // non-adjacent).
5476 int n = character_ranges->length();
5477 int max = character_ranges->at(0).to();
5478 int i = 1;
5479 while (i < n) {
5480 CharacterRange current = character_ranges->at(i);
5481 if (current.from() <= max + 1) {
5482 break;
5483 }
5484 max = current.to();
5485 i++;
5486 }
5487 // Canonical until the i'th range. If that's all of them, we are done.
5488 if (i == n) return;
5489
5490 // The ranges at index i and forward are not canonicalized. Make them so by
5491 // doing the equivalent of insertion sort (inserting each into the previous
5492 // list, in order).
5493 // Notice that inserting a range can reduce the number of ranges in the
5494 // result due to combining of adjacent and overlapping ranges.
5495 int read = i; // Range to insert.
5496 int num_canonical = i; // Length of canonicalized part of list.
5497 do {
5498 num_canonical = InsertRangeInCanonicalList(character_ranges,
5499 num_canonical,
5500 character_ranges->at(read));
5501 read++;
5502 } while (read < n);
5503 character_ranges->Rewind(num_canonical);
5504
5505 ASSERT(CharacterRange::IsCanonical(character_ranges));
5506}
5507
5508
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005509void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005510 ZoneList<CharacterRange>* negated_ranges,
5511 Zone* zone) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005512 ASSERT(CharacterRange::IsCanonical(ranges));
5513 ASSERT_EQ(0, negated_ranges->length());
5514 int range_count = ranges->length();
5515 uc16 from = 0;
5516 int i = 0;
5517 if (range_count > 0 && ranges->at(0).from() == 0) {
5518 from = ranges->at(0).to();
5519 i = 1;
5520 }
5521 while (i < range_count) {
5522 CharacterRange range = ranges->at(i);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005523 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005524 from = range.to();
5525 i++;
5526 }
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005527 if (from < String::kMaxUtf16CodeUnit) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005528 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5529 zone);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005530 }
5531}
5532
5533
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005534// -------------------------------------------------------------------
5535// Splay tree
5536
5537
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005538OutSet* OutSet::Extend(unsigned value, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005539 if (Get(value))
5540 return this;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005541 if (successors(zone) != NULL) {
5542 for (int i = 0; i < successors(zone)->length(); i++) {
5543 OutSet* successor = successors(zone)->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005544 if (successor->Get(value))
5545 return successor;
5546 }
5547 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005548 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005549 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005550 OutSet* result = new(zone) OutSet(first_, remaining_);
5551 result->Set(value, zone);
5552 successors(zone)->Add(result, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005553 return result;
5554}
5555
5556
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005557void OutSet::Set(unsigned value, Zone *zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005558 if (value < kFirstLimit) {
5559 first_ |= (1 << value);
5560 } else {
5561 if (remaining_ == NULL)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005562 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005563 if (remaining_->is_empty() || !remaining_->Contains(value))
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005564 remaining_->Add(value, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005565 }
5566}
5567
5568
5569bool OutSet::Get(unsigned value) {
5570 if (value < kFirstLimit) {
5571 return (first_ & (1 << value)) != 0;
5572 } else if (remaining_ == NULL) {
5573 return false;
5574 } else {
5575 return remaining_->Contains(value);
5576 }
5577}
5578
5579
5580const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005581
5582
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005583void DispatchTable::AddRange(CharacterRange full_range, int value,
5584 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005585 CharacterRange current = full_range;
5586 if (tree()->is_empty()) {
5587 // If this is the first range we just insert into the table.
5588 ZoneSplayTree<Config>::Locator loc;
5589 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005590 loc.set_value(Entry(current.from(), current.to(),
5591 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005592 return;
5593 }
5594 // First see if there is a range to the left of this one that
5595 // overlaps.
5596 ZoneSplayTree<Config>::Locator loc;
5597 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5598 Entry* entry = &loc.value();
5599 // If we've found a range that overlaps with this one, and it
5600 // starts strictly to the left of this one, we have to fix it
5601 // because the following code only handles ranges that start on
5602 // or after the start point of the range we're adding.
5603 if (entry->from() < current.from() && entry->to() >= current.from()) {
5604 // Snap the overlapping range in half around the start point of
5605 // the range we're adding.
5606 CharacterRange left(entry->from(), current.from() - 1);
5607 CharacterRange right(current.from(), entry->to());
5608 // The left part of the overlapping range doesn't overlap.
5609 // Truncate the whole entry to be just the left part.
5610 entry->set_to(left.to());
5611 // The right part is the one that overlaps. We add this part
5612 // to the map and let the next step deal with merging it with
5613 // the range we're adding.
5614 ZoneSplayTree<Config>::Locator loc;
5615 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
5616 loc.set_value(Entry(right.from(),
5617 right.to(),
5618 entry->out_set()));
5619 }
5620 }
5621 while (current.is_valid()) {
5622 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5623 (loc.value().from() <= current.to()) &&
5624 (loc.value().to() >= current.from())) {
5625 Entry* entry = &loc.value();
5626 // We have overlap. If there is space between the start point of
5627 // the range we're adding and where the overlapping range starts
5628 // then we have to add a range covering just that space.
5629 if (current.from() < entry->from()) {
5630 ZoneSplayTree<Config>::Locator ins;
5631 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5632 ins.set_value(Entry(current.from(),
5633 entry->from() - 1,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005634 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005635 current.set_from(entry->from());
5636 }
5637 ASSERT_EQ(current.from(), entry->from());
5638 // If the overlapping range extends beyond the one we want to add
5639 // we have to snap the right part off and add it separately.
5640 if (entry->to() > current.to()) {
5641 ZoneSplayTree<Config>::Locator ins;
5642 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
5643 ins.set_value(Entry(current.to() + 1,
5644 entry->to(),
5645 entry->out_set()));
5646 entry->set_to(current.to());
5647 }
5648 ASSERT(entry->to() <= current.to());
5649 // The overlapping range is now completely contained by the range
5650 // we're adding so we can just update it and move the start point
5651 // of the range we're adding just past it.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005652 entry->AddValue(value, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005653 // Bail out if the last interval ended at 0xFFFF since otherwise
5654 // adding 1 will wrap around to 0.
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005655 if (entry->to() == String::kMaxUtf16CodeUnit)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005656 break;
5657 ASSERT(entry->to() + 1 > current.from());
5658 current.set_from(entry->to() + 1);
5659 } else {
5660 // There is no overlap so we can just add the range
5661 ZoneSplayTree<Config>::Locator ins;
5662 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5663 ins.set_value(Entry(current.from(),
5664 current.to(),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005665 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005666 break;
5667 }
5668 }
5669}
5670
5671
5672OutSet* DispatchTable::Get(uc16 value) {
5673 ZoneSplayTree<Config>::Locator loc;
5674 if (!tree()->FindGreatestLessThan(value, &loc))
5675 return empty();
5676 Entry* entry = &loc.value();
5677 if (value <= entry->to())
5678 return entry->out_set();
5679 else
5680 return empty();
5681}
5682
5683
5684// -------------------------------------------------------------------
5685// Analysis
5686
5687
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005688void Analysis::EnsureAnalyzed(RegExpNode* that) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005689 StackLimitCheck check(Isolate::Current());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005690 if (check.HasOverflowed()) {
5691 fail("Stack overflow");
5692 return;
5693 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005694 if (that->info()->been_analyzed || that->info()->being_analyzed)
5695 return;
5696 that->info()->being_analyzed = true;
5697 that->Accept(this);
5698 that->info()->being_analyzed = false;
5699 that->info()->been_analyzed = true;
5700}
5701
5702
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005703void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005704 // nothing to do
5705}
5706
5707
ager@chromium.org8bb60582008-12-11 12:02:20 +00005708void TextNode::CalculateOffsets() {
5709 int element_count = elements()->length();
5710 // Set up the offsets of the elements relative to the start. This is a fixed
5711 // quantity since a TextNode can only contain fixed-width things.
5712 int cp_offset = 0;
5713 for (int i = 0; i < element_count; i++) {
5714 TextElement& elm = elements()->at(i);
5715 elm.cp_offset = cp_offset;
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005716 if (elm.text_type == TextElement::ATOM) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005717 cp_offset += elm.data.u_atom->data().length();
5718 } else {
5719 cp_offset++;
ager@chromium.org8bb60582008-12-11 12:02:20 +00005720 }
5721 }
5722}
5723
5724
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005725void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005726 if (ignore_case_) {
ager@chromium.org38e4c712009-11-11 09:11:58 +00005727 that->MakeCaseIndependent(is_ascii_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005728 }
5729 EnsureAnalyzed(that->on_success());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005730 if (!has_failed()) {
5731 that->CalculateOffsets();
5732 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005733}
5734
5735
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005736void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005737 RegExpNode* target = that->on_success();
5738 EnsureAnalyzed(target);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005739 if (!has_failed()) {
5740 // If the next node is interested in what it follows then this node
5741 // has to be interested too so it can pass the information on.
5742 that->info()->AddFromFollowing(target->info());
5743 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005744}
5745
5746
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005747void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005748 NodeInfo* info = that->info();
5749 for (int i = 0; i < that->alternatives()->length(); i++) {
5750 RegExpNode* node = that->alternatives()->at(i).node();
5751 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005752 if (has_failed()) return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005753 // Anything the following nodes need to know has to be known by
5754 // this node also, so it can pass it on.
5755 info->AddFromFollowing(node->info());
5756 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005757}
5758
5759
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005760void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5761 NodeInfo* info = that->info();
5762 for (int i = 0; i < that->alternatives()->length(); i++) {
5763 RegExpNode* node = that->alternatives()->at(i).node();
5764 if (node != that->loop_node()) {
5765 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005766 if (has_failed()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005767 info->AddFromFollowing(node->info());
5768 }
5769 }
5770 // Check the loop last since it may need the value of this node
5771 // to get a correct result.
5772 EnsureAnalyzed(that->loop_node());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005773 if (!has_failed()) {
5774 info->AddFromFollowing(that->loop_node()->info());
5775 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005776}
5777
5778
5779void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005780 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005781}
5782
5783
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005784void Analysis::VisitAssertion(AssertionNode* that) {
5785 EnsureAnalyzed(that->on_success());
5786}
5787
5788
verwaest@chromium.org37141392012-05-31 13:27:02 +00005789void BackReferenceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005790 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005791 BoyerMooreLookahead* bm,
5792 bool not_at_start) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005793 // Working out the set of characters that a backreference can match is too
5794 // hard, so we just say that any character can match.
5795 bm->SetRest(offset);
5796 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005797}
5798
5799
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005800STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5801 RegExpMacroAssembler::kTableSize);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005802
5803
verwaest@chromium.org37141392012-05-31 13:27:02 +00005804void ChoiceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005805 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005806 BoyerMooreLookahead* bm,
5807 bool not_at_start) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005808 ZoneList<GuardedAlternative>* alts = alternatives();
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005809 budget = (budget - 1) / alts->length();
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005810 for (int i = 0; i < alts->length(); i++) {
5811 GuardedAlternative& alt = alts->at(i);
5812 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5813 bm->SetRest(offset); // Give up trying to fill in info.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005814 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005815 return;
5816 }
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00005817 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005818 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005819 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005820}
5821
5822
verwaest@chromium.org37141392012-05-31 13:27:02 +00005823void TextNode::FillInBMInfo(int initial_offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005824 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005825 BoyerMooreLookahead* bm,
5826 bool not_at_start) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005827 if (initial_offset >= bm->length()) return;
5828 int offset = initial_offset;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005829 int max_char = bm->max_char();
5830 for (int i = 0; i < elements()->length(); i++) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005831 if (offset >= bm->length()) {
5832 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5833 return;
5834 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005835 TextElement text = elements()->at(i);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005836 if (text.text_type == TextElement::ATOM) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005837 RegExpAtom* atom = text.data.u_atom;
5838 for (int j = 0; j < atom->length(); j++, offset++) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005839 if (offset >= bm->length()) {
5840 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5841 return;
5842 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005843 uc16 character = atom->data()[j];
5844 if (bm->compiler()->ignore_case()) {
5845 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5846 int length = GetCaseIndependentLetters(
5847 ISOLATE,
5848 character,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00005849 bm->max_char() == String::kMaxOneByteCharCode,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005850 chars);
5851 for (int j = 0; j < length; j++) {
5852 bm->Set(offset, chars[j]);
5853 }
5854 } else {
5855 if (character <= max_char) bm->Set(offset, character);
5856 }
5857 }
5858 } else {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005859 ASSERT(text.text_type == TextElement::CHAR_CLASS);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005860 RegExpCharacterClass* char_class = text.data.u_char_class;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005861 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005862 if (char_class->is_negated()) {
5863 bm->SetAll(offset);
5864 } else {
5865 for (int k = 0; k < ranges->length(); k++) {
5866 CharacterRange& range = ranges->at(k);
5867 if (range.from() > max_char) continue;
5868 int to = Min(max_char, static_cast<int>(range.to()));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005869 bm->SetInterval(offset, Interval(range.from(), to));
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005870 }
5871 }
5872 offset++;
5873 }
5874 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005875 if (offset >= bm->length()) {
5876 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5877 return;
5878 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005879 on_success()->FillInBMInfo(offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005880 budget - 1,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005881 bm,
5882 true); // Not at start after a text node.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005883 if (initial_offset == 0) set_bm_info(not_at_start, bm);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005884}
5885
5886
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005887// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005888// Dispatch table construction
5889
5890
5891void DispatchTableConstructor::VisitEnd(EndNode* that) {
5892 AddRange(CharacterRange::Everything());
5893}
5894
5895
5896void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5897 node->set_being_calculated(true);
5898 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5899 for (int i = 0; i < alternatives->length(); i++) {
5900 set_choice_index(i);
5901 alternatives->at(i).node()->Accept(this);
5902 }
5903 node->set_being_calculated(false);
5904}
5905
5906
5907class AddDispatchRange {
5908 public:
5909 explicit AddDispatchRange(DispatchTableConstructor* constructor)
5910 : constructor_(constructor) { }
5911 void Call(uc32 from, DispatchTable::Entry entry);
5912 private:
5913 DispatchTableConstructor* constructor_;
5914};
5915
5916
5917void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5918 CharacterRange range(from, entry.to());
5919 constructor_->AddRange(range);
5920}
5921
5922
5923void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5924 if (node->being_calculated())
5925 return;
5926 DispatchTable* table = node->GetTable(ignore_case_);
5927 AddDispatchRange adder(this);
5928 table->ForEach(&adder);
5929}
5930
5931
5932void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5933 // TODO(160): Find the node that we refer back to and propagate its start
5934 // set back to here. For now we just accept anything.
5935 AddRange(CharacterRange::Everything());
5936}
5937
5938
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005939void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5940 RegExpNode* target = that->on_success();
5941 target->Accept(this);
5942}
5943
5944
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005945static int CompareRangeByFrom(const CharacterRange* a,
5946 const CharacterRange* b) {
5947 return Compare<uc16>(a->from(), b->from());
5948}
5949
5950
5951void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5952 ranges->Sort(CompareRangeByFrom);
5953 uc16 last = 0;
5954 for (int i = 0; i < ranges->length(); i++) {
5955 CharacterRange range = ranges->at(i);
5956 if (last < range.from())
5957 AddRange(CharacterRange(last, range.from() - 1));
5958 if (range.to() >= last) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005959 if (range.to() == String::kMaxUtf16CodeUnit) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005960 return;
5961 } else {
5962 last = range.to() + 1;
5963 }
5964 }
5965 }
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005966 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005967}
5968
5969
5970void DispatchTableConstructor::VisitText(TextNode* that) {
5971 TextElement elm = that->elements()->at(0);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005972 switch (elm.text_type) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005973 case TextElement::ATOM: {
5974 uc16 c = elm.data.u_atom->data()[0];
5975 AddRange(CharacterRange(c, c));
5976 break;
5977 }
5978 case TextElement::CHAR_CLASS: {
5979 RegExpCharacterClass* tree = elm.data.u_char_class;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005980 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005981 if (tree->is_negated()) {
5982 AddInverse(ranges);
5983 } else {
5984 for (int i = 0; i < ranges->length(); i++)
5985 AddRange(ranges->at(i));
5986 }
5987 break;
5988 }
5989 default: {
5990 UNIMPLEMENTED();
5991 }
5992 }
5993}
5994
5995
5996void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005997 RegExpNode* target = that->on_success();
5998 target->Accept(this);
5999}
6000
6001
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00006002RegExpEngine::CompilationResult RegExpEngine::Compile(
6003 RegExpCompileData* data,
6004 bool ignore_case,
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00006005 bool is_global,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00006006 bool is_multiline,
6007 Handle<String> pattern,
6008 Handle<String> sample_subject,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00006009 bool is_ascii,
6010 Zone* zone) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006011 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00006012 return IrregexpRegExpTooBig();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006013 }
rossberg@chromium.org400388e2012-06-06 09:29:22 +00006014 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00006015
6016 // Sample some characters from the middle of the string.
6017 static const int kSampleSize = 128;
6018
6019 FlattenString(sample_subject);
6020 int chars_sampled = 0;
6021 int half_way = (sample_subject->length() - kSampleSize) / 2;
6022 for (int i = Max(0, half_way);
6023 i < sample_subject->length() && chars_sampled < kSampleSize;
6024 i++, chars_sampled++) {
6025 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6026 }
6027
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006028 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00006029 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006030 0,
6031 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00006032 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006033 RegExpNode* node = captured_body;
whesse@chromium.org4a5224e2010-10-20 12:37:07 +00006034 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6035 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6036 int max_length = data->tree->max_match();
6037 if (!is_start_anchored) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006038 // Add a .*? at the beginning, outside the body capture, unless
6039 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00006040 RegExpNode* loop_node =
6041 RegExpQuantifier::ToNode(0,
6042 RegExpTree::kInfinity,
6043 false,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006044 new(zone) RegExpCharacterClass('*'),
iposva@chromium.org245aa852009-02-10 00:49:54 +00006045 &compiler,
6046 captured_body,
6047 data->contains_anchor);
6048
6049 if (data->contains_anchor) {
6050 // Unroll loop once, to take care of the case that might start
6051 // at the start of input.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006052 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
iposva@chromium.org245aa852009-02-10 00:49:54 +00006053 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6054 first_step_node->AddAlternative(GuardedAlternative(
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006055 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
iposva@chromium.org245aa852009-02-10 00:49:54 +00006056 node = first_step_node;
6057 } else {
6058 node = loop_node;
6059 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006060 }
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006061 if (is_ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00006062 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006063 // Do it again to propagate the new nodes to places where they were not
6064 // put because they had not been calculated yet.
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00006065 if (node != NULL) {
6066 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6067 }
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006068 }
danno@chromium.org1044a4d2012-04-30 12:34:39 +00006069
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006070 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00006071 data->node = node;
ager@chromium.org38e4c712009-11-11 09:11:58 +00006072 Analysis analysis(ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006073 analysis.EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00006074 if (analysis.has_failed()) {
6075 const char* error_message = analysis.error_message();
6076 return CompilationResult(error_message);
6077 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006078
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006079 // Create the correct assembler for the architecture.
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006080#ifndef V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006081 // Native regexp implementation.
6082
6083 NativeRegExpMacroAssembler::Mode mode =
6084 is_ascii ? NativeRegExpMacroAssembler::ASCII
6085 : NativeRegExpMacroAssembler::UC16;
6086
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006087#if V8_TARGET_ARCH_IA32
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006088 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6089 zone);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006090#elif V8_TARGET_ARCH_X64
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006091 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6092 zone);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006093#elif V8_TARGET_ARCH_ARM
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006094 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6095 zone);
lrn@chromium.org7516f052011-03-30 08:52:27 +00006096#elif V8_TARGET_ARCH_MIPS
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006097 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6098 zone);
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006099#endif
6100
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006101#else // V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006102 // Interpreted regexp implementation.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006103 EmbeddedVector<byte, 1024> codes;
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +00006104 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006105#endif // V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006106
whesse@chromium.org4a5224e2010-10-20 12:37:07 +00006107 // Inserted here, instead of in Assembler, because it depends on information
6108 // in the AST that isn't replicated in the Node structure.
6109 static const int kMaxBacksearchLimit = 1024;
6110 if (is_end_anchored &&
6111 !is_start_anchored &&
6112 max_length < kMaxBacksearchLimit) {
6113 macro_assembler.SetCurrentPositionFromEnd(max_length);
6114 }
6115
rossberg@chromium.org400388e2012-06-06 09:29:22 +00006116 if (is_global) {
6117 macro_assembler.set_global_mode(
6118 (data->tree->min_match() > 0)
6119 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6120 : RegExpMacroAssembler::GLOBAL);
6121 }
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00006122
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006123 return compiler.Assemble(&macro_assembler,
6124 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00006125 data->capture_count,
6126 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006127}
6128
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00006129
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00006130}} // namespace v8::internal