blob: 3a3d91599c3ea045893f7b0ef773c6533499ed5a [file] [log] [blame]
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00001// Copyright 2012 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000031#include "compiler.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000032#include "execution.h"
33#include "factory.h"
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +000034#include "jsregexp.h"
mmassi@chromium.org2f0efde2013-02-06 14:12:58 +000035#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "platform.h"
kasperl@chromium.orga5551262010-12-07 12:49:48 +000037#include "string-search.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000038#include "runtime.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000039#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000040#include "string-stream.h"
41#include "parser.h"
42#include "regexp-macro-assembler.h"
43#include "regexp-macro-assembler-tracer.h"
44#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000045#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000046
ricow@chromium.orgc9c80822010-04-21 08:22:37 +000047#ifndef V8_INTERPRETED_REGEXP
kasperl@chromium.org71affb52009-05-26 05:44:31 +000048#if V8_TARGET_ARCH_IA32
ager@chromium.org3a37e9b2009-04-27 09:26:21 +000049#include "ia32/regexp-macro-assembler-ia32.h"
ager@chromium.org9085a012009-05-11 19:22:57 +000050#elif V8_TARGET_ARCH_X64
ager@chromium.org9085a012009-05-11 19:22:57 +000051#include "x64/regexp-macro-assembler-x64.h"
52#elif V8_TARGET_ARCH_ARM
53#include "arm/regexp-macro-assembler-arm.h"
lrn@chromium.org7516f052011-03-30 08:52:27 +000054#elif V8_TARGET_ARCH_MIPS
55#include "mips/regexp-macro-assembler-mips.h"
kasperl@chromium.org2abc4502009-07-02 07:00:29 +000056#else
57#error Unsupported target architecture.
ager@chromium.orga74f0da2008-12-03 16:05:52 +000058#endif
sgjesse@chromium.org911335c2009-08-19 12:59:44 +000059#endif
ager@chromium.orga74f0da2008-12-03 16:05:52 +000060
61#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000062
ager@chromium.orga74f0da2008-12-03 16:05:52 +000063
kasperl@chromium.org71affb52009-05-26 05:44:31 +000064namespace v8 {
65namespace internal {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000066
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000067Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
68 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000069 Handle<String> flags,
70 bool* has_pending_exception) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000071 // Call the construct code with 2 arguments.
svenpanne@chromium.orga8bb4d92011-10-10 13:20:40 +000072 Handle<Object> argv[] = { pattern, flags };
73 return Execution::New(constructor, ARRAY_SIZE(argv), argv,
74 has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000075}
76
77
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000078static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
79 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000080 for (int i = 0; i < str->length(); i++) {
81 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000082 case 'i':
83 flags |= JSRegExp::IGNORE_CASE;
84 break;
85 case 'g':
86 flags |= JSRegExp::GLOBAL;
87 break;
88 case 'm':
89 flags |= JSRegExp::MULTILINE;
90 break;
91 }
92 }
93 return JSRegExp::Flags(flags);
94}
95
96
ager@chromium.orga74f0da2008-12-03 16:05:52 +000097static inline void ThrowRegExpException(Handle<JSRegExp> re,
98 Handle<String> pattern,
99 Handle<String> error_text,
100 const char* message) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000101 Isolate* isolate = re->GetIsolate();
102 Factory* factory = isolate->factory();
103 Handle<FixedArray> elements = factory->NewFixedArray(2);
karlklose@chromium.org8f806e82011-03-07 14:06:08 +0000104 elements->set(0, *pattern);
105 elements->set(1, *error_text);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000106 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
107 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
108 isolate->Throw(*regexp_err);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000109}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000110
111
erik.corry@gmail.comed49e962012-04-17 11:57:53 +0000112ContainedInLattice AddRange(ContainedInLattice containment,
113 const int* ranges,
114 int ranges_length,
115 Interval new_range) {
116 ASSERT((ranges_length & 1) == 1);
117 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
118 if (containment == kLatticeUnknown) return containment;
119 bool inside = false;
120 int last = 0;
121 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
122 // Consider the range from last to ranges[i].
123 // We haven't got to the new range yet.
124 if (ranges[i] <= new_range.from()) continue;
125 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
126 // inclusive, but the values in ranges are not.
127 if (last <= new_range.from() && new_range.to() < ranges[i]) {
128 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
129 }
130 return kLatticeUnknown;
131 }
132 return containment;
133}
134
135
// Tuning knob for the Boyer-Moore lookahead: raising it slows down code
// generation, lowering it hurts the V8 benchmark score.
const int kMaxLookaheadForBoyerMoore = 8;
// Pattern length up to which Boyer-Moore is not considered worthwhile.  With
// a very short pattern the largest possible forward step is tiny, which does
// not always pay for the extra logic.
const int kPatternTooShortForBoyerMoore = 2;
141
142
143// Identifies the sort of regexps where the regexp engine is faster
144// than the code used for atom matches.
145static bool HasFewDifferentCharacters(Handle<String> pattern) {
146 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
147 if (length <= kPatternTooShortForBoyerMoore) return false;
148 const int kMod = 128;
149 bool character_found[kMod];
150 int different = 0;
151 memset(&character_found[0], 0, sizeof(character_found));
152 for (int i = 0; i < length; i++) {
153 int ch = (pattern->Get(i) & (kMod - 1));
154 if (!character_found[ch]) {
155 character_found[ch] = true;
156 different++;
157 // We declare a regexp low-alphabet if it has at least 3 times as many
158 // characters as it has different characters.
159 if (different * 3 > length) return false;
160 }
161 }
162 return true;
163}
164
165
ager@chromium.org8bb60582008-12-11 12:02:20 +0000166// Generic RegExp methods. Dispatches to implementation specific methods.
167
168
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000169Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
170 Handle<String> pattern,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000171 Handle<String> flag_str) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000172 Isolate* isolate = re->GetIsolate();
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000173 Zone zone(isolate);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000174 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000175 CompilationCache* compilation_cache = isolate->compilation_cache();
176 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000177 bool in_cache = !cached.is_null();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000178 LOG(isolate, RegExpCompileEvent(re, in_cache));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000179
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000180 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000181 if (in_cache) {
182 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000183 return re;
184 }
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000185 pattern = FlattenGetString(pattern);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000186 PostponeInterruptsScope postpone(isolate);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000187 RegExpCompileData parse_result;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000188 FlatStringReader reader(isolate, pattern);
fschneider@chromium.orge03fb642010-11-01 12:34:09 +0000189 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000190 &parse_result, &zone)) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000191 // Throw an exception if we fail to parse the pattern.
192 ThrowRegExpException(re,
193 pattern,
194 parse_result.error,
195 "malformed_regexp");
196 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000197 }
198
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000199 bool has_been_compiled = false;
200
201 if (parse_result.simple &&
202 !flags.is_ignore_case() &&
203 !HasFewDifferentCharacters(pattern)) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000204 // Parse-tree is a single atom that is equal to the pattern.
205 AtomCompile(re, pattern, flags, pattern);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000206 has_been_compiled = true;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000207 } else if (parse_result.tree->IsAtom() &&
208 !flags.is_ignore_case() &&
209 parse_result.capture_count == 0) {
210 RegExpAtom* atom = parse_result.tree->AsAtom();
211 Vector<const uc16> atom_pattern = atom->data();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000212 Handle<String> atom_string =
213 isolate->factory()->NewStringFromTwoByte(atom_pattern);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000214 if (!HasFewDifferentCharacters(atom_string)) {
215 AtomCompile(re, pattern, flags, atom_string);
216 has_been_compiled = true;
217 }
218 }
219 if (!has_been_compiled) {
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000220 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000221 }
222 ASSERT(re->data()->IsFixedArray());
223 // Compilation succeeded so the data is set on the regexp
224 // and we can store it in the cache.
225 Handle<FixedArray> data(FixedArray::cast(re->data()));
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000226 compilation_cache->PutRegExp(pattern, flags, data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000227
228 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000229}
230
231
232Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
233 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000234 int index,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000235 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000236 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000237 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000238 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000239 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000240 Handle<Object> result =
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000241 IrregexpExec(regexp, subject, index, last_match_info);
ulan@chromium.org812308e2012-02-29 15:58:45 +0000242 ASSERT(!result.is_null() ||
243 regexp->GetIsolate()->has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000244 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000245 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000246 default:
247 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000248 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000249 }
250}
251
252
ager@chromium.org8bb60582008-12-11 12:02:20 +0000253// RegExp Atom implementation: Simple string search using indexOf.
254
255
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000256void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
257 Handle<String> pattern,
258 JSRegExp::Flags flags,
259 Handle<String> match_pattern) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000260 re->GetIsolate()->factory()->SetRegExpAtomData(re,
261 JSRegExp::ATOM,
262 pattern,
263 flags,
264 match_pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000265}
266
267
268static void SetAtomLastCapture(FixedArray* array,
269 String* subject,
270 int from,
271 int to) {
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000272 SealHandleScope shs(array->GetIsolate());
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000273 RegExpImpl::SetLastCaptureCount(array, 2);
274 RegExpImpl::SetLastSubject(array, subject);
275 RegExpImpl::SetLastInput(array, subject);
276 RegExpImpl::SetCapture(array, 0, from);
277 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000278}
279
280
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000281int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
282 Handle<String> subject,
283 int index,
284 int32_t* output,
285 int output_size) {
286 Isolate* isolate = regexp->GetIsolate();
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000287
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000288 ASSERT(0 <= index);
289 ASSERT(index <= subject->length());
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000290
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000291 if (!subject->IsFlat()) FlattenString(subject);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000292 DisallowHeapAllocation no_gc; // ensure vectors stay valid
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000293
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000294 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000295 int needle_len = needle->length();
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000296 ASSERT(needle->IsFlat());
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000297 ASSERT_LT(0, needle_len);
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000298
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000299 if (index + needle_len > subject->length()) {
300 return RegExpImpl::RE_FAILURE;
301 }
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000302
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000303 for (int i = 0; i < output_size; i += 2) {
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000304 String::FlatContent needle_content = needle->GetFlatContent();
305 String::FlatContent subject_content = subject->GetFlatContent();
306 ASSERT(needle_content.IsFlat());
307 ASSERT(subject_content.IsFlat());
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000308 // dispatch on type of strings
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000309 index = (needle_content.IsAscii()
310 ? (subject_content.IsAscii()
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000311 ? SearchString(isolate,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000312 subject_content.ToOneByteVector(),
313 needle_content.ToOneByteVector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000314 index)
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000315 : SearchString(isolate,
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000316 subject_content.ToUC16Vector(),
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000317 needle_content.ToOneByteVector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000318 index))
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000319 : (subject_content.IsAscii()
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000320 ? SearchString(isolate,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000321 subject_content.ToOneByteVector(),
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000322 needle_content.ToUC16Vector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000323 index)
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000324 : SearchString(isolate,
ricow@chromium.orgddd545c2011-08-24 12:02:41 +0000325 subject_content.ToUC16Vector(),
326 needle_content.ToUC16Vector(),
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000327 index)));
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000328 if (index == -1) {
329 return i / 2; // Return number of matches.
330 } else {
331 output[i] = index;
332 output[i+1] = index + needle_len;
333 index += needle_len;
334 }
kasperl@chromium.orga5551262010-12-07 12:49:48 +0000335 }
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000336 return output_size / 2;
337}
ager@chromium.org7c537e22008-10-16 08:43:32 +0000338
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000339
340Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
341 Handle<String> subject,
342 int index,
343 Handle<JSArray> last_match_info) {
344 Isolate* isolate = re->GetIsolate();
345
346 static const int kNumRegisters = 2;
347 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
348 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
349
350 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
351
352 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
353
354 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000355 SealHandleScope shs(isolate);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000356 FixedArray* array = FixedArray::cast(last_match_info->elements());
357 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000358 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000359}
360
361
ager@chromium.org8bb60582008-12-11 12:02:20 +0000362// Irregexp implementation.
363
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000364// Ensures that the regexp object contains a compiled version of the
365// source for either ASCII or non-ASCII strings.
366// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000367// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000368// If compilation fails, an exception is thrown and this function
369// returns false.
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000370bool RegExpImpl::EnsureCompiledIrregexp(
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000371 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000372 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000373#ifdef V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000374 if (compiled_code->IsByteArray()) return true;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000375#else // V8_INTERPRETED_REGEXP (RegExp native code)
376 if (compiled_code->IsCode()) return true;
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000377#endif
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000378 // We could potentially have marked this as flushable, but have kept
379 // a saved version if we did not flush it yet.
380 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
381 if (saved_code->IsCode()) {
382 // Reinstate the code in the original place.
383 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
384 ASSERT(compiled_code->IsSmi());
385 return true;
386 }
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000387 return CompileIrregexp(re, sample_subject, is_ascii);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000388}
ager@chromium.org8bb60582008-12-11 12:02:20 +0000389
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000390
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000391static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
392 bool is_ascii,
393 Handle<String> error_message,
394 Isolate* isolate) {
395 Factory* factory = isolate->factory();
396 Handle<FixedArray> elements = factory->NewFixedArray(2);
397 elements->set(0, re->Pattern());
398 elements->set(1, *error_message);
399 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
400 Handle<Object> regexp_err =
401 factory->NewSyntaxError("malformed_regexp", array);
402 isolate->Throw(*regexp_err);
403 return false;
404}
405
406
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000407bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
408 Handle<String> sample_subject,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000409 bool is_ascii) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000410 // Compile the RegExp.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000411 Isolate* isolate = re->GetIsolate();
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000412 Zone zone(isolate);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000413 PostponeInterruptsScope postpone(isolate);
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000414 // If we had a compilation error the last time this is saved at the
415 // saved code index.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000416 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000417 // When arriving here entry can only be a smi, either representing an
418 // uncompiled regexp, a previous compilation error, or code that has
419 // been flushed.
420 ASSERT(entry->IsSmi());
421 int entry_value = Smi::cast(entry)->value();
422 ASSERT(entry_value == JSRegExp::kUninitializedValue ||
423 entry_value == JSRegExp::kCompilationErrorValue ||
424 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
425
426 if (entry_value == JSRegExp::kCompilationErrorValue) {
427 // A previous compilation failed and threw an error which we store in
428 // the saved code index (we store the error message, not the actual
429 // error). Recreate the error object and throw it.
430 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
431 ASSERT(error_string->IsString());
432 Handle<String> error_message(String::cast(error_string));
433 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000434 return false;
435 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000436
437 JSRegExp::Flags flags = re->GetFlags();
438
439 Handle<String> pattern(re->Pattern());
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000440 if (!pattern->IsFlat()) FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000441 RegExpCompileData compile_data;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000442 FlatStringReader reader(isolate, pattern);
fschneider@chromium.orge03fb642010-11-01 12:34:09 +0000443 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000444 &compile_data,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000445 &zone)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000446 // Throw an exception if we fail to parse the pattern.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000447 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000448 ThrowRegExpException(re,
449 pattern,
450 compile_data.error,
451 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000452 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000453 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000454 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000455 RegExpEngine::Compile(&compile_data,
456 flags.is_ignore_case(),
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +0000457 flags.is_global(),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000458 flags.is_multiline(),
459 pattern,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000460 sample_subject,
rossberg@chromium.org400388e2012-06-06 09:29:22 +0000461 is_ascii,
mstarzinger@chromium.org1510d582013-06-28 14:00:48 +0000462 &zone);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000463 if (result.error_message != NULL) {
464 // Unable to compile regexp.
karlklose@chromium.org8f806e82011-03-07 14:06:08 +0000465 Handle<String> error_message =
jkummerow@chromium.orgddda9e82011-07-06 11:27:02 +0000466 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
467 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000468 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000469 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000470
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000471 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
472 data->set(JSRegExp::code_index(is_ascii), result.code);
473 int register_max = IrregexpMaxRegisterCount(*data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000474 if (result.num_registers > register_max) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000475 SetIrregexpMaxRegisterCount(*data, result.num_registers);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000476 }
477
478 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000479}
480
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000481
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000482int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
483 return Smi::cast(
484 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000485}
486
487
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000488void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
489 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000490}
491
492
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000493int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
494 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000495}
496
497
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000498int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
499 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000500}
501
502
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000503ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000504 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000505}
506
507
508Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000509 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000510}
511
512
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000513void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
514 Handle<String> pattern,
515 JSRegExp::Flags flags,
516 int capture_count) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000517 // Initialize compiled code entries to null.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000518 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
519 JSRegExp::IRREGEXP,
520 pattern,
521 flags,
522 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523}
524
525
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000526int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000527 Handle<String> subject) {
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000528 if (!subject->IsFlat()) FlattenString(subject);
529
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000530 // Check the asciiness of the underlying storage.
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000531 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000532 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
ricow@chromium.org4668a2c2011-08-29 10:41:00 +0000533
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000534#ifdef V8_INTERPRETED_REGEXP
535 // Byte-code regexp needs space allocated for all its registers.
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000536 // The result captures are copied to the start of the registers array
537 // if the match succeeds. This way those registers are not clobbered
538 // when we set the last match info from last successful match.
539 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
540 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000541#else // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000542 // Native regexp only needs room to output captures. Registers are handled
543 // internally.
544 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000545#endif // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000546}
547
548
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000549int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
550 Handle<String> subject,
551 int index,
552 int32_t* output,
553 int output_size) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000554 Isolate* isolate = regexp->GetIsolate();
555
556 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000557
558 ASSERT(index >= 0);
559 ASSERT(index <= subject->length());
560 ASSERT(subject->IsFlat());
561
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000562 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000563
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000564#ifndef V8_INTERPRETED_REGEXP
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000565 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000566 do {
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000567 EnsureCompiledIrregexp(regexp, subject, is_ascii);
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000568 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000569 // The stack is used to allocate registers for the compiled regexp code.
570 // This means that in case of failure, the output registers array is left
571 // untouched and contains the capture results from the previous successful
572 // match. We can use that to set the last match info lazily.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000573 NativeRegExpMacroAssembler::Result res =
574 NativeRegExpMacroAssembler::Match(code,
575 subject,
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000576 output,
577 output_size,
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000578 index,
579 isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000580 if (res != NativeRegExpMacroAssembler::RETRY) {
581 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000582 isolate->has_pending_exception());
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000583 STATIC_ASSERT(
584 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
585 STATIC_ASSERT(
586 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
587 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
588 == RE_EXCEPTION);
589 return static_cast<IrregexpResult>(res);
590 }
591 // If result is RETRY, the string has changed representation, and we
592 // must restart from scratch.
593 // In this case, it means we must make sure we are prepared to handle
lrn@chromium.org32d961d2010-06-30 09:09:34 +0000594 // the, potentially, different subject (the string can switch between
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000595 // being internal and external, and even between being ASCII and UC16,
596 // but the characters are always the same).
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000597 IrregexpPrepare(regexp, subject);
ulan@chromium.org8e8d8822012-11-23 14:36:46 +0000598 is_ascii = subject->IsOneByteRepresentationUnderneath();
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000599 } while (true);
600 UNREACHABLE();
601 return RE_EXCEPTION;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000602#else // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000603
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000604 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp));
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000605 // We must have done EnsureCompiledIrregexp, so we can get the number of
606 // registers.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000607 int number_of_capture_registers =
608 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000609 int32_t* raw_output = &output[number_of_capture_registers];
610 // We do not touch the actual capture result registers until we know there
611 // has been a match so that we can use those capture results to set the
612 // last match info.
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000613 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000614 raw_output[i] = -1;
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000615 }
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000616 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000617
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000618 IrregexpResult result = IrregexpInterpreter::Match(isolate,
619 byte_codes,
620 subject,
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000621 raw_output,
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000622 index);
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000623 if (result == RE_SUCCESS) {
624 // Copy capture results to the start of the registers array.
mstarzinger@chromium.orge27d6172013-04-17 11:51:44 +0000625 OS::MemCopy(
626 output, raw_output, number_of_capture_registers * sizeof(int32_t));
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000627 }
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000628 if (result == RE_EXCEPTION) {
629 ASSERT(!isolate->has_pending_exception());
630 isolate->StackOverflow();
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000631 }
erik.corry@gmail.com394dbcf2011-10-27 07:38:48 +0000632 return result;
ricow@chromium.orgc9c80822010-04-21 08:22:37 +0000633#endif // V8_INTERPRETED_REGEXP
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000634}
635
636
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000637Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000638 Handle<String> subject,
ager@chromium.org41826e72009-03-30 13:30:57 +0000639 int previous_index,
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +0000640 Handle<JSArray> last_match_info) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000641 Isolate* isolate = regexp->GetIsolate();
642 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000643
ager@chromium.org8bb60582008-12-11 12:02:20 +0000644 // Prepare space for the return values.
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000645#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
ager@chromium.org8bb60582008-12-11 12:02:20 +0000646 if (FLAG_trace_regexp_bytecodes) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000647 String* pattern = regexp->Pattern();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000648 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
649 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
650 }
651#endif
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000652 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000653 if (required_registers < 0) {
654 // Compiling failed with an exception.
ulan@chromium.org812308e2012-02-29 15:58:45 +0000655 ASSERT(isolate->has_pending_exception());
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000656 return Handle<Object>::null();
657 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000658
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000659 int32_t* output_registers = NULL;
660 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
661 output_registers = NewArray<int32_t>(required_registers);
662 }
663 SmartArrayPointer<int32_t> auto_release(output_registers);
664 if (output_registers == NULL) {
665 output_registers = isolate->jsregexp_static_offsets_vector();
666 }
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000667
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000668 int res = RegExpImpl::IrregexpExecRaw(
669 regexp, subject, previous_index, output_registers, required_registers);
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000670 if (res == RE_SUCCESS) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000671 int capture_count =
672 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
673 return SetLastMatchInfo(
674 last_match_info, subject, capture_count, output_registers);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000675 }
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000676 if (res == RE_EXCEPTION) {
ulan@chromium.org812308e2012-02-29 15:58:45 +0000677 ASSERT(isolate->has_pending_exception());
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000678 return Handle<Object>::null();
679 }
whesse@chromium.orgcec079d2010-03-22 14:44:04 +0000680 ASSERT(res == RE_FAILURE);
ulan@chromium.org812308e2012-02-29 15:58:45 +0000681 return isolate->factory()->null_value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000682}
683
684
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000685Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
686 Handle<String> subject,
687 int capture_count,
688 int32_t* match) {
hpayer@chromium.org8432c912013-02-28 15:55:26 +0000689 ASSERT(last_match_info->HasFastObjectElements());
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000690 int capture_register_count = (capture_count + 1) * 2;
691 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
rossberg@chromium.org79e79022013-06-03 15:43:46 +0000692 DisallowHeapAllocation no_allocation;
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000693 FixedArray* array = FixedArray::cast(last_match_info->elements());
694 if (match != NULL) {
695 for (int i = 0; i < capture_register_count; i += 2) {
696 SetCapture(array, i, match[i]);
697 SetCapture(array, i + 1, match[i + 1]);
698 }
699 }
700 SetLastCaptureCount(array, capture_register_count);
701 SetLastSubject(array, *subject);
702 SetLastInput(array, *subject);
703 return last_match_info;
704}
705
706
707RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
708 Handle<String> subject,
709 bool is_global,
danno@chromium.org412fa512012-09-14 13:28:26 +0000710 Isolate* isolate)
711 : register_array_(NULL),
712 register_array_size_(0),
713 regexp_(regexp),
714 subject_(subject) {
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000715#ifdef V8_INTERPRETED_REGEXP
716 bool interpreted = true;
717#else
718 bool interpreted = false;
719#endif // V8_INTERPRETED_REGEXP
720
yangguo@chromium.org355cfd12012-08-29 15:32:24 +0000721 if (regexp_->TypeTag() == JSRegExp::ATOM) {
722 static const int kAtomRegistersPerMatch = 2;
723 registers_per_match_ = kAtomRegistersPerMatch;
724 // There is no distinction between interpreted and native for atom regexps.
725 interpreted = false;
726 } else {
727 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
728 if (registers_per_match_ < 0) {
729 num_matches_ = -1; // Signal exception.
730 return;
731 }
732 }
733
734 if (is_global && !interpreted) {
735 register_array_size_ =
736 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
737 max_matches_ = register_array_size_ / registers_per_match_;
738 } else {
739 // Global loop in interpreted regexp is not implemented. We choose
740 // the size of the offsets vector so that it can only store one match.
741 register_array_size_ = registers_per_match_;
742 max_matches_ = 1;
743 }
744
745 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
746 register_array_ = NewArray<int32_t>(register_array_size_);
747 } else {
748 register_array_ = isolate->jsregexp_static_offsets_vector();
749 }
750
751 // Set state so that fetching the results the first time triggers a call
752 // to the compiled regexp.
753 current_match_index_ = max_matches_ - 1;
754 num_matches_ = max_matches_;
755 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture.
756 ASSERT_GE(register_array_size_, registers_per_match_);
757 int32_t* last_match =
758 &register_array_[current_match_index_ * registers_per_match_];
759 last_match[0] = -1;
760 last_match[1] = 0;
761}
762
763
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000764// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000765// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000766//
767// The Irregexp regular expression engine is intended to be a complete
768// implementation of ECMAScript regular expressions. It generates either
769// bytecodes or native code.
770
771// The Irregexp regexp engine is structured in three steps.
772// 1) The parser generates an abstract syntax tree. See ast.cc.
773// 2) From the AST a node network is created. The nodes are all
774// subclasses of RegExpNode. The nodes represent states when
775// executing a regular expression. Several optimizations are
776// performed on the node network.
777// 3) From the nodes we generate either byte codes or native code
778// that can actually execute the regular expression (perform
779// the search). The code generation step is described in more
780// detail below.
781
782// Code generation.
783//
784// The nodes are divided into four main categories.
785// * Choice nodes
786// These represent places where the regular expression can
787// match in more than one way. For example on entry to an
788// alternation (foo|bar) or a repetition (*, +, ? or {}).
789// * Action nodes
790// These represent places where some action should be
791// performed. Examples include recording the current position
792// in the input string to a register (in order to implement
793// captures) or other actions on register for example in order
794// to implement the counters needed for {} repetitions.
795// * Matching nodes
796// These attempt to match some element part of the input string.
797// Examples of elements include character classes, plain strings
798// or back references.
799// * End nodes
800// These are used to implement the actions required on finding
801// a successful match or failing to find a match.
802//
803// The code generated (whether as byte codes or native code) maintains
804// some state as it runs. This consists of the following elements:
805//
806// * The capture registers. Used for string captures.
807// * Other registers. Used for counters etc.
808// * The current position.
809// * The stack of backtracking information. Used when a matching node
810// fails to find a match and needs to try an alternative.
811//
812// Conceptual regular expression execution model:
813//
814// There is a simple conceptual model of regular expression execution
815// which will be presented first. The actual code generated is a more
816// efficient simulation of the simple conceptual model:
817//
818// * Choice nodes are implemented as follows:
819// For each choice except the last {
820// push current position
821// push backtrack code location
822// <generate code to test for choice>
823// backtrack code location:
824// pop current position
825// }
826// <generate code to test for last choice>
827//
// * Action nodes are generated as follows:
829// <push affected registers on backtrack stack>
830// <generate code to perform action>
831// push backtrack code location
832// <generate code to test for following nodes>
833// backtrack code location:
834// <pop affected registers to restore their state>
835// <pop backtrack location from stack and go to it>
836//
837// * Matching nodes are generated as follows:
838// if input string matches at current position
839// update current position
840// <generate code to test for following nodes>
841// else
842// <pop backtrack location from stack and go to it>
843//
844// Thus it can be seen that the current position is saved and restored
// by the choice nodes, whereas the registers are saved and restored by
// the action nodes that manipulate them.
847//
848// The other interesting aspect of this model is that nodes are generated
849// at the point where they are needed by a recursive call to Emit(). If
850// the node has already been code generated then the Emit() call will
851// generate a jump to the previously generated code instead. In order to
852// limit recursion it is possible for the Emit() function to put the node
853// on a work list for later generation and instead generate a jump. The
854// destination of the jump is resolved later when the code is generated.
855//
856// Actual regular expression code generation.
857//
858// Code generation is actually more complicated than the above. In order
859// to improve the efficiency of the generated code some optimizations are
860// performed
861//
862// * Choice nodes have 1-character lookahead.
863// A choice node looks at the following character and eliminates some of
864// the choices immediately based on that character. This is not yet
865// implemented.
866// * Simple greedy loops store reduced backtracking information.
867// A quantifier like /.*foo/m will greedily match the whole input. It will
868// then need to backtrack to a point where it can match "foo". The naive
869// implementation of this would push each character position onto the
870// backtracking stack, then pop them off one by one. This would use space
871// proportional to the length of the input string. However since the "."
872// can only match in one way and always has a constant length (in this case
873// of 1) it suffices to store the current position on the top of the stack
874// once. Matching now becomes merely incrementing the current position and
875// backtracking becomes decrementing the current position and checking the
876// result against the stored current position. This is faster and saves
877// space.
878// * The current state is virtualized.
879// This is used to defer expensive operations until it is clear that they
880// are needed and to generate code for a node more than once, allowing
// specialized and efficient versions of the code to be created. This is
882// explained in the section below.
883//
884// Execution state virtualization.
885//
886// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000887// manipulation in an object called the Trace. The Trace object can record a
888// current position offset, an optional backtrack code location on the top of
889// the virtualized backtrack stack and some register changes. When a node is
890// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000891// will emit code to bring the actual state into line with the virtual state.
ulan@chromium.org2efb9002012-01-19 15:36:35 +0000892// Avoiding flushing the state can postpone some work (e.g. updates of capture
ager@chromium.org8bb60582008-12-11 12:02:20 +0000893// registers). Postponing work can save time when executing the regular
894// expression since it may be found that the work never has to be done as a
895// failure to match can occur. In addition it is much faster to jump to a
896// known backtrack code location than it is to pop an unknown backtrack
897// location from the stack and jump there.
898//
ager@chromium.org32912102009-01-16 10:38:43 +0000899// The virtual state found in the Trace affects code generation. For example
900// the virtual state contains the difference between the actual current
901// position and the virtual current position, and matching code needs to use
902// this offset to attempt a match in the correct location of the input
903// string. Therefore code generated for a non-trivial trace is specialized
904// to that trace. The code generator therefore has the ability to generate
905// code for each node several times. In order to limit the size of the
906// generated code there is an arbitrary limit on how many specialized sets of
907// code may be generated for a given node. If the limit is reached, the
908// trace is flushed and a generic version of the code for a node is emitted.
909// This is subsequently used for that node. The code emitted for non-generic
910// trace is not recorded in the node and so it cannot currently be reused in
911// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000912
913
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000914void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000915 UNREACHABLE();
916}
917
918
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000919void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
920 text->AddElement(TextElement::Atom(this), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000921}
922
923
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000924void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
925 text->AddElement(TextElement::CharClass(this), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000926}
927
928
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000929void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000930 for (int i = 0; i < elements()->length(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000931 text->AddElement(elements()->at(i), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000932}
933
934
935TextElement TextElement::Atom(RegExpAtom* atom) {
rossberg@chromium.org92597162013-08-23 13:28:00 +0000936 return TextElement(ATOM, atom);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000937}
938
939
rossberg@chromium.org92597162013-08-23 13:28:00 +0000940TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
941 return TextElement(CHAR_CLASS, char_class);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000942}
943
944
rossberg@chromium.org92597162013-08-23 13:28:00 +0000945int TextElement::length() const {
946 switch (text_type()) {
947 case ATOM:
948 return atom()->length();
949
950 case CHAR_CLASS:
951 return 1;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000952 }
rossberg@chromium.org92597162013-08-23 13:28:00 +0000953 UNREACHABLE();
954 return 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000955}
956
957
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000958DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
959 if (table_ == NULL) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +0000960 table_ = new(zone()) DispatchTable(zone());
961 DispatchTableConstructor cons(table_, ignore_case, zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000962 cons.BuildTable(this);
963 }
964 return table_;
965}
966
967
fschneider@chromium.org7d10be52012-04-10 12:30:14 +0000968class FrequencyCollator {
969 public:
970 FrequencyCollator() : total_samples_(0) {
971 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
972 frequencies_[i] = CharacterFrequency(i);
973 }
974 }
975
976 void CountCharacter(int character) {
977 int index = (character & RegExpMacroAssembler::kTableMask);
978 frequencies_[index].Increment();
979 total_samples_++;
980 }
981
982 // Does not measure in percent, but rather per-128 (the table size from the
983 // regexp macro assembler).
984 int Frequency(int in_character) {
985 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
986 if (total_samples_ < 1) return 1; // Division by zero.
987 int freq_in_per128 =
988 (frequencies_[in_character].counter() * 128) / total_samples_;
989 return freq_in_per128;
990 }
991
992 private:
993 class CharacterFrequency {
994 public:
995 CharacterFrequency() : counter_(0), character_(-1) { }
996 explicit CharacterFrequency(int character)
997 : counter_(0), character_(character) { }
998
999 void Increment() { counter_++; }
1000 int counter() { return counter_; }
1001 int character() { return character_; }
1002
1003 private:
1004 int counter_;
1005 int character_;
1006 };
1007
1008
1009 private:
1010 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1011 int total_samples_;
1012};
1013
1014
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001015class RegExpCompiler {
1016 public:
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001017 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
1018 Zone* zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001019
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001020 int AllocateRegister() {
1021 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1022 reg_exp_too_big_ = true;
1023 return next_register_;
1024 }
1025 return next_register_++;
1026 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001027
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001028 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1029 RegExpNode* start,
1030 int capture_count,
1031 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001032
1033 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1034
1035 static const int kImplementationOffset = 0;
1036 static const int kNumberOfRegistersOffset = 0;
1037 static const int kCodeOffset = 1;
1038
1039 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1040 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001041
1042 static const int kMaxRecursion = 100;
1043 inline int recursion_depth() { return recursion_depth_; }
1044 inline void IncrementRecursionDepth() { recursion_depth_++; }
1045 inline void DecrementRecursionDepth() { recursion_depth_--; }
1046
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001047 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1048
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001049 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001050 inline bool ascii() { return ascii_; }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001051 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001052
whesse@chromium.org7b260152011-06-20 15:33:18 +00001053 int current_expansion_factor() { return current_expansion_factor_; }
1054 void set_current_expansion_factor(int value) {
1055 current_expansion_factor_ = value;
1056 }
1057
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001058 Zone* zone() const { return zone_; }
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001059
ager@chromium.org32912102009-01-16 10:38:43 +00001060 static const int kNoRegister = -1;
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00001061
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001062 private:
1063 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001064 int next_register_;
1065 List<RegExpNode*>* work_list_;
1066 int recursion_depth_;
1067 RegExpMacroAssembler* macro_assembler_;
1068 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001069 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001070 bool reg_exp_too_big_;
whesse@chromium.org7b260152011-06-20 15:33:18 +00001071 int current_expansion_factor_;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001072 FrequencyCollator frequency_collator_;
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001073 Zone* zone_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001074};
1075
1076
1077class RecursionCheck {
1078 public:
1079 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1080 compiler->IncrementRecursionDepth();
1081 }
1082 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1083 private:
1084 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001085};
1086
1087
dslomov@chromium.orge97852d2013-09-12 09:02:59 +00001088static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1089 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001090}
1091
1092
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001093// Attempts to compile the regexp using an Irregexp code generator. Returns
1094// a fixed array or a null handle depending on whether it succeeded.
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001095RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
1096 Zone* zone)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001097 : next_register_(2 * (capture_count + 1)),
1098 work_list_(NULL),
1099 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001100 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001101 ascii_(ascii),
whesse@chromium.org7b260152011-06-20 15:33:18 +00001102 reg_exp_too_big_(false),
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00001103 current_expansion_factor_(1),
rossberg@chromium.org400388e2012-06-06 09:29:22 +00001104 frequency_collator_(),
1105 zone_(zone) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001106 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001107 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001108}
1109
1110
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001111RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001112 RegExpMacroAssembler* macro_assembler,
1113 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001114 int capture_count,
1115 Handle<String> pattern) {
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001116 Heap* heap = pattern->GetHeap();
1117
1118 bool use_slow_safe_regexp_compiler = false;
1119 if (heap->total_regexp_code_generated() >
1120 RegExpImpl::kRegWxpCompiledLimit &&
1121 heap->isolate()->memory_allocator()->SizeExecutable() >
1122 RegExpImpl::kRegExpExecutableMemoryLimit) {
1123 use_slow_safe_regexp_compiler = true;
1124 }
1125
1126 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1127
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001128#ifdef DEBUG
1129 if (FLAG_trace_regexp_assembler)
1130 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1131 else
1132#endif
1133 macro_assembler_ = macro_assembler;
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001134
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001135 List <RegExpNode*> work_list(0);
1136 work_list_ = &work_list;
1137 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001138 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001139 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001140 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001141 macro_assembler_->Bind(&fail);
1142 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001143 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001144 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001145 }
dslomov@chromium.orge97852d2013-09-12 09:02:59 +00001146 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001147
karlklose@chromium.org83a47282011-05-11 11:54:09 +00001148 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1149 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001150 work_list_ = NULL;
1151#ifdef DEBUG
danno@chromium.org4d3fe4e2011-03-10 10:14:28 +00001152 if (FLAG_print_code) {
1153 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
1154 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001155 if (FLAG_trace_regexp_assembler) {
1156 delete macro_assembler_;
1157 }
1158#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00001159 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001160}
1161
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001162
ager@chromium.org32912102009-01-16 10:38:43 +00001163bool Trace::DeferredAction::Mentions(int that) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001164 if (action_type() == ActionNode::CLEAR_CAPTURES) {
ager@chromium.org32912102009-01-16 10:38:43 +00001165 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1166 return range.Contains(that);
1167 } else {
1168 return reg() == that;
1169 }
1170}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001171
ager@chromium.org32912102009-01-16 10:38:43 +00001172
1173bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001174 for (DeferredAction* action = actions_;
1175 action != NULL;
1176 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001177 if (action->Mentions(reg))
1178 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001179 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001180 return false;
1181}
1182
1183
ager@chromium.org32912102009-01-16 10:38:43 +00001184bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1185 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001186 for (DeferredAction* action = actions_;
1187 action != NULL;
1188 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001189 if (action->Mentions(reg)) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001190 if (action->action_type() == ActionNode::STORE_POSITION) {
ager@chromium.org32912102009-01-16 10:38:43 +00001191 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1192 return true;
1193 } else {
1194 return false;
1195 }
1196 }
1197 }
1198 return false;
1199}
1200
1201
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001202int Trace::FindAffectedRegisters(OutSet* affected_registers,
1203 Zone* zone) {
ager@chromium.org32912102009-01-16 10:38:43 +00001204 int max_register = RegExpCompiler::kNoRegister;
1205 for (DeferredAction* action = actions_;
1206 action != NULL;
1207 action = action->next()) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001208 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
ager@chromium.org32912102009-01-16 10:38:43 +00001209 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1210 for (int i = range.from(); i <= range.to(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001211 affected_registers->Set(i, zone);
ager@chromium.org32912102009-01-16 10:38:43 +00001212 if (range.to() > max_register) max_register = range.to();
1213 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001214 affected_registers->Set(action->reg(), zone);
ager@chromium.org32912102009-01-16 10:38:43 +00001215 if (action->reg() > max_register) max_register = action->reg();
1216 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001217 }
1218 return max_register;
1219}
1220
1221
ager@chromium.org32912102009-01-16 10:38:43 +00001222void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1223 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001224 OutSet& registers_to_pop,
1225 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001226 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001227 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1228 else if (registers_to_clear.Get(reg)) {
1229 int clear_to = reg;
1230 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1231 reg--;
1232 }
1233 assembler->ClearRegisters(reg, clear_to);
1234 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001235 }
1236}
1237
1238
ager@chromium.org32912102009-01-16 10:38:43 +00001239void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1240 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001241 OutSet& affected_registers,
1242 OutSet* registers_to_pop,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001243 OutSet* registers_to_clear,
1244 Zone* zone) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001245 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1246 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1247
ager@chromium.org5aa501c2009-06-23 07:57:28 +00001248 // Count pushes performed to force a stack limit check occasionally.
1249 int pushes = 0;
1250
ager@chromium.org8bb60582008-12-11 12:02:20 +00001251 for (int reg = 0; reg <= max_register; reg++) {
1252 if (!affected_registers.Get(reg)) {
1253 continue;
1254 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001255
1256 // The chronologically first deferred action in the trace
1257 // is used to infer the action needed to restore a register
1258 // to its previous state (or not, if it's safe to ignore it).
1259 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1260 DeferredActionUndoType undo_action = IGNORE;
1261
ager@chromium.org8bb60582008-12-11 12:02:20 +00001262 int value = 0;
1263 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001264 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001265 int store_position = -1;
1266 // This is a little tricky because we are scanning the actions in reverse
1267 // historical order (newest first).
1268 for (DeferredAction* action = actions_;
1269 action != NULL;
1270 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001271 if (action->Mentions(reg)) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00001272 switch (action->action_type()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001273 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001274 Trace::DeferredSetRegister* psr =
1275 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001276 if (!absolute) {
1277 value += psr->value();
1278 absolute = true;
1279 }
1280 // SET_REGISTER is currently only used for newly introduced loop
1281 // counters. They can have a significant previous value if they
1282 // occour in a loop. TODO(lrn): Propagate this information, so
1283 // we can set undo_action to IGNORE if we know there is no value to
1284 // restore.
1285 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001286 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001287 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001288 break;
1289 }
1290 case ActionNode::INCREMENT_REGISTER:
1291 if (!absolute) {
1292 value++;
1293 }
1294 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001295 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001296 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001297 break;
1298 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001299 Trace::DeferredCapture* pc =
1300 static_cast<Trace::DeferredCapture*>(action);
1301 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001302 store_position = pc->cp_offset();
1303 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001304
1305 // For captures we know that stores and clears alternate.
1306 // Other register, are never cleared, and if the occur
1307 // inside a loop, they might be assigned more than once.
1308 if (reg <= 1) {
1309 // Registers zero and one, aka "capture zero", is
1310 // always set correctly if we succeed. There is no
1311 // need to undo a setting on backtrack, because we
1312 // will set it again or fail.
1313 undo_action = IGNORE;
1314 } else {
1315 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1316 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001317 ASSERT(!absolute);
1318 ASSERT_EQ(value, 0);
1319 break;
1320 }
ager@chromium.org32912102009-01-16 10:38:43 +00001321 case ActionNode::CLEAR_CAPTURES: {
1322 // Since we're scanning in reverse order, if we've already
1323 // set the position we have to ignore historically earlier
1324 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001325 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001326 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001327 }
1328 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001329 ASSERT(!absolute);
1330 ASSERT_EQ(value, 0);
1331 break;
1332 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001333 default:
1334 UNREACHABLE();
1335 break;
1336 }
1337 }
1338 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001339 // Prepare for the undo-action (e.g., push if it's going to be popped).
1340 if (undo_action == RESTORE) {
1341 pushes++;
1342 RegExpMacroAssembler::StackCheckFlag stack_check =
1343 RegExpMacroAssembler::kNoStackLimitCheck;
1344 if (pushes == push_limit) {
1345 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1346 pushes = 0;
1347 }
1348
1349 assembler->PushRegister(reg, stack_check);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001350 registers_to_pop->Set(reg, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001351 } else if (undo_action == CLEAR) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001352 registers_to_clear->Set(reg, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001353 }
1354 // Perform the chronologically last action (or accumulated increment)
1355 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001356 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001357 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001358 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001359 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001360 } else if (absolute) {
1361 assembler->SetRegister(reg, value);
1362 } else if (value != 0) {
1363 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001364 }
1365 }
1366}
1367
1368
ager@chromium.org8bb60582008-12-11 12:02:20 +00001369// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001370// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001371// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001372void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001373 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001374
iposva@chromium.org245aa852009-02-10 00:49:54 +00001375 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001376
1377 if (actions_ == NULL && backtrack() == NULL) {
1378 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001379 // a normal situation. We may also have to forget some information gained
1380 // through a quick check that was already performed.
1381 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001382 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001383 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001384 successor->Emit(compiler, &new_state);
1385 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001386 }
1387
1388 // Generate deferred actions here along with code to undo them again.
1389 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001390
ager@chromium.org381abbb2009-02-25 13:23:22 +00001391 if (backtrack() != NULL) {
1392 // Here we have a concrete backtrack location. These are set up by choice
1393 // nodes and so they indicate that we have a deferred save of the current
1394 // position which we may need to emit here.
1395 assembler->PushCurrentPosition();
1396 }
1397
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001398 int max_register = FindAffectedRegisters(&affected_registers,
1399 compiler->zone());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001400 OutSet registers_to_pop;
1401 OutSet registers_to_clear;
1402 PerformDeferredActions(assembler,
1403 max_register,
1404 affected_registers,
1405 &registers_to_pop,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001406 &registers_to_clear,
1407 compiler->zone());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001408 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001409 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001410 }
1411
1412 // Create a new trivial state and generate the node with that.
1413 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001414 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001415 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001416 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001417
1418 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001419 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001420 RestoreAffectedRegisters(assembler,
1421 max_register,
1422 registers_to_pop,
1423 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001424 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001425 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001426 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001427 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001428 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001429 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001430}
1431
1432
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001433void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001434 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001435
1436 // Omit flushing the trace. We discard the entire stack frame anyway.
1437
ager@chromium.org8bb60582008-12-11 12:02:20 +00001438 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001439 // We are completely independent of the trace, since we ignore it,
1440 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001441 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001442 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001443
1444 // Throw away everything on the backtrack stack since the start
1445 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001446 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1447 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001448 if (clear_capture_count_ > 0) {
1449 // Clear any captures that might have been performed during the success
1450 // of the body of the negative look-ahead.
1451 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1452 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1453 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001454 // Now that we have unwound the stack we find at the top of the stack the
1455 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001456 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001457}
1458
1459
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001460void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001461 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001462 trace->Flush(compiler, this);
1463 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001464 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001465 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001466 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001467 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001468 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001469 switch (action_) {
1470 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001471 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001472 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001473 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001474 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001475 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001476 case NEGATIVE_SUBMATCH_SUCCESS:
1477 // This case is handled in a different virtual method.
1478 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001479 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001480 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001481}
1482
1483
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001484void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001485 if (guards_ == NULL)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001486 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1487 guards_->Add(guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001488}
1489
1490
ager@chromium.org8bb60582008-12-11 12:02:20 +00001491ActionNode* ActionNode::SetRegister(int reg,
1492 int val,
1493 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001494 ActionNode* result =
1495 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001496 result->data_.u_store_register.reg = reg;
1497 result->data_.u_store_register.value = val;
1498 return result;
1499}
1500
1501
1502ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001503 ActionNode* result =
1504 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001505 result->data_.u_increment_register.reg = reg;
1506 return result;
1507}
1508
1509
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001510ActionNode* ActionNode::StorePosition(int reg,
1511 bool is_capture,
1512 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001513 ActionNode* result =
1514 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001515 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001516 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001517 return result;
1518}
1519
1520
ager@chromium.org32912102009-01-16 10:38:43 +00001521ActionNode* ActionNode::ClearCaptures(Interval range,
1522 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001523 ActionNode* result =
1524 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
ager@chromium.org32912102009-01-16 10:38:43 +00001525 result->data_.u_clear_captures.range_from = range.from();
1526 result->data_.u_clear_captures.range_to = range.to();
1527 return result;
1528}
1529
1530
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001531ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1532 int position_reg,
1533 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001534 ActionNode* result =
1535 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001536 result->data_.u_submatch.stack_pointer_register = stack_reg;
1537 result->data_.u_submatch.current_position_register = position_reg;
1538 return result;
1539}
1540
1541
ager@chromium.org8bb60582008-12-11 12:02:20 +00001542ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1543 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001544 int clear_register_count,
1545 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001546 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001547 ActionNode* result =
1548 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001549 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001550 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001551 result->data_.u_submatch.clear_register_count = clear_register_count;
1552 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001553 return result;
1554}
1555
1556
ager@chromium.org32912102009-01-16 10:38:43 +00001557ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1558 int repetition_register,
1559 int repetition_limit,
1560 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00001561 ActionNode* result =
1562 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
ager@chromium.org32912102009-01-16 10:38:43 +00001563 result->data_.u_empty_match_check.start_register = start_register;
1564 result->data_.u_empty_match_check.repetition_register = repetition_register;
1565 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1566 return result;
1567}
1568
1569
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001570#define DEFINE_ACCEPT(Type) \
1571 void Type##Node::Accept(NodeVisitor* visitor) { \
1572 visitor->Visit##Type(this); \
1573 }
1574FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1575#undef DEFINE_ACCEPT
1576
1577
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001578void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1579 visitor->VisitLoopChoice(this);
1580}
1581
1582
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001583// -------------------------------------------------------------------
1584// Emit code.
1585
1586
1587void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1588 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001589 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001590 switch (guard->op()) {
1591 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001592 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001593 macro_assembler->IfRegisterGE(guard->reg(),
1594 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001595 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001596 break;
1597 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001598 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001599 macro_assembler->IfRegisterLT(guard->reg(),
1600 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001601 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001602 break;
1603 }
1604}
1605
1606
ager@chromium.org381abbb2009-02-25 13:23:22 +00001607// Returns the number of characters in the equivalence class, omitting those
1608// that cannot occur in the source string because it is ASCII.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001609static int GetCaseIndependentLetters(Isolate* isolate,
1610 uc16 character,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001611 bool ascii_subject,
1612 unibrow::uchar* letters) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001613 int length =
1614 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00001615 // Unibrow returns 0 or 1 for characters where case independence is
ager@chromium.org381abbb2009-02-25 13:23:22 +00001616 // trivial.
1617 if (length == 0) {
1618 letters[0] = character;
1619 length = 1;
1620 }
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001621 if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001622 return length;
1623 }
1624 // The standard requires that non-ASCII characters cannot have ASCII
1625 // character codes in their equivalence class.
1626 return 0;
1627}
1628
1629
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001630static inline bool EmitSimpleCharacter(Isolate* isolate,
1631 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001632 uc16 c,
1633 Label* on_failure,
1634 int cp_offset,
1635 bool check,
1636 bool preloaded) {
1637 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1638 bool bound_checked = false;
1639 if (!preloaded) {
1640 assembler->LoadCurrentCharacter(
1641 cp_offset,
1642 on_failure,
1643 check);
1644 bound_checked = true;
1645 }
1646 assembler->CheckNotCharacter(c, on_failure);
1647 return bound_checked;
1648}
1649
1650
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001651// Only emits non-letters (things that don't have case). Only used for case
1652// independent matches.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001653static inline bool EmitAtomNonLetter(Isolate* isolate,
1654 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001655 uc16 c,
1656 Label* on_failure,
1657 int cp_offset,
1658 bool check,
1659 bool preloaded) {
1660 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1661 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001662 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001663 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
ager@chromium.org381abbb2009-02-25 13:23:22 +00001664 if (length < 1) {
1665 // This can't match. Must be an ASCII subject and a non-ASCII character.
1666 // We do not need to do anything since the ASCII pass already handled this.
1667 return false; // Bounds not checked.
1668 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001669 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001670 // We handle the length > 1 case in a later pass.
1671 if (length == 1) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001672 if (ascii && c > String::kMaxOneByteCharCodeU) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001673 // Can't match - see above.
1674 return false; // Bounds not checked.
1675 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001676 if (!preloaded) {
1677 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1678 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001679 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001680 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001681 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001682 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001683}
1684
1685
1686static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001687 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001688 uc16 c1,
1689 uc16 c2,
1690 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001691 uc16 char_mask;
1692 if (ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001693 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001694 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00001695 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001696 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001697 uc16 exor = c1 ^ c2;
1698 // Check whether exor has only one bit set.
1699 if (((exor - 1) & exor) == 0) {
1700 // If c1 and c2 differ only by one bit.
1701 // Ecma262UnCanonicalize always gives the highest number last.
1702 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001703 uc16 mask = char_mask ^ exor;
1704 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001705 return true;
1706 }
1707 ASSERT(c2 > c1);
1708 uc16 diff = c2 - c1;
1709 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1710 // If the characters differ by 2^n but don't differ by one bit then
1711 // subtract the difference from the found character, then do the or
1712 // trick. We avoid the theoretical case where negative numbers are
1713 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001714 uc16 mask = char_mask ^ diff;
1715 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1716 diff,
1717 mask,
1718 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001719 return true;
1720 }
1721 return false;
1722}
1723
1724
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001725typedef bool EmitCharacterFunction(Isolate* isolate,
1726 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001727 uc16 c,
1728 Label* on_failure,
1729 int cp_offset,
1730 bool check,
1731 bool preloaded);
1732
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001733// Only emits letters (things that have case). Only used for case independent
1734// matches.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001735static inline bool EmitAtomLetter(Isolate* isolate,
1736 RegExpCompiler* compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001737 uc16 c,
1738 Label* on_failure,
1739 int cp_offset,
1740 bool check,
1741 bool preloaded) {
1742 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1743 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001744 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00001745 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001746 if (length <= 1) return false;
1747 // We may not need to check against the end of the input string
1748 // if this character lies before a character that matched.
1749 if (!preloaded) {
1750 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001751 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001752 Label ok;
1753 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1754 switch (length) {
1755 case 2: {
1756 if (ShortCutEmitCharacterPair(macro_assembler,
1757 ascii,
1758 chars[0],
1759 chars[1],
1760 on_failure)) {
1761 } else {
1762 macro_assembler->CheckCharacter(chars[0], &ok);
1763 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1764 macro_assembler->Bind(&ok);
1765 }
1766 break;
1767 }
1768 case 4:
1769 macro_assembler->CheckCharacter(chars[3], &ok);
1770 // Fall through!
1771 case 3:
1772 macro_assembler->CheckCharacter(chars[0], &ok);
1773 macro_assembler->CheckCharacter(chars[1], &ok);
1774 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1775 macro_assembler->Bind(&ok);
1776 break;
1777 default:
1778 UNREACHABLE();
1779 break;
1780 }
1781 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001782}
1783
1784
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001785static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1786 int border,
1787 Label* fall_through,
1788 Label* above_or_equal,
1789 Label* below) {
1790 if (below != fall_through) {
1791 masm->CheckCharacterLT(border, below);
1792 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1793 } else {
1794 masm->CheckCharacterGT(border - 1, above_or_equal);
1795 }
1796}
1797
1798
1799static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1800 int first,
1801 int last,
1802 Label* fall_through,
1803 Label* in_range,
1804 Label* out_of_range) {
1805 if (in_range == fall_through) {
1806 if (first == last) {
1807 masm->CheckNotCharacter(first, out_of_range);
1808 } else {
1809 masm->CheckCharacterNotInRange(first, last, out_of_range);
1810 }
1811 } else {
1812 if (first == last) {
1813 masm->CheckCharacter(first, in_range);
1814 } else {
1815 masm->CheckCharacterInRange(first, last, in_range);
1816 }
1817 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1818 }
1819}
1820
1821
1822// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1823// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1824static void EmitUseLookupTable(
1825 RegExpMacroAssembler* masm,
1826 ZoneList<int>* ranges,
1827 int start_index,
1828 int end_index,
1829 int min_char,
1830 Label* fall_through,
1831 Label* even_label,
1832 Label* odd_label) {
1833 static const int kSize = RegExpMacroAssembler::kTableSize;
1834 static const int kMask = RegExpMacroAssembler::kTableMask;
1835
1836 int base = (min_char & ~kMask);
1837 USE(base);
1838
1839 // Assert that everything is on one kTableSize page.
1840 for (int i = start_index; i <= end_index; i++) {
1841 ASSERT_EQ(ranges->at(i) & ~kMask, base);
1842 }
1843 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1844
1845 char templ[kSize];
1846 Label* on_bit_set;
1847 Label* on_bit_clear;
1848 int bit;
1849 if (even_label == fall_through) {
1850 on_bit_set = odd_label;
1851 on_bit_clear = even_label;
1852 bit = 1;
1853 } else {
1854 on_bit_set = even_label;
1855 on_bit_clear = odd_label;
1856 bit = 0;
1857 }
1858 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1859 templ[i] = bit;
1860 }
1861 int j = 0;
1862 bit ^= 1;
1863 for (int i = start_index; i < end_index; i++) {
1864 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1865 templ[j] = bit;
1866 }
1867 bit ^= 1;
1868 }
1869 for (int i = j; i < kSize; i++) {
1870 templ[i] = bit;
1871 }
jkummerow@chromium.org3d00d0a2013-09-04 13:57:32 +00001872 Factory* factory = masm->zone()->isolate()->factory();
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001873 // TODO(erikcorry): Cache these.
verwaest@chromium.orgd4be0f02013-06-05 13:39:03 +00001874 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001875 for (int i = 0; i < kSize; i++) {
1876 ba->set(i, templ[i]);
1877 }
1878 masm->CheckBitInTable(ba, on_bit_set);
1879 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1880}
1881
1882
1883static void CutOutRange(RegExpMacroAssembler* masm,
1884 ZoneList<int>* ranges,
1885 int start_index,
1886 int end_index,
1887 int cut_index,
1888 Label* even_label,
1889 Label* odd_label) {
1890 bool odd = (((cut_index - start_index) & 1) == 1);
1891 Label* in_range_label = odd ? odd_label : even_label;
1892 Label dummy;
1893 EmitDoubleBoundaryTest(masm,
1894 ranges->at(cut_index),
1895 ranges->at(cut_index + 1) - 1,
1896 &dummy,
1897 in_range_label,
1898 &dummy);
1899 ASSERT(!dummy.is_linked());
1900 // Cut out the single range by rewriting the array. This creates a new
1901 // range that is a merger of the two ranges on either side of the one we
1902 // are cutting out. The oddity of the labels is preserved.
1903 for (int j = cut_index; j > start_index; j--) {
1904 ranges->at(j) = ranges->at(j - 1);
1905 }
1906 for (int j = cut_index + 1; j < end_index; j++) {
1907 ranges->at(j) = ranges->at(j + 1);
1908 }
1909}
1910
1911
1912// Unicode case. Split the search space into kSize spaces that are handled
1913// with recursion.
1914static void SplitSearchSpace(ZoneList<int>* ranges,
1915 int start_index,
1916 int end_index,
1917 int* new_start_index,
1918 int* new_end_index,
1919 int* border) {
1920 static const int kSize = RegExpMacroAssembler::kTableSize;
1921 static const int kMask = RegExpMacroAssembler::kTableMask;
1922
1923 int first = ranges->at(start_index);
1924 int last = ranges->at(end_index) - 1;
1925
1926 *new_start_index = start_index;
1927 *border = (ranges->at(start_index) & ~kMask) + kSize;
1928 while (*new_start_index < end_index) {
1929 if (ranges->at(*new_start_index) > *border) break;
1930 (*new_start_index)++;
1931 }
1932 // new_start_index is the index of the first edge that is beyond the
1933 // current kSize space.
1934
1935 // For very large search spaces we do a binary chop search of the non-ASCII
1936 // space instead of just going to the end of the current kSize space. The
1937 // heuristics are complicated a little by the fact that any 128-character
1938 // encoding space can be quickly tested with a table lookup, so we don't
1939 // wish to do binary chop search at a smaller granularity than that. A
1940 // 128-character space can take up a lot of space in the ranges array if,
1941 // for example, we only want to match every second character (eg. the lower
1942 // case characters on some Unicode pages).
1943 int binary_chop_index = (end_index + start_index) / 2;
1944 // The first test ensures that we get to the code that handles the ASCII
1945 // range with a single not-taken branch, speeding up this important
1946 // character range (even non-ASCII charset-based text has spaces and
1947 // punctuation).
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00001948 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00001949 end_index - start_index > (*new_start_index - start_index) * 2 &&
1950 last - first > kSize * 2 &&
1951 binary_chop_index > *new_start_index &&
1952 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1953 int scan_forward_for_section_border = binary_chop_index;;
1954 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1955
1956 while (scan_forward_for_section_border < end_index) {
1957 if (ranges->at(scan_forward_for_section_border) > new_border) {
1958 *new_start_index = scan_forward_for_section_border;
1959 *border = new_border;
1960 break;
1961 }
1962 scan_forward_for_section_border++;
1963 }
1964 }
1965
1966 ASSERT(*new_start_index > start_index);
1967 *new_end_index = *new_start_index - 1;
1968 if (ranges->at(*new_end_index) == *border) {
1969 (*new_end_index)--;
1970 }
1971 if (*border >= ranges->at(end_index)) {
1972 *border = ranges->at(end_index);
1973 *new_start_index = end_index; // Won't be used.
1974 *new_end_index = end_index - 1;
1975 }
1976}
1977
1978
1979// Gets a series of segment boundaries representing a character class. If the
1980// character is in the range between an even and an odd boundary (counting from
1981// start_index) then go to even_label, otherwise go to odd_label. We already
1982// know that the character is in the range of min_char to max_char inclusive.
1983// Either label can be NULL indicating backtracking. Either label can also be
1984// equal to the fall_through label.
1985static void GenerateBranches(RegExpMacroAssembler* masm,
1986 ZoneList<int>* ranges,
1987 int start_index,
1988 int end_index,
1989 uc16 min_char,
1990 uc16 max_char,
1991 Label* fall_through,
1992 Label* even_label,
1993 Label* odd_label) {
1994 int first = ranges->at(start_index);
1995 int last = ranges->at(end_index) - 1;
1996
1997 ASSERT_LT(min_char, first);
1998
1999 // Just need to test if the character is before or on-or-after
2000 // a particular character.
2001 if (start_index == end_index) {
2002 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2003 return;
2004 }
2005
2006 // Another almost trivial case: There is one interval in the middle that is
2007 // different from the end intervals.
2008 if (start_index + 1 == end_index) {
2009 EmitDoubleBoundaryTest(
2010 masm, first, last, fall_through, even_label, odd_label);
2011 return;
2012 }
2013
2014 // It's not worth using table lookup if there are very few intervals in the
2015 // character class.
2016 if (end_index - start_index <= 6) {
2017 // It is faster to test for individual characters, so we look for those
2018 // first, then try arbitrary ranges in the second round.
2019 static int kNoCutIndex = -1;
2020 int cut = kNoCutIndex;
2021 for (int i = start_index; i < end_index; i++) {
2022 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2023 cut = i;
2024 break;
2025 }
2026 }
2027 if (cut == kNoCutIndex) cut = start_index;
2028 CutOutRange(
2029 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2030 ASSERT_GE(end_index - start_index, 2);
2031 GenerateBranches(masm,
2032 ranges,
2033 start_index + 1,
2034 end_index - 1,
2035 min_char,
2036 max_char,
2037 fall_through,
2038 even_label,
2039 odd_label);
2040 return;
2041 }
2042
2043 // If there are a lot of intervals in the regexp, then we will use tables to
2044 // determine whether the character is inside or outside the character class.
2045 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2046
2047 if ((max_char >> kBits) == (min_char >> kBits)) {
2048 EmitUseLookupTable(masm,
2049 ranges,
2050 start_index,
2051 end_index,
2052 min_char,
2053 fall_through,
2054 even_label,
2055 odd_label);
2056 return;
2057 }
2058
2059 if ((min_char >> kBits) != (first >> kBits)) {
2060 masm->CheckCharacterLT(first, odd_label);
2061 GenerateBranches(masm,
2062 ranges,
2063 start_index + 1,
2064 end_index,
2065 first,
2066 max_char,
2067 fall_through,
2068 odd_label,
2069 even_label);
2070 return;
2071 }
2072
2073 int new_start_index = 0;
2074 int new_end_index = 0;
2075 int border = 0;
2076
2077 SplitSearchSpace(ranges,
2078 start_index,
2079 end_index,
2080 &new_start_index,
2081 &new_end_index,
2082 &border);
2083
2084 Label handle_rest;
2085 Label* above = &handle_rest;
2086 if (border == last + 1) {
2087 // We didn't find any section that started after the limit, so everything
2088 // above the border is one of the terminal labels.
2089 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2090 ASSERT(new_end_index == end_index - 1);
2091 }
2092
2093 ASSERT_LE(start_index, new_end_index);
2094 ASSERT_LE(new_start_index, end_index);
2095 ASSERT_LT(start_index, new_start_index);
2096 ASSERT_LT(new_end_index, end_index);
2097 ASSERT(new_end_index + 1 == new_start_index ||
2098 (new_end_index + 2 == new_start_index &&
2099 border == ranges->at(new_end_index + 1)));
2100 ASSERT_LT(min_char, border - 1);
2101 ASSERT_LT(border, max_char);
2102 ASSERT_LT(ranges->at(new_end_index), border);
2103 ASSERT(border < ranges->at(new_start_index) ||
2104 (border == ranges->at(new_start_index) &&
2105 new_start_index == end_index &&
2106 new_end_index == end_index - 1 &&
2107 border == last + 1));
2108 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2109
2110 masm->CheckCharacterGT(border - 1, above);
2111 Label dummy;
2112 GenerateBranches(masm,
2113 ranges,
2114 start_index,
2115 new_end_index,
2116 min_char,
2117 border - 1,
2118 &dummy,
2119 even_label,
2120 odd_label);
2121 if (handle_rest.is_linked()) {
2122 masm->Bind(&handle_rest);
2123 bool flip = (new_start_index & 1) != (start_index & 1);
2124 GenerateBranches(masm,
2125 ranges,
2126 new_start_index,
2127 end_index,
2128 border,
2129 max_char,
2130 &dummy,
2131 flip ? odd_label : even_label,
2132 flip ? even_label : odd_label);
2133 }
2134}
2135
2136
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002137static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2138 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002139 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002140 Label* on_failure,
2141 int cp_offset,
2142 bool check_offset,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002143 bool preloaded,
2144 Zone* zone) {
2145 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002146 if (!CharacterRange::IsCanonical(ranges)) {
2147 CharacterRange::Canonicalize(ranges);
2148 }
2149
ager@chromium.org8bb60582008-12-11 12:02:20 +00002150 int max_char;
2151 if (ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002152 max_char = String::kMaxOneByteCharCode;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002153 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002154 max_char = String::kMaxUtf16CodeUnit;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002155 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002156
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002157 int range_count = ranges->length();
2158
ager@chromium.org8bb60582008-12-11 12:02:20 +00002159 int last_valid_range = range_count - 1;
2160 while (last_valid_range >= 0) {
2161 CharacterRange& range = ranges->at(last_valid_range);
2162 if (range.from() <= max_char) {
2163 break;
2164 }
2165 last_valid_range--;
2166 }
2167
2168 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002169 if (!cc->is_negated()) {
2170 macro_assembler->GoTo(on_failure);
2171 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002172 if (check_offset) {
2173 macro_assembler->CheckPosition(cp_offset, on_failure);
2174 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002175 return;
2176 }
2177
ager@chromium.org8bb60582008-12-11 12:02:20 +00002178 if (last_valid_range == 0 &&
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002179 ranges->at(0).IsEverything(max_char)) {
2180 if (cc->is_negated()) {
2181 macro_assembler->GoTo(on_failure);
2182 } else {
2183 // This is a common case hit by non-anchored expressions.
2184 if (check_offset) {
2185 macro_assembler->CheckPosition(cp_offset, on_failure);
2186 }
2187 }
2188 return;
2189 }
2190 if (last_valid_range == 0 &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002191 !cc->is_negated() &&
2192 ranges->at(0).IsEverything(max_char)) {
2193 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00002194 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002195 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002196 }
2197 return;
2198 }
2199
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002200 if (!preloaded) {
2201 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002202 }
2203
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002204 if (cc->is_standard(zone) &&
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00002205 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2206 on_failure)) {
2207 return;
2208 }
2209
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002210
2211 // A new list with ascending entries. Each entry is a code unit
2212 // where there is a boundary between code units that are part of
2213 // the class and code units that are not. Normally we insert an
2214 // entry at zero which goes to the failure label, but if there
2215 // was already one there we fall through for success on that entry.
2216 // Subsequent entries have alternating meaning (success/failure).
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002217 ZoneList<int>* range_boundaries =
2218 new(zone) ZoneList<int>(last_valid_range, zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002219
2220 bool zeroth_entry_is_failure = !cc->is_negated();
2221
2222 for (int i = 0; i <= last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002223 CharacterRange& range = ranges->at(i);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002224 if (range.from() == 0) {
2225 ASSERT_EQ(i, 0);
2226 zeroth_entry_is_failure = !zeroth_entry_is_failure;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002227 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002228 range_boundaries->Add(range.from(), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002229 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002230 range_boundaries->Add(range.to() + 1, zone);
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002231 }
2232 int end_index = range_boundaries->length() - 1;
2233 if (range_boundaries->at(end_index) > max_char) {
2234 end_index--;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002235 }
2236
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00002237 Label fall_through;
2238 GenerateBranches(macro_assembler,
2239 range_boundaries,
2240 0, // start_index.
2241 end_index,
2242 0, // min_char.
2243 max_char,
2244 &fall_through,
2245 zeroth_entry_is_failure ? &fall_through : on_failure,
2246 zeroth_entry_is_failure ? on_failure : &fall_through);
2247 macro_assembler->Bind(&fall_through);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002248}
2249
2250
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002251RegExpNode::~RegExpNode() {
2252}
2253
2254
ager@chromium.org8bb60582008-12-11 12:02:20 +00002255RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002256 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002257 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00002258 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002259 return CONTINUE;
2260 }
2261
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002262 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002263 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002264 if (label_.is_bound()) {
2265 // We are being asked to generate a generic version, but that's already
2266 // been done so just go to it.
2267 macro_assembler->GoTo(&label_);
2268 return DONE;
2269 }
2270 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2271 // To avoid too deep recursion we push the node to the work queue and just
2272 // generate a goto here.
2273 compiler->AddWork(this);
2274 macro_assembler->GoTo(&label_);
2275 return DONE;
2276 }
2277 // Generate generic version of the node and bind the label for later use.
2278 macro_assembler->Bind(&label_);
2279 return CONTINUE;
2280 }
2281
2282 // We are being asked to make a non-generic version. Keep track of how many
2283 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00002284 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002285 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002286 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002287 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2288 return CONTINUE;
2289 }
2290
ager@chromium.org32912102009-01-16 10:38:43 +00002291 // If we get here code has been generated for this node too many times or
2292 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00002293 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002294 trace->Flush(compiler, this);
2295 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002296}
2297
2298
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002299int ActionNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002300 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002301 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002302 if (budget <= 0) return 0;
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002303 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002304 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002305 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002306 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002307}
2308
2309
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002310void ActionNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002311 int budget,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002312 BoyerMooreLookahead* bm,
2313 bool not_at_start) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002314 if (action_type_ == BEGIN_SUBMATCH) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002315 bm->SetRest(offset);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002316 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002317 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002318 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002319 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002320}
2321
2322
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002323int AssertionNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002324 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002325 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002326 if (budget <= 0) return 0;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002327 // If we know we are not at the start and we are asked "how many characters
2328 // will you match if you succeed?" then we can answer anything since false
2329 // implies false. So lets just return the max answer (still_to_find) since
2330 // that won't prevent us from preloading a lot of characters for the other
2331 // branches in the node graph.
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002332 if (assertion_type() == AT_START && not_at_start) return still_to_find;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002333 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002334 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002335 not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002336}
2337
2338
verwaest@chromium.org37141392012-05-31 13:27:02 +00002339void AssertionNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002340 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00002341 BoyerMooreLookahead* bm,
2342 bool not_at_start) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002343 // Match the behaviour of EatsAtLeast on this node.
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00002344 if (assertion_type() == AT_START && not_at_start) return;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002345 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002346 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002347}
2348
2349
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002350int BackReferenceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002351 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002352 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002353 if (budget <= 0) return 0;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002354 return on_success()->EatsAtLeast(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002355 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002356 not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002357}
2358
2359
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002360int TextNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002361 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002362 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002363 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002364 if (answer >= still_to_find) return answer;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002365 if (budget <= 0) return answer;
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002366 // We are not at start after this node so we set the last argument to 'true'.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002367 return answer + on_success()->EatsAtLeast(still_to_find - answer,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002368 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002369 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002370}
2371
2372
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00002373int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002374 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002375 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002376 if (budget <= 0) return 0;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002377 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2378 // afterwards.
2379 RegExpNode* node = alternatives_->at(1).node();
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002380 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002381}
2382
2383
2384void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2385 QuickCheckDetails* details,
2386 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002387 int filled_in,
2388 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002389 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2390 // afterwards.
2391 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002392 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002393}
2394
2395
2396int ChoiceNode::EatsAtLeastHelper(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002397 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002398 RegExpNode* ignore_this_node,
2399 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002400 if (budget <= 0) return 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002401 int min = 100;
2402 int choice_count = alternatives_->length();
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002403 budget = (budget - 1) / choice_count;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002404 for (int i = 0; i < choice_count; i++) {
2405 RegExpNode* node = alternatives_->at(i).node();
2406 if (node == ignore_this_node) continue;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002407 int node_eats_at_least =
2408 node->EatsAtLeast(still_to_find, budget, not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002409 if (node_eats_at_least < min) min = node_eats_at_least;
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002410 if (min == 0) return 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002411 }
2412 return min;
2413}
2414
2415
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002416int LoopChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002417 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002418 bool not_at_start) {
2419 return EatsAtLeastHelper(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002420 budget - 1,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002421 loop_node_,
2422 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002423}
2424
2425
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002426int ChoiceNode::EatsAtLeast(int still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002427 int budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002428 bool not_at_start) {
2429 return EatsAtLeastHelper(still_to_find,
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002430 budget,
kasperl@chromium.orga5551262010-12-07 12:49:48 +00002431 NULL,
2432 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002433}
2434
2435
// Copies the most significant set bit of |v| into every bit position below
// it, e.g. 0x10 becomes 0x1f.  Zero is returned unchanged.
static inline uint32_t SmearBitsRight(uint32_t v) {
  // Doubling the shift each round covers all 32 bits in five steps.
  for (int shift = 1; shift <= 16; shift *= 2) {
    v |= v >> shift;
  }
  return v;
}
2445
2446
2447bool QuickCheckDetails::Rationalize(bool asc) {
2448 bool found_useful_op = false;
2449 uint32_t char_mask;
2450 if (asc) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002451 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002452 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002453 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002454 }
2455 mask_ = 0;
2456 value_ = 0;
2457 int char_shift = 0;
2458 for (int i = 0; i < characters_; i++) {
2459 Position* pos = &positions_[i];
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002460 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002461 found_useful_op = true;
2462 }
2463 mask_ |= (pos->mask & char_mask) << char_shift;
2464 value_ |= (pos->value & char_mask) << char_shift;
2465 char_shift += asc ? 8 : 16;
2466 }
2467 return found_useful_op;
2468}
2469
2470
2471bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002472 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002473 bool preload_has_checked_bounds,
2474 Label* on_possible_success,
2475 QuickCheckDetails* details,
2476 bool fall_through_on_failure) {
2477 if (details->characters() == 0) return false;
danno@chromium.orgbee51992013-07-10 14:57:15 +00002478 GetQuickCheckDetails(
2479 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002480 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002481 if (!details->Rationalize(compiler->ascii())) return false;
ager@chromium.org18ad94b2009-09-02 08:22:29 +00002482 ASSERT(details->characters() == 1 ||
2483 compiler->macro_assembler()->CanReadUnaligned());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002484 uint32_t mask = details->mask();
2485 uint32_t value = details->value();
2486
2487 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2488
ager@chromium.org32912102009-01-16 10:38:43 +00002489 if (trace->characters_preloaded() != details->characters()) {
2490 assembler->LoadCurrentCharacter(trace->cp_offset(),
2491 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002492 !preload_has_checked_bounds,
2493 details->characters());
2494 }
2495
2496
2497 bool need_mask = true;
2498
2499 if (details->characters() == 1) {
2500 // If number of characters preloaded is 1 then we used a byte or 16 bit
2501 // load so the value is already masked down.
2502 uint32_t char_mask;
2503 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002504 char_mask = String::kMaxOneByteCharCode;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002505 } else {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00002506 char_mask = String::kMaxUtf16CodeUnit;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002507 }
2508 if ((mask & char_mask) == char_mask) need_mask = false;
2509 mask &= char_mask;
2510 } else {
ricow@chromium.org5ad5ace2010-06-23 09:06:43 +00002511 // For 2-character preloads in ASCII mode or 1-character preloads in
2512 // TWO_BYTE mode we also use a 16 bit load with zero extend.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002513 if (details->characters() == 2 && compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002514 if ((mask & 0xffff) == 0xffff) need_mask = false;
ricow@chromium.org5ad5ace2010-06-23 09:06:43 +00002515 } else if (details->characters() == 1 && !compiler->ascii()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002516 if ((mask & 0xffff) == 0xffff) need_mask = false;
2517 } else {
2518 if (mask == 0xffffffff) need_mask = false;
2519 }
2520 }
2521
2522 if (fall_through_on_failure) {
2523 if (need_mask) {
2524 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2525 } else {
2526 assembler->CheckCharacter(value, on_possible_success);
2527 }
2528 } else {
2529 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002530 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002531 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002532 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002533 }
2534 }
2535 return true;
2536}
2537
2538
// Here is the meat of GetQuickCheckDetails (see also the comment on the
// super-class in the .h file).
//
// We iterate along the text object, building up for each character a
// mask and value that can be used to test for a quick failure to match.
// The masks and values for the positions will be combined into a single
// machine word for the current character width in order to be used in
// generating a quick check.
//
// Positions are filled in starting at index |characters_filled_in| and the
// function returns as soon as details->characters() positions have been
// described.  If this text node runs out of characters first, the remaining
// positions are delegated to on_success().  When the node can be proven
// unable to match in the current character width, the details are marked
// with set_cannot_match() and the function returns immediately.
void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
                                    int characters_filled_in,
                                    bool not_at_start) {
  // Only needed for the case-independent letter lookup below.
  Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
  ASSERT(characters_filled_in < details->characters());
  int characters = details->characters();
  // Largest code unit representable in the subject string encoding; also
  // used as the "compare every bit" mask for a single character.
  int char_mask;
  if (compiler->ascii()) {
    char_mask = String::kMaxOneByteCharCode;
  } else {
    char_mask = String::kMaxUtf16CodeUnit;
  }
  for (int k = 0; k < elms_->length(); k++) {
    TextElement elm = elms_->at(k);
    if (elm.text_type() == TextElement::ATOM) {
      Vector<const uc16> quarks = elm.atom()->data();
      for (int i = 0; i < characters && i < quarks.length(); i++) {
        QuickCheckDetails::Position* pos =
            details->positions(characters_filled_in);
        uc16 c = quarks[i];
        if (c > char_mask) {
          // If we expect a non-ASCII character from an ASCII string,
          // there is no way we can match. Not even case independent
          // matching can turn an ASCII character into non-ASCII or
          // vice versa.
          details->set_cannot_match();
          pos->determines_perfectly = false;
          return;
        }
        if (compiler->ignore_case()) {
          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
          int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
                                                 chars);
          ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
          if (length == 1) {
            // This letter has no case equivalents, so it's nice and simple
            // and the mask-compare will determine definitely whether we have
            // a match at this character position.
            pos->mask = char_mask;
            pos->value = c;
            pos->determines_perfectly = true;
          } else {
            // Reduce the case variants to the bits they all share:
            // common_bits is the mask of agreeing bit positions and bits
            // holds the agreed values at those positions.
            uint32_t common_bits = char_mask;
            uint32_t bits = chars[0];
            for (int j = 1; j < length; j++) {
              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
              common_bits ^= differing_bits;
              bits &= common_bits;
            }
            // If length is 2 and common bits has only one zero in it then
            // our mask and compare instruction will determine definitely
            // whether we have a match at this character position.  Otherwise
            // it can only be an approximate check.
            uint32_t one_zero = (common_bits | ~char_mask);
            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
              pos->determines_perfectly = true;
            }
            pos->mask = common_bits;
            pos->value = bits;
          }
        } else {
          // Don't ignore case.  Nice simple case where the mask-compare will
          // determine definitely whether we have a match at this character
          // position.
          pos->mask = char_mask;
          pos->value = c;
          pos->determines_perfectly = true;
        }
        characters_filled_in++;
        ASSERT(characters_filled_in <= details->characters());
        if (characters_filled_in == details->characters()) {
          return;
        }
      }
    } else {
      QuickCheckDetails::Position* pos =
          details->positions(characters_filled_in);
      RegExpCharacterClass* tree = elm.char_class();
      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
      if (tree->is_negated()) {
        // A quick check uses multi-character mask and compare.  There is no
        // useful way to incorporate a negative char class into this scheme
        // so we just conservatively create a mask and value that will always
        // succeed.
        pos->mask = 0;
        pos->value = 0;
      } else {
        // Skip leading ranges that start beyond the representable characters.
        // NOTE(review): ranges->at(first_range) is read before any length
        // check, so this presumably relies on |ranges| being non-empty here;
        // confirm against the code that builds character classes.
        int first_range = 0;
        while (ranges->at(first_range).from() > char_mask) {
          first_range++;
          if (first_range == ranges->length()) {
            details->set_cannot_match();
            pos->determines_perfectly = false;
            return;
          }
        }
        CharacterRange range = ranges->at(first_range);
        uc16 from = range.from();
        uc16 to = range.to();
        if (to > char_mask) {
          to = char_mask;
        }
        uint32_t differing_bits = (from ^ to);
        // A mask and compare is only perfect if the differing bits form a
        // number like 00011111 with one single block of trailing 1s.
        if ((differing_bits & (differing_bits + 1)) == 0 &&
             from + differing_bits == to) {
          pos->determines_perfectly = true;
        }
        uint32_t common_bits = ~SmearBitsRight(differing_bits);
        uint32_t bits = (from & common_bits);
        for (int i = first_range + 1; i < ranges->length(); i++) {
          CharacterRange range = ranges->at(i);
          uc16 from = range.from();
          uc16 to = range.to();
          if (from > char_mask) continue;
          if (to > char_mask) to = char_mask;
          // Here we are combining more ranges into the mask and compare
          // value.  With each new range the mask becomes more sparse and
          // so the chances of a false positive rise.  A character class
          // with multiple ranges is assumed never to be equivalent to a
          // mask and compare operation.
          pos->determines_perfectly = false;
          uint32_t new_common_bits = (from ^ to);
          new_common_bits = ~SmearBitsRight(new_common_bits);
          common_bits &= new_common_bits;
          bits &= new_common_bits;
          uint32_t differing_bits = (from & common_bits) ^ bits;
          common_bits ^= differing_bits;
          bits &= common_bits;
        }
        pos->mask = common_bits;
        pos->value = bits;
      }
      characters_filled_in++;
      ASSERT(characters_filled_in <= details->characters());
      if (characters_filled_in == details->characters()) {
        return;
      }
    }
  }
  // This text node did not supply enough characters; let the successor
  // describe the remaining positions.  The successor is always queried with
  // not_at_start == true.
  ASSERT(characters_filled_in != details->characters());
  if (!details->cannot_match()) {
    on_success()-> GetQuickCheckDetails(details,
                                        compiler,
                                        characters_filled_in,
                                        true);
  }
}
2697
2698
2699void QuickCheckDetails::Clear() {
2700 for (int i = 0; i < characters_; i++) {
2701 positions_[i].mask = 0;
2702 positions_[i].value = 0;
2703 positions_[i].determines_perfectly = false;
2704 }
2705 characters_ = 0;
2706}
2707
2708
2709void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002710 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002711 if (by >= characters_) {
2712 Clear();
2713 return;
2714 }
2715 for (int i = 0; i < characters_ - by; i++) {
2716 positions_[i] = positions_[by + i];
2717 }
2718 for (int i = characters_ - by; i < characters_; i++) {
2719 positions_[i].mask = 0;
2720 positions_[i].value = 0;
2721 positions_[i].determines_perfectly = false;
2722 }
2723 characters_ -= by;
2724 // We could change mask_ and value_ here but we would never advance unless
2725 // they had already been used in a check and they won't be used again because
2726 // it would gain us nothing. So there's no point.
2727}
2728
2729
2730void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2731 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002732 if (other->cannot_match_) {
2733 return;
2734 }
2735 if (cannot_match_) {
2736 *this = *other;
2737 return;
2738 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002739 for (int i = from_index; i < characters_; i++) {
2740 QuickCheckDetails::Position* pos = positions(i);
2741 QuickCheckDetails::Position* other_pos = other->positions(i);
2742 if (pos->mask != other_pos->mask ||
2743 pos->value != other_pos->value ||
2744 !other_pos->determines_perfectly) {
2745 // Our mask-compare operation will be approximate unless we have the
2746 // exact same operation on both sides of the alternation.
2747 pos->determines_perfectly = false;
2748 }
2749 pos->mask &= other_pos->mask;
2750 pos->value &= pos->mask;
2751 other_pos->value &= pos->mask;
2752 uc16 differing_bits = (pos->value ^ other_pos->value);
2753 pos->mask &= ~differing_bits;
2754 pos->value &= pos->mask;
2755 }
2756}
2757
2758
ager@chromium.org32912102009-01-16 10:38:43 +00002759class VisitMarker {
2760 public:
2761 explicit VisitMarker(NodeInfo* info) : info_(info) {
2762 ASSERT(!info->visited);
2763 info->visited = true;
2764 }
2765 ~VisitMarker() {
2766 info_->visited = false;
2767 }
2768 private:
2769 NodeInfo* info_;
2770};
2771
2772
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002773RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002774 if (info()->replacement_calculated) return replacement();
2775 if (depth < 0) return this;
2776 ASSERT(!info()->visited);
2777 VisitMarker marker(info());
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002778 return FilterSuccessor(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002779}
2780
2781
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002782RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2783 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002784 if (next == NULL) return set_replacement(NULL);
2785 on_success_ = next;
2786 return set_replacement(this);
2787}
2788
2789
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002790// We need to check for the following characters: 0x39c 0x3bc 0x178.
2791static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002792 // TODO(dcarney): this could be a lot more efficient.
2793 return range.Contains(0x39c) ||
2794 range.Contains(0x3bc) || range.Contains(0x178);
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002795}
2796
2797
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002798static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2799 for (int i = 0; i < ranges->length(); i++) {
2800 // TODO(dcarney): this could be a lot more efficient.
2801 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2802 }
2803 return false;
2804}
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002805
2806
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002807RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002808 if (info()->replacement_calculated) return replacement();
2809 if (depth < 0) return this;
2810 ASSERT(!info()->visited);
2811 VisitMarker marker(info());
2812 int element_count = elms_->length();
2813 for (int i = 0; i < element_count; i++) {
2814 TextElement elm = elms_->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00002815 if (elm.text_type() == TextElement::ATOM) {
2816 Vector<const uc16> quarks = elm.atom()->data();
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002817 for (int j = 0; j < quarks.length(); j++) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002818 uint16_t c = quarks[j];
2819 if (c <= String::kMaxOneByteCharCode) continue;
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002820 if (!ignore_case) return set_replacement(NULL);
2821 // Here, we need to check for characters whose upper and lower cases
2822 // are outside the Latin-1 range.
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002823 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2824 // Character is outside Latin-1 completely
2825 if (converted == 0) return set_replacement(NULL);
2826 // Convert quark to Latin-1 in place.
2827 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2828 copy[j] = converted;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002829 }
2830 } else {
rossberg@chromium.org92597162013-08-23 13:28:00 +00002831 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2832 RegExpCharacterClass* cc = elm.char_class();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002833 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002834 if (!CharacterRange::IsCanonical(ranges)) {
2835 CharacterRange::Canonicalize(ranges);
2836 }
2837 // Now they are in order so we only need to look at the first.
2838 int range_count = ranges->length();
2839 if (cc->is_negated()) {
2840 if (range_count != 0 &&
2841 ranges->at(0).from() == 0 &&
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002842 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002843 // This will be handled in a later filter.
2844 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002845 return set_replacement(NULL);
2846 }
2847 } else {
2848 if (range_count == 0 ||
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002849 ranges->at(0).from() > String::kMaxOneByteCharCode) {
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00002850 // This will be handled in a later filter.
2851 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002852 return set_replacement(NULL);
2853 }
2854 }
2855 }
2856 }
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002857 return FilterSuccessor(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002858}
2859
2860
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002861RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002862 if (info()->replacement_calculated) return replacement();
2863 if (depth < 0) return this;
2864 if (info()->visited) return this;
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002865 {
2866 VisitMarker marker(info());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002867
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002868 RegExpNode* continue_replacement =
2869 continue_node_->FilterASCII(depth - 1, ignore_case);
danno@chromium.org2c26cb12012-05-03 09:06:43 +00002870 // If we can't continue after the loop then there is no sense in doing the
2871 // loop.
2872 if (continue_replacement == NULL) return set_replacement(NULL);
2873 }
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002874
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002875 return ChoiceNode::FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002876}
2877
2878
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002879RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002880 if (info()->replacement_calculated) return replacement();
2881 if (depth < 0) return this;
2882 if (info()->visited) return this;
2883 VisitMarker marker(info());
2884 int choice_count = alternatives_->length();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002885
2886 for (int i = 0; i < choice_count; i++) {
2887 GuardedAlternative alternative = alternatives_->at(i);
2888 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2889 set_replacement(this);
2890 return this;
2891 }
2892 }
2893
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002894 int surviving = 0;
2895 RegExpNode* survivor = NULL;
2896 for (int i = 0; i < choice_count; i++) {
2897 GuardedAlternative alternative = alternatives_->at(i);
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002898 RegExpNode* replacement =
2899 alternative.node()->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002900 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002901 if (replacement != NULL) {
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002902 alternatives_->at(i).set_node(replacement);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002903 surviving++;
2904 survivor = replacement;
2905 }
2906 }
2907 if (surviving < 2) return set_replacement(survivor);
2908
2909 set_replacement(this);
2910 if (surviving == choice_count) {
2911 return this;
2912 }
2913 // Only some of the nodes survived the filtering. We need to rebuild the
2914 // alternatives list.
2915 ZoneList<GuardedAlternative>* new_alternatives =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002916 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002917 for (int i = 0; i < choice_count; i++) {
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002918 RegExpNode* replacement =
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002919 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
danno@chromium.orgb10deab2012-05-07 14:28:47 +00002920 if (replacement != NULL) {
2921 alternatives_->at(i).set_node(replacement);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00002922 new_alternatives->Add(alternatives_->at(i), zone());
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002923 }
2924 }
2925 alternatives_ = new_alternatives;
2926 return this;
2927}
2928
2929
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002930RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
2931 bool ignore_case) {
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002932 if (info()->replacement_calculated) return replacement();
2933 if (depth < 0) return this;
2934 if (info()->visited) return this;
2935 VisitMarker marker(info());
2936 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2937 // afterwards.
2938 RegExpNode* node = alternatives_->at(1).node();
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002939 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002940 if (replacement == NULL) return set_replacement(NULL);
2941 alternatives_->at(1).set_node(replacement);
2942
2943 RegExpNode* neg_node = alternatives_->at(0).node();
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00002944 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
danno@chromium.org1044a4d2012-04-30 12:34:39 +00002945 // If the negative lookahead is always going to fail then
2946 // we don't need to check it.
2947 if (neg_replacement == NULL) return set_replacement(replacement);
2948 alternatives_->at(0).set_node(neg_replacement);
2949 return set_replacement(this);
2950}
2951
2952
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002953void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2954 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002955 int characters_filled_in,
2956 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002957 if (body_can_be_zero_length_ || info()->visited) return;
2958 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002959 return ChoiceNode::GetQuickCheckDetails(details,
2960 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002961 characters_filled_in,
2962 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002963}
2964
2965
verwaest@chromium.org37141392012-05-31 13:27:02 +00002966void LoopChoiceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00002967 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00002968 BoyerMooreLookahead* bm,
2969 bool not_at_start) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002970 if (body_can_be_zero_length_ || budget <= 0) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002971 bm->SetRest(offset);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002972 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002973 return;
2974 }
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00002975 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00002976 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00002977}
2978
2979
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002980void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2981 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002982 int characters_filled_in,
2983 bool not_at_start) {
2984 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002985 int choice_count = alternatives_->length();
2986 ASSERT(choice_count > 0);
2987 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2988 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002989 characters_filled_in,
2990 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002991 for (int i = 1; i < choice_count; i++) {
2992 QuickCheckDetails new_details(details->characters());
2993 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002994 node->GetQuickCheckDetails(&new_details, compiler,
2995 characters_filled_in,
2996 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002997 // Here we merge the quick match details of the two branches.
2998 details->Merge(&new_details, characters_filled_in);
2999 }
3000}
3001
3002
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003003// Check for [0-9A-Z_a-z].
3004static void EmitWordCheck(RegExpMacroAssembler* assembler,
3005 Label* word,
3006 Label* non_word,
3007 bool fall_through_on_word) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003008 if (assembler->CheckSpecialCharacterClass(
3009 fall_through_on_word ? 'w' : 'W',
3010 fall_through_on_word ? non_word : word)) {
3011 // Optimized implementation available.
3012 return;
3013 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003014 assembler->CheckCharacterGT('z', non_word);
3015 assembler->CheckCharacterLT('0', non_word);
3016 assembler->CheckCharacterGT('a' - 1, word);
3017 assembler->CheckCharacterLT('9' + 1, word);
3018 assembler->CheckCharacterLT('A', non_word);
3019 assembler->CheckCharacterLT('Z' + 1, word);
3020 if (fall_through_on_word) {
3021 assembler->CheckNotCharacter('_', non_word);
3022 } else {
3023 assembler->CheckCharacter('_', word);
3024 }
3025}
3026
3027
3028// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3029// that matches newline or the start of input).
3030static void EmitHat(RegExpCompiler* compiler,
3031 RegExpNode* on_success,
3032 Trace* trace) {
3033 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3034 // We will be loading the previous character into the current character
3035 // register.
3036 Trace new_trace(*trace);
3037 new_trace.InvalidateCurrentCharacter();
3038
3039 Label ok;
3040 if (new_trace.cp_offset() == 0) {
3041 // The start of input counts as a newline in this context, so skip to
3042 // ok if we are at the start.
3043 assembler->CheckAtStart(&ok);
3044 }
3045 // We already checked that we are not at the start of input so it must be
3046 // OK to load the previous character.
3047 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3048 new_trace.backtrack(),
3049 false);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003050 if (!assembler->CheckSpecialCharacterClass('n',
3051 new_trace.backtrack())) {
3052 // Newline means \n, \r, 0x2028 or 0x2029.
3053 if (!compiler->ascii()) {
3054 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3055 }
3056 assembler->CheckCharacter('\n', &ok);
3057 assembler->CheckNotCharacter('\r', new_trace.backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003058 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003059 assembler->Bind(&ok);
3060 on_success->Emit(compiler, &new_trace);
3061}
3062
3063
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003064// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3065void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003066 RegExpMacroAssembler* assembler = compiler->macro_assembler();
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003067 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
danno@chromium.orgbee51992013-07-10 14:57:15 +00003068 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003069 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3070 if (lookahead == NULL) {
3071 int eats_at_least =
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00003072 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3073 kRecursionBudget,
3074 not_at_start));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003075 if (eats_at_least >= 1) {
3076 BoyerMooreLookahead* bm =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003077 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00003078 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
danno@chromium.orgbee51992013-07-10 14:57:15 +00003079 if (bm->at(0)->is_non_word())
3080 next_is_word_character = Trace::FALSE_VALUE;
3081 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003082 }
3083 } else {
danno@chromium.orgbee51992013-07-10 14:57:15 +00003084 if (lookahead->at(0)->is_non_word())
3085 next_is_word_character = Trace::FALSE_VALUE;
3086 if (lookahead->at(0)->is_word())
3087 next_is_word_character = Trace::TRUE_VALUE;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003088 }
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003089 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003090 if (next_is_word_character == Trace::UNKNOWN) {
3091 Label before_non_word;
3092 Label before_word;
3093 if (trace->characters_preloaded() != 1) {
3094 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3095 }
3096 // Fall through on non-word.
3097 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3098 // Next character is not a word character.
3099 assembler->Bind(&before_non_word);
3100 Label ok;
3101 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3102 assembler->GoTo(&ok);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003103
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003104 assembler->Bind(&before_word);
3105 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3106 assembler->Bind(&ok);
danno@chromium.orgbee51992013-07-10 14:57:15 +00003107 } else if (next_is_word_character == Trace::TRUE_VALUE) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003108 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3109 } else {
danno@chromium.orgbee51992013-07-10 14:57:15 +00003110 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003111 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3112 }
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003113}
3114
3115
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003116void AssertionNode::BacktrackIfPrevious(
3117 RegExpCompiler* compiler,
3118 Trace* trace,
3119 AssertionNode::IfPrevious backtrack_if_previous) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003120 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003121 Trace new_trace(*trace);
3122 new_trace.InvalidateCurrentCharacter();
3123
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003124 Label fall_through, dummy;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003125
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003126 Label* non_word = backtrack_if_previous == kIsNonWord ?
3127 new_trace.backtrack() :
3128 &fall_through;
3129 Label* word = backtrack_if_previous == kIsNonWord ?
3130 &fall_through :
3131 new_trace.backtrack();
3132
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003133 if (new_trace.cp_offset() == 0) {
3134 // The start of input counts as a non-word character, so the question is
3135 // decided if we are at the start.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003136 assembler->CheckAtStart(non_word);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003137 }
3138 // We already checked that we are not at the start of input so it must be
3139 // OK to load the previous character.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003140 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3141 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003142
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003143 assembler->Bind(&fall_through);
3144 on_success()->Emit(compiler, &new_trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003145}
3146
3147
iposva@chromium.org245aa852009-02-10 00:49:54 +00003148void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3149 RegExpCompiler* compiler,
3150 int filled_in,
3151 bool not_at_start) {
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003152 if (assertion_type_ == AT_START && not_at_start) {
iposva@chromium.org245aa852009-02-10 00:49:54 +00003153 details->set_cannot_match();
3154 return;
3155 }
3156 return on_success()->GetQuickCheckDetails(details,
3157 compiler,
3158 filled_in,
3159 not_at_start);
3160}
3161
3162
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003163void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3164 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00003165 switch (assertion_type_) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003166 case AT_END: {
3167 Label ok;
3168 assembler->CheckPosition(trace->cp_offset(), &ok);
3169 assembler->GoTo(trace->backtrack());
3170 assembler->Bind(&ok);
3171 break;
3172 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00003173 case AT_START: {
danno@chromium.orgbee51992013-07-10 14:57:15 +00003174 if (trace->at_start() == Trace::FALSE_VALUE) {
iposva@chromium.org245aa852009-02-10 00:49:54 +00003175 assembler->GoTo(trace->backtrack());
3176 return;
3177 }
3178 if (trace->at_start() == Trace::UNKNOWN) {
3179 assembler->CheckNotAtStart(trace->backtrack());
3180 Trace at_start_trace = *trace;
3181 at_start_trace.set_at_start(true);
3182 on_success()->Emit(compiler, &at_start_trace);
3183 return;
3184 }
3185 }
3186 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003187 case AFTER_NEWLINE:
3188 EmitHat(compiler, on_success(), trace);
3189 return;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003190 case AT_BOUNDARY:
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003191 case AT_NON_BOUNDARY: {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003192 EmitBoundaryCheck(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003193 return;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00003194 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003195 }
3196 on_success()->Emit(compiler, trace);
3197}
3198
3199
ager@chromium.org381abbb2009-02-25 13:23:22 +00003200static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3201 if (quick_check == NULL) return false;
3202 if (offset >= quick_check->characters()) return false;
3203 return quick_check->positions(offset)->determines_perfectly;
3204}
3205
3206
// Raises *checked_up_to to 'index' if 'index' is larger; never lowers it.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
  if (*checked_up_to >= index) return;
  *checked_up_to = index;
}
3212
3213
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003214// We call this repeatedly to generate code for each pass over the text node.
3215// The passes are in increasing order of difficulty because we hope one
3216// of the first passes will fail in which case we are saved the work of the
3217// later passes. for example for the case independent regexp /%[asdfghjkl]a/
3218// we will check the '%' in the first pass, the case independent 'a' in the
3219// second pass and the character class in the last pass.
3220//
3221// The passes are done from right to left, so for example to test for /bar/
3222// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3223// and then a 'b' with offset 0. This means we can avoid the end-of-input
3224// bounds check most of the time. In the example we only need to check for
3225// end-of-input when loading the putative 'r'.
3226//
3227// A slight complication involves the fact that the first character may already
3228// be fetched into a register by the previous node. In this case we want to
3229// do the test for that character first. We do this in separate passes. The
3230// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3231// pass has been performed then subsequent passes will have true in
3232// first_element_checked to indicate that that character does not need to be
3233// checked again.
3234//
ager@chromium.org32912102009-01-16 10:38:43 +00003235// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003236// contain an AlternativeGeneration object. In this AlternativeGeneration
3237// object we can see details of any quick check that was already passed in
3238// order to get to the code we are now generating. The quick check can involve
3239// loading characters, which means we do not need to recheck the bounds
3240// up to the limit the quick check already checked. In addition the quick
3241// check can have involved a mask and compare operation which may simplify
3242// or obviate the need for further checks at some character positions.
3243void TextNode::TextEmitPass(RegExpCompiler* compiler,
3244 TextEmitPassType pass,
3245 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00003246 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003247 bool first_element_checked,
3248 int* checked_up_to) {
3249 RegExpMacroAssembler* assembler = compiler->macro_assembler();
jkummerow@chromium.org3d00d0a2013-09-04 13:57:32 +00003250 Isolate* isolate = assembler->zone()->isolate();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003251 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00003252 Label* backtrack = trace->backtrack();
3253 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003254 int element_count = elms_->length();
3255 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3256 TextElement elm = elms_->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00003257 int cp_offset = trace->cp_offset() + elm.cp_offset();
3258 if (elm.text_type() == TextElement::ATOM) {
3259 Vector<const uc16> quarks = elm.atom()->data();
ager@chromium.org381abbb2009-02-25 13:23:22 +00003260 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3261 if (first_element_checked && i == 0 && j == 0) continue;
rossberg@chromium.org92597162013-08-23 13:28:00 +00003262 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
ager@chromium.org381abbb2009-02-25 13:23:22 +00003263 EmitCharacterFunction* emit_function = NULL;
3264 switch (pass) {
3265 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003266 ASSERT(ascii);
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003267 if (quarks[j] > String::kMaxOneByteCharCode) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003268 assembler->GoTo(backtrack);
3269 return;
3270 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00003271 break;
3272 case NON_LETTER_CHARACTER_MATCH:
3273 emit_function = &EmitAtomNonLetter;
3274 break;
3275 case SIMPLE_CHARACTER_MATCH:
3276 emit_function = &EmitSimpleCharacter;
3277 break;
3278 case CASE_CHARACTER_MATCH:
3279 emit_function = &EmitAtomLetter;
3280 break;
3281 default:
3282 break;
3283 }
3284 if (emit_function != NULL) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00003285 bool bound_checked = emit_function(isolate,
3286 compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00003287 quarks[j],
3288 backtrack,
3289 cp_offset + j,
3290 *checked_up_to < cp_offset + j,
3291 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00003292 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003293 }
3294 }
3295 } else {
rossberg@chromium.org92597162013-08-23 13:28:00 +00003296 ASSERT_EQ(TextElement::CHAR_CLASS, elm.text_type());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003297 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00003298 if (first_element_checked && i == 0) continue;
rossberg@chromium.org92597162013-08-23 13:28:00 +00003299 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3300 RegExpCharacterClass* cc = elm.char_class();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003301 EmitCharClass(assembler,
3302 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003303 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00003304 backtrack,
3305 cp_offset,
3306 *checked_up_to < cp_offset,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003307 preloaded,
3308 zone());
ager@chromium.org381abbb2009-02-25 13:23:22 +00003309 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003310 }
3311 }
3312 }
3313}
3314
3315
3316int TextNode::Length() {
3317 TextElement elm = elms_->last();
rossberg@chromium.org92597162013-08-23 13:28:00 +00003318 ASSERT(elm.cp_offset() >= 0);
3319 return elm.cp_offset() + elm.length();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003320}
3321
3322
ager@chromium.org381abbb2009-02-25 13:23:22 +00003323bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3324 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3325 if (ignore_case) {
3326 return pass == SIMPLE_CHARACTER_MATCH;
3327 } else {
3328 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3329 }
3330}
3331
3332
ager@chromium.org8bb60582008-12-11 12:02:20 +00003333// This generates the code to match a text node. A text node can contain
3334// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003335// way) and character classes. For efficiency we do not do this in a single
3336// pass from left to right. Instead we pass over the text node several times,
3337// emitting code for some character positions every time. See the comment on
3338// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003339void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00003340 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003341 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003342 ASSERT(limit_result == CONTINUE);
3343
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003344 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3345 compiler->SetRegExpTooBig();
3346 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003347 }
3348
3349 if (compiler->ascii()) {
3350 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00003351 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003352 }
3353
3354 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00003355 int bound_checked_to = trace->cp_offset() - 1;
3356 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003357
3358 // If a character is preloaded into the current character register then
3359 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00003360 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00003361 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3362 if (!SkipPass(pass, compiler->ignore_case())) {
3363 TextEmitPass(compiler,
3364 static_cast<TextEmitPassType>(pass),
3365 true,
3366 trace,
3367 false,
3368 &bound_checked_to);
3369 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003370 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003371 first_elt_done = true;
3372 }
3373
ager@chromium.org381abbb2009-02-25 13:23:22 +00003374 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3375 if (!SkipPass(pass, compiler->ignore_case())) {
3376 TextEmitPass(compiler,
3377 static_cast<TextEmitPassType>(pass),
3378 false,
3379 trace,
3380 first_elt_done,
3381 &bound_checked_to);
3382 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003383 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003384
ager@chromium.org32912102009-01-16 10:38:43 +00003385 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003386 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003387 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003388 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003389 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003390}
3391
3392
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003393void Trace::InvalidateCurrentCharacter() {
3394 characters_preloaded_ = 0;
3395}
3396
3397
3398void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003399 ASSERT(by > 0);
3400 // We don't have an instruction for shifting the current character register
3401 // down or for using a shifted value for anything so lets just forget that
3402 // we preloaded any characters into it.
3403 characters_preloaded_ = 0;
3404 // Adjust the offsets of the quick check performed information. This
3405 // information is used to find out what we already determined about the
3406 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003407 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003408 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003409 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3410 compiler->SetRegExpTooBig();
3411 cp_offset_ = 0;
3412 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003413 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003414}
3415
3416
ager@chromium.org38e4c712009-11-11 09:11:58 +00003417void TextNode::MakeCaseIndependent(bool is_ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003418 int element_count = elms_->length();
3419 for (int i = 0; i < element_count; i++) {
3420 TextElement elm = elms_->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00003421 if (elm.text_type() == TextElement::CHAR_CLASS) {
3422 RegExpCharacterClass* cc = elm.char_class();
erik.corry@gmail.comf2038fb2012-01-16 11:42:08 +00003423 // None of the standard character classes is different in the case
ager@chromium.org38e4c712009-11-11 09:11:58 +00003424 // independent case and it slows us down if we don't know that.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003425 if (cc->is_standard(zone())) continue;
3426 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003427 int range_count = ranges->length();
ager@chromium.org38e4c712009-11-11 09:11:58 +00003428 for (int j = 0; j < range_count; j++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003429 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003430 }
3431 }
3432 }
3433}
3434
3435
ager@chromium.org8bb60582008-12-11 12:02:20 +00003436int TextNode::GreedyLoopTextLength() {
3437 TextElement elm = elms_->at(elms_->length() - 1);
rossberg@chromium.org92597162013-08-23 13:28:00 +00003438 return elm.cp_offset() + elm.length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003439}
3440
3441
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003442RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3443 RegExpCompiler* compiler) {
3444 if (elms_->length() != 1) return NULL;
3445 TextElement elm = elms_->at(0);
rossberg@chromium.org92597162013-08-23 13:28:00 +00003446 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3447 RegExpCharacterClass* node = elm.char_class();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003448 ZoneList<CharacterRange>* ranges = node->ranges(zone());
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003449 if (!CharacterRange::IsCanonical(ranges)) {
3450 CharacterRange::Canonicalize(ranges);
3451 }
3452 if (node->is_negated()) {
3453 return ranges->length() == 0 ? on_success() : NULL;
3454 }
3455 if (ranges->length() != 1) return NULL;
3456 uint32_t max_char;
3457 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003458 max_char = String::kMaxOneByteCharCode;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003459 } else {
3460 max_char = String::kMaxUtf16CodeUnit;
3461 }
3462 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3463}
3464
3465
ager@chromium.org8bb60582008-12-11 12:02:20 +00003466// Finds the fixed match length of a sequence of nodes that goes from
3467// this alternative and back to this choice node. If there are variable
3468// length nodes or other complications in the way then return a sentinel
3469// value indicating that a greedy loop cannot be constructed.
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003470int ChoiceNode::GreedyLoopTextLengthForAlternative(
3471 GuardedAlternative* alternative) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003472 int length = 0;
3473 RegExpNode* node = alternative->node();
3474 // Later we will generate code for all these text nodes using recursion
3475 // so we have to limit the max number.
3476 int recursion_depth = 0;
3477 while (node != this) {
3478 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3479 return kNodeIsTooComplexForGreedyLoops;
3480 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003481 int node_length = node->GreedyLoopTextLength();
3482 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3483 return kNodeIsTooComplexForGreedyLoops;
3484 }
3485 length += node_length;
3486 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3487 node = seq_node->on_success();
3488 }
3489 return length;
3490}
3491
3492
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003493void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3494 ASSERT_EQ(loop_node_, NULL);
3495 AddAlternative(alt);
3496 loop_node_ = alt.node();
3497}
3498
3499
3500void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3501 ASSERT_EQ(continue_node_, NULL);
3502 AddAlternative(alt);
3503 continue_node_ = alt.node();
3504}
3505
3506
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003507void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003508 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003509 if (trace->stop_node() == this) {
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003510 int text_length =
3511 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003512 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
3513 // Update the counter-based backtracking info on the stack. This is an
3514 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00003515 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003516 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00003517 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003518 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003519 }
ager@chromium.org32912102009-01-16 10:38:43 +00003520 ASSERT(trace->stop_node() == NULL);
3521 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003522 trace->Flush(compiler, this);
3523 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003524 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003525 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003526}
3527
3528
kasperl@chromium.orga5551262010-12-07 12:49:48 +00003529int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003530 int eats_at_least) {
3531 int preload_characters = Min(4, eats_at_least);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00003532 if (compiler->macro_assembler()->CanReadUnaligned()) {
3533 bool ascii = compiler->ascii();
3534 if (ascii) {
3535 if (preload_characters > 4) preload_characters = 4;
3536 // We can't preload 3 characters because there is no machine instruction
3537 // to do that. We can't just load 4 because we could be reading
3538 // beyond the end of the string, which could cause a memory fault.
3539 if (preload_characters == 3) preload_characters = 2;
3540 } else {
3541 if (preload_characters > 2) preload_characters = 2;
3542 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003543 } else {
ager@chromium.org18ad94b2009-09-02 08:22:29 +00003544 if (preload_characters > 1) preload_characters = 1;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003545 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003546 return preload_characters;
3547}
3548
3549
3550// This class is used when generating the alternatives in a choice node. It
3551// records the way the alternative is being code generated.
3552class AlternativeGeneration: public Malloced {
3553 public:
3554 AlternativeGeneration()
3555 : possible_success(),
3556 expects_preload(false),
3557 after(),
3558 quick_check_details() { }
3559 Label possible_success;
3560 bool expects_preload;
3561 Label after;
3562 QuickCheckDetails quick_check_details;
3563};
3564
3565
3566// Creates a list of AlternativeGenerations. If the list has a reasonable
3567// size then it is on the stack, otherwise the excess is on the heap.
3568class AlternativeGenerationList {
3569 public:
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003570 AlternativeGenerationList(int count, Zone* zone)
3571 : alt_gens_(count, zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003572 for (int i = 0; i < count && i < kAFew; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003573 alt_gens_.Add(a_few_alt_gens_ + i, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003574 }
3575 for (int i = kAFew; i < count; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003576 alt_gens_.Add(new AlternativeGeneration(), zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003577 }
3578 }
3579 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003580 for (int i = kAFew; i < alt_gens_.length(); i++) {
3581 delete alt_gens_[i];
3582 alt_gens_[i] = NULL;
3583 }
3584 }
3585
3586 AlternativeGeneration* at(int i) {
3587 return alt_gens_[i];
3588 }
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00003589
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003590 private:
3591 static const int kAFew = 10;
3592 ZoneList<AlternativeGeneration*> alt_gens_;
3593 AlternativeGeneration a_few_alt_gens_[kAFew];
3594};
3595
3596
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003597// The '2' variant is has inclusive from and exclusive to.
3598static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
3599 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3600 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3601static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3602
3603static const int kWordRanges[] = {
3604 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3605static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3606static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3607static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3608static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3609static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3610static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3611 0x2028, 0x202A, 0x10000 };
3612static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3613
3614
3615void BoyerMoorePositionInfo::Set(int character) {
3616 SetInterval(Interval(character, character));
3617}
3618
3619
3620void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3621 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3622 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3623 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3624 surrogate_ =
3625 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3626 if (interval.to() - interval.from() >= kMapSize - 1) {
3627 if (map_count_ != kMapSize) {
3628 map_count_ = kMapSize;
3629 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3630 }
3631 return;
3632 }
3633 for (int i = interval.from(); i <= interval.to(); i++) {
3634 int mod_character = (i & kMask);
3635 if (!map_->at(mod_character)) {
3636 map_count_++;
3637 map_->at(mod_character) = true;
3638 }
3639 if (map_count_ == kMapSize) return;
3640 }
3641}
3642
3643
3644void BoyerMoorePositionInfo::SetAll() {
3645 s_ = w_ = d_ = kLatticeUnknown;
3646 if (map_count_ != kMapSize) {
3647 map_count_ = kMapSize;
3648 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3649 }
3650}
3651
3652
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003653BoyerMooreLookahead::BoyerMooreLookahead(
rossberg@chromium.org400388e2012-06-06 09:29:22 +00003654 int length, RegExpCompiler* compiler, Zone* zone)
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003655 : length_(length),
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003656 compiler_(compiler) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003657 if (compiler->ascii()) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00003658 max_char_ = String::kMaxOneByteCharCode;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003659 } else {
3660 max_char_ = String::kMaxUtf16CodeUnit;
3661 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003662 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003663 for (int i = 0; i < length; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00003664 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003665 }
3666}
3667
3668
3669// Find the longest range of lookahead that has the fewest number of different
3670// characters that can occur at a given position. Since we are optimizing two
3671// different parameters at once this is a tradeoff.
3672bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3673 int biggest_points = 0;
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003674 // If more than 32 characters out of 128 can occur it is unlikely that we can
3675 // be lucky enough to step forwards much of the time.
3676 const int kMaxMax = 32;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003677 for (int max_number_of_chars = 4;
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003678 max_number_of_chars < kMaxMax;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003679 max_number_of_chars *= 2) {
3680 biggest_points =
3681 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3682 }
3683 if (biggest_points == 0) return false;
3684 return true;
3685}
3686
3687
3688// Find the highest-points range between 0 and length_ where the character
3689// information is not too vague. 'Too vague' means that there are more than
3690// max_number_of_chars that can occur at this position. Calculates the number
3691// of points as the product of width-of-the-range and
3692// probability-of-finding-one-of-the-characters, where the probability is
3693// calculated using the frequency distribution of the sample subject string.
3694int BoyerMooreLookahead::FindBestInterval(
3695 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3696 int biggest_points = old_biggest_points;
3697 static const int kSize = RegExpMacroAssembler::kTableSize;
3698 for (int i = 0; i < length_; ) {
3699 while (i < length_ && Count(i) > max_number_of_chars) i++;
3700 if (i == length_) break;
3701 int remembered_from = i;
3702 bool union_map[kSize];
3703 for (int j = 0; j < kSize; j++) union_map[j] = false;
3704 while (i < length_ && Count(i) <= max_number_of_chars) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003705 BoyerMoorePositionInfo* map = bitmaps_->at(i);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003706 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3707 i++;
3708 }
3709 int frequency = 0;
3710 for (int j = 0; j < kSize; j++) {
3711 if (union_map[j]) {
3712 // Add 1 to the frequency to give a small per-character boost for
3713 // the cases where our sampling is not good enough and many
3714 // characters have a frequency of zero. This means the frequency
3715 // can theoretically be up to 2*kSize though we treat it mostly as
3716 // a fraction of kSize.
3717 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3718 }
3719 }
3720 // We use the probability of skipping times the distance we are skipping to
3721 // judge the effectiveness of this. Actually we have a cut-off: By
3722 // dividing by 2 we switch off the skipping if the probability of skipping
3723 // is less than 50%. This is because the multibyte mask-and-compare
3724 // skipping in quickcheck is more likely to do well on this case.
3725 bool in_quickcheck_range = ((i - remembered_from < 4) ||
3726 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3727 // Called 'probability' but it is only a rough estimate and can actually
3728 // be outside the 0-kSize range.
3729 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3730 int points = (i - remembered_from) * probability;
3731 if (points > biggest_points) {
3732 *from = remembered_from;
3733 *to = i - 1;
3734 biggest_points = points;
3735 }
3736 }
3737 return biggest_points;
3738}
3739
3740
3741// Take all the characters that will not prevent a successful match if they
3742// occur in the subject string in the range between min_lookahead and
3743// max_lookahead (inclusive) measured from the current position. If the
3744// character at max_lookahead offset is not one of these characters, then we
3745// can safely skip forwards by the number of characters in the range.
3746int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3747 int max_lookahead,
3748 Handle<ByteArray> boolean_skip_table) {
3749 const int kSize = RegExpMacroAssembler::kTableSize;
3750
3751 const int kSkipArrayEntry = 0;
3752 const int kDontSkipArrayEntry = 1;
3753
3754 for (int i = 0; i < kSize; i++) {
3755 boolean_skip_table->set(i, kSkipArrayEntry);
3756 }
3757 int skip = max_lookahead + 1 - min_lookahead;
3758
3759 for (int i = max_lookahead; i >= min_lookahead; i--) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003760 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3761 for (int j = 0; j < kSize; j++) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003762 if (map->at(j)) {
3763 boolean_skip_table->set(j, kDontSkipArrayEntry);
3764 }
3765 }
3766 }
3767
3768 return skip;
3769}
3770
3771
3772// See comment above on the implementation of GetSkipTable.
3773bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003774 const int kSize = RegExpMacroAssembler::kTableSize;
3775
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003776 int min_lookahead = 0;
3777 int max_lookahead = 0;
3778
3779 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3780
3781 bool found_single_character = false;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003782 int single_character = 0;
3783 for (int i = max_lookahead; i >= min_lookahead; i--) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003784 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3785 if (map->map_count() > 1 ||
3786 (found_single_character && map->map_count() != 0)) {
3787 found_single_character = false;
3788 break;
3789 }
3790 for (int j = 0; j < kSize; j++) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003791 if (map->at(j)) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003792 found_single_character = true;
3793 single_character = j;
3794 break;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003795 }
3796 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003797 }
3798
3799 int lookahead_width = max_lookahead + 1 - min_lookahead;
3800
3801 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3802 // The mask-compare can probably handle this better.
3803 return false;
3804 }
3805
3806 if (found_single_character) {
3807 Label cont, again;
3808 masm->Bind(&again);
3809 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00003810 if (max_char_ > kSize) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003811 masm->CheckCharacterAfterAnd(single_character,
3812 RegExpMacroAssembler::kTableMask,
3813 &cont);
3814 } else {
3815 masm->CheckCharacter(single_character, &cont);
3816 }
3817 masm->AdvanceCurrentPosition(lookahead_width);
3818 masm->GoTo(&again);
3819 masm->Bind(&cont);
3820 return true;
3821 }
3822
jkummerow@chromium.org3d00d0a2013-09-04 13:57:32 +00003823 Factory* factory = masm->zone()->isolate()->factory();
verwaest@chromium.orgd4be0f02013-06-05 13:39:03 +00003824 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003825 int skip_distance = GetSkipTable(
3826 min_lookahead, max_lookahead, boolean_skip_table);
3827 ASSERT(skip_distance != 0);
3828
3829 Label cont, again;
3830 masm->Bind(&again);
3831 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3832 masm->CheckBitInTable(boolean_skip_table, &cont);
3833 masm->AdvanceCurrentPosition(skip_distance);
3834 masm->GoTo(&again);
3835 masm->Bind(&cont);
3836
3837 return true;
3838}
3839
3840
/* Code generation for choice nodes.
 *
 * We generate quick checks that do a mask and compare to eliminate a
 * choice.  If the quick check succeeds then it jumps to the continuation to
 * do slow checks and check subsequent nodes.  If it fails (the common case)
 * it falls through to the next choice.
 *
 * Here is the desired flow graph.  Nodes directly below each other imply
 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
 * 3 doesn't have a quick check so we have to call the slow check.
 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
 * regexp continuation is generated directly after the Sn node, up to the
 * next GoTo if we decide to reuse some already generated code.  Some
 * nodes expect preload_characters to be preloaded into the current
 * character register.  R nodes do this preloading.  Vertices are marked
 * F for failures and S for success (possible success in the case of quick
 * nodes).  L, V, < and > are used as arrow heads.
 *
 * ----------> R
 *             |
 *             V
 *            Q1 -----> S1
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            Q2 -----> S2
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            S3
 *             |
 *            F|
 *             |
 *             R
 *             |
 * backtrack   V
 * <----------Q4
 *   \    F    |
 *    \        |S
 *     \   F   V
 *      \-----S4
 *
 * For greedy loops we reverse our expectation and expect to match rather
 * than fail.  Therefore we want the loop code to look like this (U is the
 * unwind code that steps back in the greedy loop).  The following alternatives
 * look the same as above.
 *              _____
 *             /     \
 *             V     |
 * ----------> S1    |
 *            /|     |
 *           / |S    |
 *         F/  \_____/
 *         /
 *        |<-----------
 *        |            \
 *        V             \
 *        Q2 ---> S2     \
 *        |  S   /       |
 *       F|     /        |
 *        |   F/         |
 *        |   /          |
 *        |  R           |
 *        | /            |
 *   F    VL             |
 * <------U              |
 * back   |S             |
 *        \______________/
 */
3918
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003919void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003920 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3921 int choice_count = alternatives_->length();
3922#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003923 for (int i = 0; i < choice_count - 1; i++) {
3924 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003925 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003926 int guard_count = (guards == NULL) ? 0 : guards->length();
3927 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003928 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003929 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003930 }
3931#endif
3932
ager@chromium.org32912102009-01-16 10:38:43 +00003933 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003934 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003935 ASSERT(limit_result == CONTINUE);
3936
ager@chromium.org381abbb2009-02-25 13:23:22 +00003937 int new_flush_budget = trace->flush_budget() / choice_count;
3938 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3939 trace->Flush(compiler, this);
3940 return;
3941 }
3942
ager@chromium.org8bb60582008-12-11 12:02:20 +00003943 RecursionCheck rc(compiler);
3944
ager@chromium.org32912102009-01-16 10:38:43 +00003945 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003946
jkummerow@chromium.org486075a2011-09-07 12:44:28 +00003947 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003948 bool greedy_loop = false;
3949 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00003950 Trace counter_backtrack_trace;
3951 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003952 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3953
ager@chromium.org8bb60582008-12-11 12:02:20 +00003954 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3955 // Here we have special handling for greedy loops containing only text nodes
3956 // and other simple nodes. These are handled by pushing the current
3957 // position on the stack and then incrementing the current position each
3958 // time around the switch. On backtrack we decrement the current position
3959 // and check it against the pushed value. This avoids pushing backtrack
3960 // information for each iteration of the loop, which could take up a lot of
3961 // space.
3962 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00003963 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003964 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00003965 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003966 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00003967 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003968 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00003969 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003970 Label loop_label;
3971 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00003972 greedy_match_trace.set_stop_node(this);
3973 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003974 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003975 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003976 }
3977
3978 Label second_choice; // For use in greedy matches.
3979 macro_assembler->Bind(&second_choice);
3980
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003981 int first_normal_choice = greedy_loop ? 1 : 0;
3982
danno@chromium.orgbee51992013-07-10 14:57:15 +00003983 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00003984 const int kEatsAtLeastNotYetInitialized = -1;
3985 int eats_at_least = kEatsAtLeastNotYetInitialized;
3986
3987 bool skip_was_emitted = false;
3988
3989 if (!greedy_loop && choice_count == 2) {
3990 GuardedAlternative alt1 = alternatives_->at(1);
3991 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
3992 RegExpNode* eats_anything_node = alt1.node();
3993 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3994 this) {
3995 // At this point we know that we are at a non-greedy loop that will eat
3996 // any character one at a time. Any non-anchored regexp has such a
3997 // loop prepended to it in order to find where it starts. We look for
3998 // a pattern of the form ...abc... where we can look 6 characters ahead
3999 // and step forwards 3 if the character is not one of abc. Abc need
4000 // not be atoms, they can be any reasonably limited character class or
4001 // small alternation.
4002 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004003 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4004 if (lookahead == NULL) {
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004005 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4006 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4007 kRecursionBudget,
4008 not_at_start));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004009 if (eats_at_least >= 1) {
4010 BoyerMooreLookahead* bm =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004011 new(zone()) BoyerMooreLookahead(eats_at_least,
4012 compiler,
4013 zone());
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004014 GuardedAlternative alt0 = alternatives_->at(0);
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004015 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004016 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4017 }
4018 } else {
4019 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00004020 }
4021 }
4022 }
4023 }
4024
4025 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4026 // Save some time by looking at most one machine word ahead.
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00004027 eats_at_least =
4028 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00004029 }
4030 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4031
4032 bool preload_is_current = !skip_was_emitted &&
ager@chromium.org32912102009-01-16 10:38:43 +00004033 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004034 bool preload_has_checked_bounds = preload_is_current;
4035
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004036 AlternativeGenerationList alt_gens(choice_count, zone());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004037
ager@chromium.org8bb60582008-12-11 12:02:20 +00004038 // For now we just call all choices one after the other. The idea ultimately
4039 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004040 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004041 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00004042 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004043 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004044 ZoneList<Guard*>* guards = alternative.guards();
4045 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00004046 Trace new_trace(*current_trace);
4047 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004048 preload_characters :
4049 0);
4050 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00004051 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004052 }
ager@chromium.org32912102009-01-16 10:38:43 +00004053 new_trace.quick_check_performed()->Clear();
danno@chromium.orgbee51992013-07-10 14:57:15 +00004054 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004055 alt_gen->expects_preload = preload_is_current;
4056 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00004057 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00004058 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004059 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00004060 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004061 preload_has_checked_bounds,
4062 &alt_gen->possible_success,
4063 &alt_gen->quick_check_details,
4064 i < choice_count - 1)) {
4065 // Quick check was generated for this choice.
4066 preload_is_current = true;
4067 preload_has_checked_bounds = true;
4068 // On the last choice in the ChoiceNode we generated the quick
4069 // check to fall through on possible success. So now we need to
4070 // generate the full check inline.
4071 if (i == choice_count - 1) {
4072 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00004073 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4074 new_trace.set_characters_preloaded(preload_characters);
4075 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004076 generate_full_check_inline = true;
4077 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00004078 } else if (alt_gen->quick_check_details.cannot_match()) {
4079 if (i == choice_count - 1 && !greedy_loop) {
4080 macro_assembler->GoTo(trace->backtrack());
4081 }
4082 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004083 } else {
4084 // No quick check was generated. Put the full code here.
4085 // If this is not the first choice then there could be slow checks from
4086 // previous cases that go here when they fail. There's no reason to
4087 // insist that they preload characters since the slow check we are about
4088 // to generate probably can't use it.
4089 if (i != first_normal_choice) {
4090 alt_gen->expects_preload = false;
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004091 new_trace.InvalidateCurrentCharacter();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004092 }
4093 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00004094 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004095 }
4096 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004097 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004098 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00004099 if (new_trace.actions() != NULL) {
4100 new_trace.set_flush_budget(new_flush_budget);
4101 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004102 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004103 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004104 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004105 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004106 preload_is_current = false;
4107 }
4108 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004109 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004110 if (greedy_loop) {
4111 macro_assembler->Bind(&greedy_loop_label);
4112 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00004113 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004114 // Otherwise try the second priority at an earlier position.
4115 macro_assembler->AdvanceCurrentPosition(-text_length);
4116 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004117 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00004118
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004119 // At this point we need to generate slow checks for the alternatives where
4120 // the quick check was inlined. We can recognize these because the associated
4121 // label was bound.
4122 for (int i = first_normal_choice; i < choice_count - 1; i++) {
4123 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00004124 Trace new_trace(*current_trace);
4125 // If there are actions to be flushed we have to limit how many times
4126 // they are flushed. Take the budget of the parent trace and distribute
4127 // it fairly amongst the children.
4128 if (new_trace.actions() != NULL) {
4129 new_trace.set_flush_budget(new_flush_budget);
4130 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004131 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00004132 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004133 alternatives_->at(i),
4134 alt_gen,
4135 preload_characters,
4136 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004137 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004138}
4139
4140
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004141void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00004142 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004143 GuardedAlternative alternative,
4144 AlternativeGeneration* alt_gen,
4145 int preload_characters,
4146 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004147 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004148
4149 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4150 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00004151 Trace out_of_line_trace(*trace);
4152 out_of_line_trace.set_characters_preloaded(preload_characters);
4153 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
danno@chromium.orgbee51992013-07-10 14:57:15 +00004154 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004155 ZoneList<Guard*>* guards = alternative.guards();
4156 int guard_count = (guards == NULL) ? 0 : guards->length();
4157 if (next_expects_preload) {
4158 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00004159 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004160 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004161 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004162 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004163 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004164 macro_assembler->Bind(&reload_current_char);
4165 // Reload the current character, since the next quick check expects that.
4166 // We don't need to check bounds here because we only get into this
4167 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00004168 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004169 NULL,
4170 false,
4171 preload_characters);
4172 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004173 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00004174 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004175 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00004176 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004177 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004178 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004179 }
4180}
4181
4182
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004183void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004184 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00004185 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004186 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004187 ASSERT(limit_result == CONTINUE);
4188
4189 RecursionCheck rc(compiler);
4190
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004191 switch (action_type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004192 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00004193 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004194 new_capture(data_.u_position_register.reg,
4195 data_.u_position_register.is_capture,
4196 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00004197 Trace new_trace = *trace;
4198 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004199 on_success()->Emit(compiler, &new_trace);
4200 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004201 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004202 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00004203 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00004204 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00004205 Trace new_trace = *trace;
4206 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004207 on_success()->Emit(compiler, &new_trace);
4208 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004209 }
4210 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00004211 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00004212 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00004213 Trace new_trace = *trace;
4214 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004215 on_success()->Emit(compiler, &new_trace);
4216 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004217 }
4218 case CLEAR_CAPTURES: {
4219 Trace::DeferredClearCaptures
4220 new_capture(Interval(data_.u_clear_captures.range_from,
4221 data_.u_clear_captures.range_to));
4222 Trace new_trace = *trace;
4223 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004224 on_success()->Emit(compiler, &new_trace);
4225 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004226 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004227 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004228 if (!trace->is_trivial()) {
4229 trace->Flush(compiler, this);
4230 } else {
4231 assembler->WriteCurrentPositionToRegister(
4232 data_.u_submatch.current_position_register, 0);
4233 assembler->WriteStackPointerToRegister(
4234 data_.u_submatch.stack_pointer_register);
4235 on_success()->Emit(compiler, trace);
4236 }
4237 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004238 case EMPTY_MATCH_CHECK: {
4239 int start_pos_reg = data_.u_empty_match_check.start_register;
4240 int stored_pos = 0;
4241 int rep_reg = data_.u_empty_match_check.repetition_register;
4242 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4243 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4244 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4245 // If we know we haven't advanced and there is no minimum we
4246 // can just backtrack immediately.
4247 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00004248 } else if (know_dist && stored_pos < trace->cp_offset()) {
4249 // If we know we've advanced we can generate the continuation
4250 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004251 on_success()->Emit(compiler, trace);
4252 } else if (!trace->is_trivial()) {
4253 trace->Flush(compiler, this);
4254 } else {
4255 Label skip_empty_check;
4256 // If we have a minimum number of repetitions we check the current
4257 // number first and skip the empty check if it's not enough.
4258 if (has_minimum) {
4259 int limit = data_.u_empty_match_check.repetition_limit;
4260 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4261 }
4262 // If the match is empty we bail out, otherwise we fall through
4263 // to the on-success continuation.
4264 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4265 trace->backtrack());
4266 assembler->Bind(&skip_empty_check);
4267 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00004268 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004269 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004270 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004271 case POSITIVE_SUBMATCH_SUCCESS: {
4272 if (!trace->is_trivial()) {
4273 trace->Flush(compiler, this);
4274 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004275 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004276 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00004277 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004278 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004279 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004280 int clear_register_count = data_.u_submatch.clear_register_count;
4281 if (clear_register_count == 0) {
4282 on_success()->Emit(compiler, trace);
4283 return;
4284 }
4285 int clear_registers_from = data_.u_submatch.clear_register_from;
4286 Label clear_registers_backtrack;
4287 Trace new_trace = *trace;
4288 new_trace.set_backtrack(&clear_registers_backtrack);
4289 on_success()->Emit(compiler, &new_trace);
4290
4291 assembler->Bind(&clear_registers_backtrack);
4292 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4293 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4294
4295 ASSERT(trace->backtrack() == NULL);
4296 assembler->Backtrack();
4297 return;
4298 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004299 default:
4300 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004301 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004302}
4303
4304
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004305void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004306 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00004307 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004308 trace->Flush(compiler, this);
4309 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004310 }
4311
ager@chromium.org32912102009-01-16 10:38:43 +00004312 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004313 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004314 ASSERT(limit_result == CONTINUE);
4315
4316 RecursionCheck rc(compiler);
4317
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004318 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004319 if (compiler->ignore_case()) {
4320 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4321 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004322 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004323 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004324 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004325 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004326}
4327
4328
4329// -------------------------------------------------------------------
4330// Dot/dotty output
4331
4332
4333#ifdef DEBUG
4334
4335
4336class DotPrinter: public NodeVisitor {
4337 public:
4338 explicit DotPrinter(bool ignore_case)
4339 : ignore_case_(ignore_case),
4340 stream_(&alloc_) { }
4341 void PrintNode(const char* label, RegExpNode* node);
4342 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004343 void PrintAttributes(RegExpNode* from);
4344 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004345 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004346#define DECLARE_VISIT(Type) \
4347 virtual void Visit##Type(Type##Node* that);
4348FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4349#undef DECLARE_VISIT
4350 private:
4351 bool ignore_case_;
4352 HeapStringAllocator alloc_;
4353 StringStream stream_;
4354};
4355
4356
4357void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4358 stream()->Add("digraph G {\n graph [label=\"");
4359 for (int i = 0; label[i]; i++) {
4360 switch (label[i]) {
4361 case '\\':
4362 stream()->Add("\\\\");
4363 break;
4364 case '"':
4365 stream()->Add("\"");
4366 break;
4367 default:
4368 stream()->Put(label[i]);
4369 break;
4370 }
4371 }
4372 stream()->Add("\"];\n");
4373 Visit(node);
4374 stream()->Add("}\n");
4375 printf("%s", *(stream()->ToCString()));
4376}
4377
4378
4379void DotPrinter::Visit(RegExpNode* node) {
4380 if (node->info()->visited) return;
4381 node->info()->visited = true;
4382 node->Accept(this);
4383}
4384
4385
4386void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004387 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
4388 Visit(on_failure);
4389}
4390
4391
4392class TableEntryBodyPrinter {
4393 public:
4394 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
4395 : stream_(stream), choice_(choice) { }
4396 void Call(uc16 from, DispatchTable::Entry entry) {
4397 OutSet* out_set = entry.out_set();
4398 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4399 if (out_set->Get(i)) {
4400 stream()->Add(" n%p:s%io%i -> n%p;\n",
4401 choice(),
4402 from,
4403 i,
4404 choice()->alternatives()->at(i).node());
4405 }
4406 }
4407 }
4408 private:
4409 StringStream* stream() { return stream_; }
4410 ChoiceNode* choice() { return choice_; }
4411 StringStream* stream_;
4412 ChoiceNode* choice_;
4413};
4414
4415
4416class TableEntryHeaderPrinter {
4417 public:
4418 explicit TableEntryHeaderPrinter(StringStream* stream)
4419 : first_(true), stream_(stream) { }
4420 void Call(uc16 from, DispatchTable::Entry entry) {
4421 if (first_) {
4422 first_ = false;
4423 } else {
4424 stream()->Add("|");
4425 }
4426 stream()->Add("{\\%k-\\%k|{", from, entry.to());
4427 OutSet* out_set = entry.out_set();
4428 int priority = 0;
4429 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4430 if (out_set->Get(i)) {
4431 if (priority > 0) stream()->Add("|");
4432 stream()->Add("<s%io%i> %i", from, i, priority);
4433 priority++;
4434 }
4435 }
4436 stream()->Add("}}");
4437 }
jkummerow@chromium.orge297f592011-06-08 10:05:15 +00004438
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004439 private:
4440 bool first_;
4441 StringStream* stream() { return stream_; }
4442 StringStream* stream_;
4443};
4444
4445
4446class AttributePrinter {
4447 public:
4448 explicit AttributePrinter(DotPrinter* out)
4449 : out_(out), first_(true) { }
4450 void PrintSeparator() {
4451 if (first_) {
4452 first_ = false;
4453 } else {
4454 out_->stream()->Add("|");
4455 }
4456 }
4457 void PrintBit(const char* name, bool value) {
4458 if (!value) return;
4459 PrintSeparator();
4460 out_->stream()->Add("{%s}", name);
4461 }
4462 void PrintPositive(const char* name, int value) {
4463 if (value < 0) return;
4464 PrintSeparator();
4465 out_->stream()->Add("{%s|%x}", name, value);
4466 }
4467 private:
4468 DotPrinter* out_;
4469 bool first_;
4470};
4471
4472
4473void DotPrinter::PrintAttributes(RegExpNode* that) {
4474 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4475 "margin=0.1, fontsize=10, label=\"{",
4476 that);
4477 AttributePrinter printer(this);
4478 NodeInfo* info = that->info();
4479 printer.PrintBit("NI", info->follows_newline_interest);
4480 printer.PrintBit("WI", info->follows_word_interest);
4481 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004482 Label* label = that->label();
4483 if (label->is_bound())
4484 printer.PrintPositive("@", label->pos());
4485 stream()->Add("}\"];\n");
4486 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
4487 "arrowhead=none];\n", that, that);
4488}
4489
4490
4491static const bool kPrintDispatchTable = false;
4492void DotPrinter::VisitChoice(ChoiceNode* that) {
4493 if (kPrintDispatchTable) {
4494 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
4495 TableEntryHeaderPrinter header_printer(stream());
4496 that->GetTable(ignore_case_)->ForEach(&header_printer);
4497 stream()->Add("\"]\n", that);
4498 PrintAttributes(that);
4499 TableEntryBodyPrinter body_printer(stream(), that);
4500 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004501 } else {
4502 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
4503 for (int i = 0; i < that->alternatives()->length(); i++) {
4504 GuardedAlternative alt = that->alternatives()->at(i);
4505 stream()->Add(" n%p -> n%p;\n", that, alt.node());
4506 }
4507 }
4508 for (int i = 0; i < that->alternatives()->length(); i++) {
4509 GuardedAlternative alt = that->alternatives()->at(i);
4510 alt.node()->Accept(this);
4511 }
4512}
4513
4514
4515void DotPrinter::VisitText(TextNode* that) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004516 Zone* zone = that->zone();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004517 stream()->Add(" n%p [label=\"", that);
4518 for (int i = 0; i < that->elements()->length(); i++) {
4519 if (i > 0) stream()->Add(" ");
4520 TextElement elm = that->elements()->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00004521 switch (elm.text_type()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004522 case TextElement::ATOM: {
rossberg@chromium.org92597162013-08-23 13:28:00 +00004523 stream()->Add("'%w'", elm.atom()->data());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004524 break;
4525 }
4526 case TextElement::CHAR_CLASS: {
rossberg@chromium.org92597162013-08-23 13:28:00 +00004527 RegExpCharacterClass* node = elm.char_class();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004528 stream()->Add("[");
4529 if (node->is_negated())
4530 stream()->Add("^");
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004531 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4532 CharacterRange range = node->ranges(zone)->at(j);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004533 stream()->Add("%k-%k", range.from(), range.to());
4534 }
4535 stream()->Add("]");
4536 break;
4537 }
4538 default:
4539 UNREACHABLE();
4540 }
4541 }
4542 stream()->Add("\", shape=box, peripheries=2];\n");
4543 PrintAttributes(that);
4544 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4545 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004546}
4547
4548
4549void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4550 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
4551 that,
4552 that->start_register(),
4553 that->end_register());
4554 PrintAttributes(that);
4555 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4556 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004557}
4558
4559
4560void DotPrinter::VisitEnd(EndNode* that) {
4561 stream()->Add(" n%p [style=bold, shape=point];\n", that);
4562 PrintAttributes(that);
4563}
4564
4565
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004566void DotPrinter::VisitAssertion(AssertionNode* that) {
4567 stream()->Add(" n%p [", that);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004568 switch (that->assertion_type()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004569 case AssertionNode::AT_END:
4570 stream()->Add("label=\"$\", shape=septagon");
4571 break;
4572 case AssertionNode::AT_START:
4573 stream()->Add("label=\"^\", shape=septagon");
4574 break;
4575 case AssertionNode::AT_BOUNDARY:
4576 stream()->Add("label=\"\\b\", shape=septagon");
4577 break;
4578 case AssertionNode::AT_NON_BOUNDARY:
4579 stream()->Add("label=\"\\B\", shape=septagon");
4580 break;
4581 case AssertionNode::AFTER_NEWLINE:
4582 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4583 break;
4584 }
4585 stream()->Add("];\n");
4586 PrintAttributes(that);
4587 RegExpNode* successor = that->on_success();
4588 stream()->Add(" n%p -> n%p;\n", that, successor);
4589 Visit(successor);
4590}
4591
4592
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004593void DotPrinter::VisitAction(ActionNode* that) {
4594 stream()->Add(" n%p [", that);
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00004595 switch (that->action_type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004596 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004597 stream()->Add("label=\"$%i:=%i\", shape=octagon",
4598 that->data_.u_store_register.reg,
4599 that->data_.u_store_register.value);
4600 break;
4601 case ActionNode::INCREMENT_REGISTER:
4602 stream()->Add("label=\"$%i++\", shape=octagon",
4603 that->data_.u_increment_register.reg);
4604 break;
4605 case ActionNode::STORE_POSITION:
4606 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
4607 that->data_.u_position_register.reg);
4608 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004609 case ActionNode::BEGIN_SUBMATCH:
4610 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
4611 that->data_.u_submatch.current_position_register);
4612 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00004613 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004614 stream()->Add("label=\"escape\", shape=septagon");
4615 break;
ager@chromium.org32912102009-01-16 10:38:43 +00004616 case ActionNode::EMPTY_MATCH_CHECK:
4617 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
4618 that->data_.u_empty_match_check.start_register,
4619 that->data_.u_empty_match_check.repetition_register,
4620 that->data_.u_empty_match_check.repetition_limit);
4621 break;
4622 case ActionNode::CLEAR_CAPTURES: {
4623 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
4624 that->data_.u_clear_captures.range_from,
4625 that->data_.u_clear_captures.range_to);
4626 break;
4627 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004628 }
4629 stream()->Add("];\n");
4630 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004631 RegExpNode* successor = that->on_success();
4632 stream()->Add(" n%p -> n%p;\n", that, successor);
4633 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004634}
4635
4636
4637class DispatchTableDumper {
4638 public:
4639 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
4640 void Call(uc16 key, DispatchTable::Entry entry);
4641 StringStream* stream() { return stream_; }
4642 private:
4643 StringStream* stream_;
4644};
4645
4646
4647void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4648 stream()->Add("[%k-%k]: {", key, entry.to());
4649 OutSet* set = entry.out_set();
4650 bool first = true;
4651 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4652 if (set->Get(i)) {
4653 if (first) {
4654 first = false;
4655 } else {
4656 stream()->Add(", ");
4657 }
4658 stream()->Add("%i", i);
4659 }
4660 }
4661 stream()->Add("}\n");
4662}
4663
4664
4665void DispatchTable::Dump() {
4666 HeapStringAllocator alloc;
4667 StringStream stream(&alloc);
4668 DispatchTableDumper dumper(&stream);
4669 tree()->ForEach(&dumper);
4670 OS::PrintError("%s", *stream.ToCString());
4671}
4672
4673
4674void RegExpEngine::DotPrint(const char* label,
4675 RegExpNode* node,
4676 bool ignore_case) {
4677 DotPrinter printer(ignore_case);
4678 printer.PrintNode(label, node);
4679}
4680
4681
4682#endif // DEBUG
4683
4684
4685// -------------------------------------------------------------------
4686// Tree to graph conversion
4687
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004688RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004689 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004690 ZoneList<TextElement>* elms =
4691 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4692 elms->Add(TextElement::Atom(this), compiler->zone());
4693 return new(compiler->zone()) TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004694}
4695
4696
4697RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004698 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004699 return new(compiler->zone()) TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004700}
4701
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004702
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004703static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004704 const int* special_class,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004705 int length) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004706 length--; // Remove final 0x10000.
4707 ASSERT(special_class[length] == 0x10000);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004708 ASSERT(ranges->length() != 0);
4709 ASSERT(length != 0);
4710 ASSERT(special_class[0] != 0);
4711 if (ranges->length() != (length >> 1) + 1) {
4712 return false;
4713 }
4714 CharacterRange range = ranges->at(0);
4715 if (range.from() != 0) {
4716 return false;
4717 }
4718 for (int i = 0; i < length; i += 2) {
4719 if (special_class[i] != (range.to() + 1)) {
4720 return false;
4721 }
4722 range = ranges->at((i >> 1) + 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004723 if (special_class[i+1] != range.from()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004724 return false;
4725 }
4726 }
4727 if (range.to() != 0xffff) {
4728 return false;
4729 }
4730 return true;
4731}
4732
4733
4734static bool CompareRanges(ZoneList<CharacterRange>* ranges,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004735 const int* special_class,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004736 int length) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004737 length--; // Remove final 0x10000.
4738 ASSERT(special_class[length] == 0x10000);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004739 if (ranges->length() * 2 != length) {
4740 return false;
4741 }
4742 for (int i = 0; i < length; i += 2) {
4743 CharacterRange range = ranges->at(i >> 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00004744 if (range.from() != special_class[i] ||
4745 range.to() != special_class[i + 1] - 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004746 return false;
4747 }
4748 }
4749 return true;
4750}
4751
4752
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004753bool RegExpCharacterClass::is_standard(Zone* zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004754 // TODO(lrn): Remove need for this function, by not throwing away information
4755 // along the way.
4756 if (is_negated_) {
4757 return false;
4758 }
4759 if (set_.is_standard()) {
4760 return true;
4761 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004762 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004763 set_.set_standard_set_type('s');
4764 return true;
4765 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004766 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004767 set_.set_standard_set_type('S');
4768 return true;
4769 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004770 if (CompareInverseRanges(set_.ranges(zone),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004771 kLineTerminatorRanges,
4772 kLineTerminatorRangeCount)) {
4773 set_.set_standard_set_type('.');
4774 return true;
4775 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004776 if (CompareRanges(set_.ranges(zone),
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004777 kLineTerminatorRanges,
4778 kLineTerminatorRangeCount)) {
4779 set_.set_standard_set_type('n');
4780 return true;
4781 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004782 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004783 set_.set_standard_set_type('w');
4784 return true;
4785 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004786 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00004787 set_.set_standard_set_type('W');
4788 return true;
4789 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004790 return false;
4791}
4792
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004793
4794RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004795 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004796 return new(compiler->zone()) TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004797}
4798
4799
4800RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004801 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004802 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4803 int length = alternatives->length();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004804 ChoiceNode* result =
4805 new(compiler->zone()) ChoiceNode(length, compiler->zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004806 for (int i = 0; i < length; i++) {
4807 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004808 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004809 result->AddAlternative(alternative);
4810 }
4811 return result;
4812}
4813
4814
4815RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004816 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004817 return ToNode(min(),
4818 max(),
4819 is_greedy(),
4820 body(),
4821 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004822 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004823}
4824
4825
whesse@chromium.org7b260152011-06-20 15:33:18 +00004826// Scoped object to keep track of how much we unroll quantifier loops in the
4827// regexp graph generator.
4828class RegExpExpansionLimiter {
4829 public:
4830 static const int kMaxExpansionFactor = 6;
4831 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4832 : compiler_(compiler),
4833 saved_expansion_factor_(compiler->current_expansion_factor()),
4834 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4835 ASSERT(factor > 0);
4836 if (ok_to_expand_) {
4837 if (factor > kMaxExpansionFactor) {
4838 // Avoid integer overflow of the current expansion factor.
4839 ok_to_expand_ = false;
4840 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4841 } else {
4842 int new_factor = saved_expansion_factor_ * factor;
4843 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4844 compiler->set_current_expansion_factor(new_factor);
4845 }
4846 }
4847 }
4848
4849 ~RegExpExpansionLimiter() {
4850 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4851 }
4852
4853 bool ok_to_expand() { return ok_to_expand_; }
4854
4855 private:
4856 RegExpCompiler* compiler_;
4857 int saved_expansion_factor_;
4858 bool ok_to_expand_;
4859
4860 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4861};
4862
4863
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004864RegExpNode* RegExpQuantifier::ToNode(int min,
4865 int max,
4866 bool is_greedy,
4867 RegExpTree* body,
4868 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00004869 RegExpNode* on_success,
4870 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004871 // x{f, t} becomes this:
4872 //
4873 // (r++)<-.
4874 // | `
4875 // | (x)
4876 // v ^
4877 // (r=0)-->(?)---/ [if r < t]
4878 // |
4879 // [if r >= f] \----> ...
4880 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004881
4882 // 15.10.2.5 RepeatMatcher algorithm.
4883 // The parser has already eliminated the case where max is 0. In the case
4884 // where max_match is zero the parser has removed the quantifier if min was
4885 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4886
4887 // If we know that we cannot match zero length then things are a little
4888 // simpler since we don't need to make the special zero length match check
4889 // from step 2.1. If the min and max are small we can unroll a little in
4890 // this case.
4891 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4892 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4893 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00004894 bool body_can_be_empty = (body->min_match() == 0);
4895 int body_start_reg = RegExpCompiler::kNoRegister;
4896 Interval capture_registers = body->CaptureRegisters();
4897 bool needs_capture_clearing = !capture_registers.is_empty();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004898 Zone* zone = compiler->zone();
4899
ager@chromium.org32912102009-01-16 10:38:43 +00004900 if (body_can_be_empty) {
4901 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00004902 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00004903 // Only unroll if there are no captures and the body can't be
4904 // empty.
whesse@chromium.org7b260152011-06-20 15:33:18 +00004905 {
4906 RegExpExpansionLimiter limiter(
4907 compiler, min + ((max != min) ? 1 : 0));
4908 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4909 int new_max = (max == kInfinity) ? max : max - min;
4910 // Recurse once to get the loop or optional matches after the fixed
4911 // ones.
4912 RegExpNode* answer = ToNode(
4913 0, new_max, is_greedy, body, compiler, on_success, true);
4914 // Unroll the forced matches from 0 to min. This can cause chains of
4915 // TextNodes (which the parser does not generate). These should be
4916 // combined if it turns out they hinder good code generation.
4917 for (int i = 0; i < min; i++) {
4918 answer = body->ToNode(compiler, answer);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004919 }
whesse@chromium.org7b260152011-06-20 15:33:18 +00004920 return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004921 }
whesse@chromium.org7b260152011-06-20 15:33:18 +00004922 }
4923 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4924 ASSERT(max > 0); // Due to the 'if' above.
4925 RegExpExpansionLimiter limiter(compiler, max);
4926 if (limiter.ok_to_expand()) {
4927 // Unroll the optional matches up to max.
4928 RegExpNode* answer = on_success;
4929 for (int i = 0; i < max; i++) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004930 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
whesse@chromium.org7b260152011-06-20 15:33:18 +00004931 if (is_greedy) {
4932 alternation->AddAlternative(
4933 GuardedAlternative(body->ToNode(compiler, answer)));
4934 alternation->AddAlternative(GuardedAlternative(on_success));
4935 } else {
4936 alternation->AddAlternative(GuardedAlternative(on_success));
4937 alternation->AddAlternative(
4938 GuardedAlternative(body->ToNode(compiler, answer)));
4939 }
4940 answer = alternation;
4941 if (not_at_start) alternation->set_not_at_start();
4942 }
4943 return answer;
4944 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004945 }
4946 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004947 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004948 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004949 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00004950 int reg_ctr = needs_counter
4951 ? compiler->AllocateRegister()
4952 : RegExpCompiler::kNoRegister;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004953 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4954 zone);
iposva@chromium.org245aa852009-02-10 00:49:54 +00004955 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004956 RegExpNode* loop_return = needs_counter
4957 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4958 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00004959 if (body_can_be_empty) {
4960 // If the body can be empty we need to check if it was and then
4961 // backtrack.
4962 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4963 reg_ctr,
4964 min,
4965 loop_return);
4966 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004967 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00004968 if (body_can_be_empty) {
4969 // If the body can be empty we need to store the start position
4970 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004971 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00004972 }
4973 if (needs_capture_clearing) {
4974 // Before entering the body of this loop we need to clear captures.
4975 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4976 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004977 GuardedAlternative body_alt(body_node);
4978 if (has_max) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004979 Guard* body_guard =
4980 new(zone) Guard(reg_ctr, Guard::LT, max);
4981 body_alt.AddGuard(body_guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004982 }
4983 GuardedAlternative rest_alt(on_success);
4984 if (has_min) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00004985 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
4986 rest_alt.AddGuard(rest_guard, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004987 }
4988 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004989 center->AddLoopAlternative(body_alt);
4990 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004991 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004992 center->AddContinueAlternative(rest_alt);
4993 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004994 }
4995 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004996 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004997 } else {
4998 return center;
4999 }
5000}
5001
5002
5003RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005004 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005005 NodeInfo info;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005006 Zone* zone = compiler->zone();
5007
ulan@chromium.orgdfe53072013-06-06 14:14:51 +00005008 switch (assertion_type()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005009 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005010 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005011 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005012 return AssertionNode::AtStart(on_success);
5013 case BOUNDARY:
5014 return AssertionNode::AtBoundary(on_success);
5015 case NON_BOUNDARY:
5016 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005017 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005018 return AssertionNode::AtEnd(on_success);
5019 case END_OF_LINE: {
5020 // Compile $ in multiline regexps as an alternation with a positive
5021 // lookahead in one side and an end-of-input on the other side.
5022 // We need two registers for the lookahead.
5023 int stack_pointer_register = compiler->AllocateRegister();
5024 int position_register = compiler->AllocateRegister();
5025 // The ChoiceNode to distinguish between a newline and end-of-input.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005026 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005027 // Create a newline atom.
5028 ZoneList<CharacterRange>* newline_ranges =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005029 new(zone) ZoneList<CharacterRange>(3, zone);
5030 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5031 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5032 TextNode* newline_matcher = new(zone) TextNode(
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005033 newline_atom,
5034 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5035 position_register,
5036 0, // No captures inside.
5037 -1, // Ignored if no captures.
5038 on_success));
5039 // Create an end-of-input matcher.
5040 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5041 stack_pointer_register,
5042 position_register,
5043 newline_matcher);
5044 // Add the two alternatives to the ChoiceNode.
5045 GuardedAlternative eol_alternative(end_of_line);
5046 result->AddAlternative(eol_alternative);
5047 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5048 result->AddAlternative(end_alternative);
5049 return result;
5050 }
5051 default:
5052 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005053 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005054 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005055}
5056
5057
5058RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005059 RegExpNode* on_success) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005060 return new(compiler->zone())
5061 BackReferenceNode(RegExpCapture::StartRegister(index()),
5062 RegExpCapture::EndRegister(index()),
5063 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005064}
5065
5066
5067RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005068 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005069 return on_success;
5070}
5071
5072
5073RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005074 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005075 int stack_pointer_register = compiler->AllocateRegister();
5076 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005077
5078 const int registers_per_capture = 2;
5079 const int register_of_first_capture = 2;
5080 int register_count = capture_count_ * registers_per_capture;
5081 int register_start =
5082 register_of_first_capture + capture_from_ * registers_per_capture;
5083
ager@chromium.org8bb60582008-12-11 12:02:20 +00005084 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005085 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005086 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005087 stack_pointer_register,
5088 position_register,
5089 body()->ToNode(
5090 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005091 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5092 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005093 register_count,
5094 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005095 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005096 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005097 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005098 // We use a ChoiceNode for a negative lookahead because it has most of
5099 // the characteristics we need. It has the body of the lookahead as its
5100 // first alternative and the expression after the lookahead of the second
5101 // alternative. If the first alternative succeeds then the
5102 // NegativeSubmatchSuccess will unwind the stack including everything the
5103 // choice node set up and backtrack. If the first alternative fails then
5104 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005105 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5106 // ChoiceNode that knows to ignore the first exit when calculating quick
5107 // checks.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005108 Zone* zone = compiler->zone();
5109
ager@chromium.org8bb60582008-12-11 12:02:20 +00005110 GuardedAlternative body_alt(
5111 body()->ToNode(
5112 compiler,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005113 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5114 position_register,
5115 register_count,
5116 register_start,
5117 zone)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005118 ChoiceNode* choice_node =
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005119 new(zone) NegativeLookaheadChoiceNode(body_alt,
5120 GuardedAlternative(on_success),
5121 zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005122 return ActionNode::BeginSubmatch(stack_pointer_register,
5123 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005124 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005125 }
5126}
5127
5128
5129RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005130 RegExpNode* on_success) {
5131 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005132}
5133
5134
5135RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5136 int index,
5137 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005138 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005139 int start_reg = RegExpCapture::StartRegister(index);
5140 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005141 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00005142 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005143 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005144}
5145
5146
5147RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00005148 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005149 ZoneList<RegExpTree*>* children = nodes();
5150 RegExpNode* current = on_success;
5151 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005152 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005153 }
5154 return current;
5155}
5156
5157
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005158static void AddClass(const int* elmv,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005159 int elmc,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005160 ZoneList<CharacterRange>* ranges,
5161 Zone* zone) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005162 elmc--;
5163 ASSERT(elmv[elmc] == 0x10000);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005164 for (int i = 0; i < elmc; i += 2) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005165 ASSERT(elmv[i] < elmv[i + 1]);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005166 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005167 }
5168}
5169
5170
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005171static void AddClassNegated(const int *elmv,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005172 int elmc,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005173 ZoneList<CharacterRange>* ranges,
5174 Zone* zone) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005175 elmc--;
5176 ASSERT(elmv[elmc] == 0x10000);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005177 ASSERT(elmv[0] != 0x0000);
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005178 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005179 uc16 last = 0x0000;
5180 for (int i = 0; i < elmc; i += 2) {
5181 ASSERT(last <= elmv[i] - 1);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005182 ASSERT(elmv[i] < elmv[i + 1]);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005183 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005184 last = elmv[i + 1];
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005185 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005186 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005187}
5188
5189
5190void CharacterRange::AddClassEscape(uc16 type,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005191 ZoneList<CharacterRange>* ranges,
5192 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005193 switch (type) {
5194 case 's':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005195 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005196 break;
5197 case 'S':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005198 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005199 break;
5200 case 'w':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005201 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005202 break;
5203 case 'W':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005204 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005205 break;
5206 case 'd':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005207 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005208 break;
5209 case 'D':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005210 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005211 break;
5212 case '.':
5213 AddClassNegated(kLineTerminatorRanges,
5214 kLineTerminatorRangeCount,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005215 ranges,
5216 zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005217 break;
5218 // This is not a character range as defined by the spec but a
5219 // convenient shorthand for a character class that matches any
5220 // character.
5221 case '*':
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005222 ranges->Add(CharacterRange::Everything(), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005223 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005224 // This is the set of characters matched by the $ and ^ symbols
5225 // in multiline mode.
5226 case 'n':
5227 AddClass(kLineTerminatorRanges,
5228 kLineTerminatorRangeCount,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005229 ranges,
5230 zone);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005231 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005232 default:
5233 UNREACHABLE();
5234 }
5235}
5236
5237
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005238Vector<const int> CharacterRange::GetWordBounds() {
5239 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005240}
5241
5242
5243class CharacterRangeSplitter {
5244 public:
5245 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005246 ZoneList<CharacterRange>** excluded,
5247 Zone* zone)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005248 : included_(included),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005249 excluded_(excluded),
5250 zone_(zone) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005251 void Call(uc16 from, DispatchTable::Entry entry);
5252
5253 static const int kInBase = 0;
5254 static const int kInOverlay = 1;
5255
5256 private:
5257 ZoneList<CharacterRange>** included_;
5258 ZoneList<CharacterRange>** excluded_;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005259 Zone* zone_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005260};
5261
5262
5263void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5264 if (!entry.out_set()->Get(kInBase)) return;
5265 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5266 ? included_
5267 : excluded_;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005268 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5269 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005270}
5271
5272
5273void CharacterRange::Split(ZoneList<CharacterRange>* base,
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005274 Vector<const int> overlay,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005275 ZoneList<CharacterRange>** included,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005276 ZoneList<CharacterRange>** excluded,
5277 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005278 ASSERT_EQ(NULL, *included);
5279 ASSERT_EQ(NULL, *excluded);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005280 DispatchTable table(zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005281 for (int i = 0; i < base->length(); i++)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005282 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005283 for (int i = 0; i < overlay.length(); i += 2) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005284 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005285 CharacterRangeSplitter::kInOverlay, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005286 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005287 CharacterRangeSplitter callback(included, excluded, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005288 table.ForEach(&callback);
5289}
5290
5291
ager@chromium.org38e4c712009-11-11 09:11:58 +00005292void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005293 bool is_ascii,
5294 Zone* zone) {
jkummerow@chromium.org3d00d0a2013-09-04 13:57:32 +00005295 Isolate* isolate = zone->isolate();
ager@chromium.org38e4c712009-11-11 09:11:58 +00005296 uc16 bottom = from();
5297 uc16 top = to();
mvstanton@chromium.org6bec0092013-01-23 13:46:53 +00005298 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00005299 if (bottom > String::kMaxOneByteCharCode) return;
5300 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
ager@chromium.org38e4c712009-11-11 09:11:58 +00005301 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005302 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005303 if (top == bottom) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005304 // If this is a singleton we just expand the one character.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005305 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005306 for (int i = 0; i < length; i++) {
5307 uc32 chr = chars[i];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005308 if (chr != bottom) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005309 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005310 }
5311 }
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005312 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005313 // If this is a range we expand the characters block by block,
5314 // expanding contiguous subranges (blocks) one at a time.
5315 // The approach is as follows. For a given start character we
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005316 // look up the remainder of the block that contains it (represented
5317 // by the end point), for instance we find 'z' if the character
5318 // is 'c'. A block is characterized by the property
5319 // that all characters uncanonicalize in the same way, except that
5320 // each entry in the result is incremented by the distance from the first
5321 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5322 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5323 // Once we've found the end point we look up its uncanonicalization
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005324 // and produce a range for each element. For instance for [c-f]
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005325 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005326 // add a range if it is not already contained in the input, so [c-f]
5327 // will be skipped but [C-F] will be added. If this range is not
5328 // completely contained in a block we do this for all the blocks
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005329 // covered by the range (handling characters that is not in a block
5330 // as a "singleton block").
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005331 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org38e4c712009-11-11 09:11:58 +00005332 int pos = bottom;
jkummerow@chromium.org1456e702012-03-30 08:38:13 +00005333 while (pos <= top) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005334 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005335 uc16 block_end;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005336 if (length == 0) {
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005337 block_end = pos;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005338 } else {
5339 ASSERT_EQ(1, length);
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005340 block_end = range[0];
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005341 }
ager@chromium.org38e4c712009-11-11 09:11:58 +00005342 int end = (block_end > top) ? top : block_end;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +00005343 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005344 for (int i = 0; i < length; i++) {
5345 uc32 c = range[i];
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005346 uc16 range_from = c - (block_end - pos);
5347 uc16 range_to = c - (block_end - end);
ager@chromium.org38e4c712009-11-11 09:11:58 +00005348 if (!(bottom <= range_from && range_to <= top)) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005349 ranges->Add(CharacterRange(range_from, range_to), zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005350 }
5351 }
whesse@chromium.orge90029b2010-08-02 11:52:17 +00005352 pos = end + 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005353 }
ager@chromium.org38e4c712009-11-11 09:11:58 +00005354 }
5355}
5356
5357
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005358bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5359 ASSERT_NOT_NULL(ranges);
5360 int n = ranges->length();
5361 if (n <= 1) return true;
5362 int max = ranges->at(0).to();
5363 for (int i = 1; i < n; i++) {
5364 CharacterRange next_range = ranges->at(i);
5365 if (next_range.from() <= max + 1) return false;
5366 max = next_range.to();
5367 }
5368 return true;
5369}
5370
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005371
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005372ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005373 if (ranges_ == NULL) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005374 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5375 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005376 }
5377 return ranges_;
5378}
5379
5380
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005381// Move a number of elements in a zonelist to another position
5382// in the same list. Handles overlapping source and target areas.
5383static void MoveRanges(ZoneList<CharacterRange>* list,
5384 int from,
5385 int to,
5386 int count) {
5387 // Ranges are potentially overlapping.
5388 if (from < to) {
5389 for (int i = count - 1; i >= 0; i--) {
5390 list->at(to + i) = list->at(from + i);
5391 }
5392 } else {
5393 for (int i = 0; i < count; i++) {
5394 list->at(to + i) = list->at(from + i);
5395 }
5396 }
5397}
5398
5399
5400static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5401 int count,
5402 CharacterRange insert) {
5403 // Inserts a range into list[0..count[, which must be sorted
5404 // by from value and non-overlapping and non-adjacent, using at most
5405 // list[0..count] for the result. Returns the number of resulting
5406 // canonicalized ranges. Inserting a range may collapse existing ranges into
5407 // fewer ranges, so the return value can be anything in the range 1..count+1.
5408 uc16 from = insert.from();
5409 uc16 to = insert.to();
5410 int start_pos = 0;
5411 int end_pos = count;
5412 for (int i = count - 1; i >= 0; i--) {
5413 CharacterRange current = list->at(i);
5414 if (current.from() > to + 1) {
5415 end_pos = i;
5416 } else if (current.to() + 1 < from) {
5417 start_pos = i + 1;
5418 break;
5419 }
5420 }
5421
5422 // Inserted range overlaps, or is adjacent to, ranges at positions
5423 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5424 // not affected by the insertion.
5425 // If start_pos == end_pos, the range must be inserted before start_pos.
5426 // if start_pos < end_pos, the entire range from start_pos to end_pos
5427 // must be merged with the insert range.
5428
5429 if (start_pos == end_pos) {
5430 // Insert between existing ranges at position start_pos.
5431 if (start_pos < count) {
5432 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5433 }
5434 list->at(start_pos) = insert;
5435 return count + 1;
5436 }
5437 if (start_pos + 1 == end_pos) {
5438 // Replace single existing range at position start_pos.
5439 CharacterRange to_replace = list->at(start_pos);
5440 int new_from = Min(to_replace.from(), from);
5441 int new_to = Max(to_replace.to(), to);
5442 list->at(start_pos) = CharacterRange(new_from, new_to);
5443 return count;
5444 }
5445 // Replace a number of existing ranges from start_pos to end_pos - 1.
5446 // Move the remaining ranges down.
5447
5448 int new_from = Min(list->at(start_pos).from(), from);
5449 int new_to = Max(list->at(end_pos - 1).to(), to);
5450 if (end_pos < count) {
5451 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5452 }
5453 list->at(start_pos) = CharacterRange(new_from, new_to);
5454 return count - (end_pos - start_pos) + 1;
5455}
5456
5457
5458void CharacterSet::Canonicalize() {
5459 // Special/default classes are always considered canonical. The result
5460 // of calling ranges() will be sorted.
5461 if (ranges_ == NULL) return;
5462 CharacterRange::Canonicalize(ranges_);
5463}
5464
5465
5466void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5467 if (character_ranges->length() <= 1) return;
5468 // Check whether ranges are already canonical (increasing, non-overlapping,
5469 // non-adjacent).
5470 int n = character_ranges->length();
5471 int max = character_ranges->at(0).to();
5472 int i = 1;
5473 while (i < n) {
5474 CharacterRange current = character_ranges->at(i);
5475 if (current.from() <= max + 1) {
5476 break;
5477 }
5478 max = current.to();
5479 i++;
5480 }
5481 // Canonical until the i'th range. If that's all of them, we are done.
5482 if (i == n) return;
5483
5484 // The ranges at index i and forward are not canonicalized. Make them so by
5485 // doing the equivalent of insertion sort (inserting each into the previous
5486 // list, in order).
5487 // Notice that inserting a range can reduce the number of ranges in the
5488 // result due to combining of adjacent and overlapping ranges.
5489 int read = i; // Range to insert.
5490 int num_canonical = i; // Length of canonicalized part of list.
5491 do {
5492 num_canonical = InsertRangeInCanonicalList(character_ranges,
5493 num_canonical,
5494 character_ranges->at(read));
5495 read++;
5496 } while (read < n);
5497 character_ranges->Rewind(num_canonical);
5498
5499 ASSERT(CharacterRange::IsCanonical(character_ranges));
5500}
5501
5502
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005503void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005504 ZoneList<CharacterRange>* negated_ranges,
5505 Zone* zone) {
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005506 ASSERT(CharacterRange::IsCanonical(ranges));
5507 ASSERT_EQ(0, negated_ranges->length());
5508 int range_count = ranges->length();
5509 uc16 from = 0;
5510 int i = 0;
5511 if (range_count > 0 && ranges->at(0).from() == 0) {
5512 from = ranges->at(0).to();
5513 i = 1;
5514 }
5515 while (i < range_count) {
5516 CharacterRange range = ranges->at(i);
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005517 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005518 from = range.to();
5519 i++;
5520 }
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005521 if (from < String::kMaxUtf16CodeUnit) {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005522 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5523 zone);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005524 }
5525}
5526
5527
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005528// -------------------------------------------------------------------
5529// Splay tree
5530
5531
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005532OutSet* OutSet::Extend(unsigned value, Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005533 if (Get(value))
5534 return this;
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005535 if (successors(zone) != NULL) {
5536 for (int i = 0; i < successors(zone)->length(); i++) {
5537 OutSet* successor = successors(zone)->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005538 if (successor->Get(value))
5539 return successor;
5540 }
5541 } else {
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005542 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005543 }
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005544 OutSet* result = new(zone) OutSet(first_, remaining_);
5545 result->Set(value, zone);
5546 successors(zone)->Add(result, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005547 return result;
5548}
5549
5550
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005551void OutSet::Set(unsigned value, Zone *zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005552 if (value < kFirstLimit) {
5553 first_ |= (1 << value);
5554 } else {
5555 if (remaining_ == NULL)
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005556 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005557 if (remaining_->is_empty() || !remaining_->Contains(value))
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005558 remaining_->Add(value, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005559 }
5560}
5561
5562
5563bool OutSet::Get(unsigned value) {
5564 if (value < kFirstLimit) {
5565 return (first_ & (1 << value)) != 0;
5566 } else if (remaining_ == NULL) {
5567 return false;
5568 } else {
5569 return remaining_->Contains(value);
5570 }
5571}
5572
5573
5574const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005575
5576
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005577void DispatchTable::AddRange(CharacterRange full_range, int value,
5578 Zone* zone) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005579 CharacterRange current = full_range;
5580 if (tree()->is_empty()) {
5581 // If this is the first range we just insert into the table.
5582 ZoneSplayTree<Config>::Locator loc;
5583 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005584 loc.set_value(Entry(current.from(), current.to(),
5585 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005586 return;
5587 }
5588 // First see if there is a range to the left of this one that
5589 // overlaps.
5590 ZoneSplayTree<Config>::Locator loc;
5591 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5592 Entry* entry = &loc.value();
5593 // If we've found a range that overlaps with this one, and it
5594 // starts strictly to the left of this one, we have to fix it
5595 // because the following code only handles ranges that start on
5596 // or after the start point of the range we're adding.
5597 if (entry->from() < current.from() && entry->to() >= current.from()) {
5598 // Snap the overlapping range in half around the start point of
5599 // the range we're adding.
5600 CharacterRange left(entry->from(), current.from() - 1);
5601 CharacterRange right(current.from(), entry->to());
5602 // The left part of the overlapping range doesn't overlap.
5603 // Truncate the whole entry to be just the left part.
5604 entry->set_to(left.to());
5605 // The right part is the one that overlaps. We add this part
5606 // to the map and let the next step deal with merging it with
5607 // the range we're adding.
5608 ZoneSplayTree<Config>::Locator loc;
5609 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
5610 loc.set_value(Entry(right.from(),
5611 right.to(),
5612 entry->out_set()));
5613 }
5614 }
5615 while (current.is_valid()) {
5616 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5617 (loc.value().from() <= current.to()) &&
5618 (loc.value().to() >= current.from())) {
5619 Entry* entry = &loc.value();
5620 // We have overlap. If there is space between the start point of
5621 // the range we're adding and where the overlapping range starts
5622 // then we have to add a range covering just that space.
5623 if (current.from() < entry->from()) {
5624 ZoneSplayTree<Config>::Locator ins;
5625 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5626 ins.set_value(Entry(current.from(),
5627 entry->from() - 1,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005628 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005629 current.set_from(entry->from());
5630 }
5631 ASSERT_EQ(current.from(), entry->from());
5632 // If the overlapping range extends beyond the one we want to add
5633 // we have to snap the right part off and add it separately.
5634 if (entry->to() > current.to()) {
5635 ZoneSplayTree<Config>::Locator ins;
5636 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
5637 ins.set_value(Entry(current.to() + 1,
5638 entry->to(),
5639 entry->out_set()));
5640 entry->set_to(current.to());
5641 }
5642 ASSERT(entry->to() <= current.to());
5643 // The overlapping range is now completely contained by the range
5644 // we're adding so we can just update it and move the start point
5645 // of the range we're adding just past it.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005646 entry->AddValue(value, zone);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005647 // Bail out if the last interval ended at 0xFFFF since otherwise
5648 // adding 1 will wrap around to 0.
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005649 if (entry->to() == String::kMaxUtf16CodeUnit)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005650 break;
5651 ASSERT(entry->to() + 1 > current.from());
5652 current.set_from(entry->to() + 1);
5653 } else {
5654 // There is no overlap so we can just add the range
5655 ZoneSplayTree<Config>::Locator ins;
5656 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5657 ins.set_value(Entry(current.from(),
5658 current.to(),
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005659 empty()->Extend(value, zone)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005660 break;
5661 }
5662 }
5663}
5664
5665
5666OutSet* DispatchTable::Get(uc16 value) {
5667 ZoneSplayTree<Config>::Locator loc;
5668 if (!tree()->FindGreatestLessThan(value, &loc))
5669 return empty();
5670 Entry* entry = &loc.value();
5671 if (value <= entry->to())
5672 return entry->out_set();
5673 else
5674 return empty();
5675}
5676
5677
5678// -------------------------------------------------------------------
5679// Analysis
5680
5681
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005682void Analysis::EnsureAnalyzed(RegExpNode* that) {
jkummerow@chromium.org3d00d0a2013-09-04 13:57:32 +00005683 StackLimitCheck check(that->zone()->isolate());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005684 if (check.HasOverflowed()) {
5685 fail("Stack overflow");
5686 return;
5687 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005688 if (that->info()->been_analyzed || that->info()->being_analyzed)
5689 return;
5690 that->info()->being_analyzed = true;
5691 that->Accept(this);
5692 that->info()->being_analyzed = false;
5693 that->info()->been_analyzed = true;
5694}
5695
5696
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005697void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005698 // nothing to do
5699}
5700
5701
ager@chromium.org8bb60582008-12-11 12:02:20 +00005702void TextNode::CalculateOffsets() {
5703 int element_count = elements()->length();
5704 // Set up the offsets of the elements relative to the start. This is a fixed
5705 // quantity since a TextNode can only contain fixed-width things.
5706 int cp_offset = 0;
5707 for (int i = 0; i < element_count; i++) {
5708 TextElement& elm = elements()->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00005709 elm.set_cp_offset(cp_offset);
5710 cp_offset += elm.length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00005711 }
5712}
5713
5714
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005715void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005716 if (ignore_case_) {
ager@chromium.org38e4c712009-11-11 09:11:58 +00005717 that->MakeCaseIndependent(is_ascii_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005718 }
5719 EnsureAnalyzed(that->on_success());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005720 if (!has_failed()) {
5721 that->CalculateOffsets();
5722 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005723}
5724
5725
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005726void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005727 RegExpNode* target = that->on_success();
5728 EnsureAnalyzed(target);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005729 if (!has_failed()) {
5730 // If the next node is interested in what it follows then this node
5731 // has to be interested too so it can pass the information on.
5732 that->info()->AddFromFollowing(target->info());
5733 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005734}
5735
5736
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005737void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005738 NodeInfo* info = that->info();
5739 for (int i = 0; i < that->alternatives()->length(); i++) {
5740 RegExpNode* node = that->alternatives()->at(i).node();
5741 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005742 if (has_failed()) return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005743 // Anything the following nodes need to know has to be known by
5744 // this node also, so it can pass it on.
5745 info->AddFromFollowing(node->info());
5746 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005747}
5748
5749
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005750void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5751 NodeInfo* info = that->info();
5752 for (int i = 0; i < that->alternatives()->length(); i++) {
5753 RegExpNode* node = that->alternatives()->at(i).node();
5754 if (node != that->loop_node()) {
5755 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005756 if (has_failed()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005757 info->AddFromFollowing(node->info());
5758 }
5759 }
5760 // Check the loop last since it may need the value of this node
5761 // to get a correct result.
5762 EnsureAnalyzed(that->loop_node());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00005763 if (!has_failed()) {
5764 info->AddFromFollowing(that->loop_node()->info());
5765 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00005766}
5767
5768
5769void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005770 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005771}
5772
5773
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005774void Analysis::VisitAssertion(AssertionNode* that) {
5775 EnsureAnalyzed(that->on_success());
5776}
5777
5778
verwaest@chromium.org37141392012-05-31 13:27:02 +00005779void BackReferenceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005780 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005781 BoyerMooreLookahead* bm,
5782 bool not_at_start) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005783 // Working out the set of characters that a backreference can match is too
5784 // hard, so we just say that any character can match.
5785 bm->SetRest(offset);
5786 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005787}
5788
5789
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005790STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5791 RegExpMacroAssembler::kTableSize);
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00005792
5793
verwaest@chromium.org37141392012-05-31 13:27:02 +00005794void ChoiceNode::FillInBMInfo(int offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005795 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005796 BoyerMooreLookahead* bm,
5797 bool not_at_start) {
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005798 ZoneList<GuardedAlternative>* alts = alternatives();
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005799 budget = (budget - 1) / alts->length();
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005800 for (int i = 0; i < alts->length(); i++) {
5801 GuardedAlternative& alt = alts->at(i);
5802 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5803 bm->SetRest(offset); // Give up trying to fill in info.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005804 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005805 return;
5806 }
yangguo@chromium.org4a9f6552013-03-04 14:46:33 +00005807 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005808 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005809 SaveBMInfo(bm, not_at_start, offset);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005810}
5811
5812
verwaest@chromium.org37141392012-05-31 13:27:02 +00005813void TextNode::FillInBMInfo(int initial_offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005814 int budget,
verwaest@chromium.org37141392012-05-31 13:27:02 +00005815 BoyerMooreLookahead* bm,
5816 bool not_at_start) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005817 if (initial_offset >= bm->length()) return;
5818 int offset = initial_offset;
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005819 int max_char = bm->max_char();
5820 for (int i = 0; i < elements()->length(); i++) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005821 if (offset >= bm->length()) {
5822 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5823 return;
5824 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005825 TextElement text = elements()->at(i);
rossberg@chromium.org92597162013-08-23 13:28:00 +00005826 if (text.text_type() == TextElement::ATOM) {
5827 RegExpAtom* atom = text.atom();
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005828 for (int j = 0; j < atom->length(); j++, offset++) {
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005829 if (offset >= bm->length()) {
5830 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5831 return;
5832 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005833 uc16 character = atom->data()[j];
5834 if (bm->compiler()->ignore_case()) {
5835 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5836 int length = GetCaseIndependentLetters(
hpayer@chromium.orgc5d49712013-09-11 08:25:48 +00005837 Isolate::Current(),
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005838 character,
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00005839 bm->max_char() == String::kMaxOneByteCharCode,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005840 chars);
5841 for (int j = 0; j < length; j++) {
5842 bm->Set(offset, chars[j]);
5843 }
5844 } else {
5845 if (character <= max_char) bm->Set(offset, character);
5846 }
5847 }
5848 } else {
rossberg@chromium.org92597162013-08-23 13:28:00 +00005849 ASSERT_EQ(TextElement::CHAR_CLASS, text.text_type());
5850 RegExpCharacterClass* char_class = text.char_class();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005851 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005852 if (char_class->is_negated()) {
5853 bm->SetAll(offset);
5854 } else {
5855 for (int k = 0; k < ranges->length(); k++) {
5856 CharacterRange& range = ranges->at(k);
5857 if (range.from() > max_char) continue;
5858 int to = Min(max_char, static_cast<int>(range.to()));
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005859 bm->SetInterval(offset, Interval(range.from(), to));
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005860 }
5861 }
5862 offset++;
5863 }
5864 }
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005865 if (offset >= bm->length()) {
5866 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5867 return;
5868 }
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005869 on_success()->FillInBMInfo(offset,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005870 budget - 1,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005871 bm,
5872 true); // Not at start after a text node.
erik.corry@gmail.comed49e962012-04-17 11:57:53 +00005873 if (initial_offset == 0) set_bm_info(not_at_start, bm);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005874}
5875
5876
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005877// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005878// Dispatch table construction
5879
5880
5881void DispatchTableConstructor::VisitEnd(EndNode* that) {
5882 AddRange(CharacterRange::Everything());
5883}
5884
5885
5886void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5887 node->set_being_calculated(true);
5888 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5889 for (int i = 0; i < alternatives->length(); i++) {
5890 set_choice_index(i);
5891 alternatives->at(i).node()->Accept(this);
5892 }
5893 node->set_being_calculated(false);
5894}
5895
5896
5897class AddDispatchRange {
5898 public:
5899 explicit AddDispatchRange(DispatchTableConstructor* constructor)
5900 : constructor_(constructor) { }
5901 void Call(uc32 from, DispatchTable::Entry entry);
5902 private:
5903 DispatchTableConstructor* constructor_;
5904};
5905
5906
5907void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5908 CharacterRange range(from, entry.to());
5909 constructor_->AddRange(range);
5910}
5911
5912
5913void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5914 if (node->being_calculated())
5915 return;
5916 DispatchTable* table = node->GetTable(ignore_case_);
5917 AddDispatchRange adder(this);
5918 table->ForEach(&adder);
5919}
5920
5921
5922void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5923 // TODO(160): Find the node that we refer back to and propagate its start
5924 // set back to here. For now we just accept anything.
5925 AddRange(CharacterRange::Everything());
5926}
5927
5928
ager@chromium.orgddb913d2009-01-27 10:01:48 +00005929void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5930 RegExpNode* target = that->on_success();
5931 target->Accept(this);
5932}
5933
5934
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005935static int CompareRangeByFrom(const CharacterRange* a,
5936 const CharacterRange* b) {
5937 return Compare<uc16>(a->from(), b->from());
5938}
5939
5940
5941void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5942 ranges->Sort(CompareRangeByFrom);
5943 uc16 last = 0;
5944 for (int i = 0; i < ranges->length(); i++) {
5945 CharacterRange range = ranges->at(i);
5946 if (last < range.from())
5947 AddRange(CharacterRange(last, range.from() - 1));
5948 if (range.to() >= last) {
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005949 if (range.to() == String::kMaxUtf16CodeUnit) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005950 return;
5951 } else {
5952 last = range.to() + 1;
5953 }
5954 }
5955 }
yangguo@chromium.org154ff992012-03-13 08:09:54 +00005956 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005957}
5958
5959
5960void DispatchTableConstructor::VisitText(TextNode* that) {
5961 TextElement elm = that->elements()->at(0);
rossberg@chromium.org92597162013-08-23 13:28:00 +00005962 switch (elm.text_type()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005963 case TextElement::ATOM: {
rossberg@chromium.org92597162013-08-23 13:28:00 +00005964 uc16 c = elm.atom()->data()[0];
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005965 AddRange(CharacterRange(c, c));
5966 break;
5967 }
5968 case TextElement::CHAR_CLASS: {
rossberg@chromium.org92597162013-08-23 13:28:00 +00005969 RegExpCharacterClass* tree = elm.char_class();
mmassi@chromium.org7028c052012-06-13 11:51:58 +00005970 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00005971 if (tree->is_negated()) {
5972 AddInverse(ranges);
5973 } else {
5974 for (int i = 0; i < ranges->length(); i++)
5975 AddRange(ranges->at(i));
5976 }
5977 break;
5978 }
5979 default: {
5980 UNIMPLEMENTED();
5981 }
5982 }
5983}
5984
5985
5986void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00005987 RegExpNode* target = that->on_success();
5988 target->Accept(this);
5989}
5990
5991
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005992RegExpEngine::CompilationResult RegExpEngine::Compile(
5993 RegExpCompileData* data,
5994 bool ignore_case,
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00005995 bool is_global,
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00005996 bool is_multiline,
5997 Handle<String> pattern,
5998 Handle<String> sample_subject,
rossberg@chromium.org400388e2012-06-06 09:29:22 +00005999 bool is_ascii,
6000 Zone* zone) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006001 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
dslomov@chromium.orge97852d2013-09-12 09:02:59 +00006002 return IrregexpRegExpTooBig(zone->isolate());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006003 }
rossberg@chromium.org400388e2012-06-06 09:29:22 +00006004 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone);
fschneider@chromium.org7d10be52012-04-10 12:30:14 +00006005
6006 // Sample some characters from the middle of the string.
6007 static const int kSampleSize = 128;
6008
6009 FlattenString(sample_subject);
6010 int chars_sampled = 0;
6011 int half_way = (sample_subject->length() - kSampleSize) / 2;
6012 for (int i = Max(0, half_way);
6013 i < sample_subject->length() && chars_sampled < kSampleSize;
6014 i++, chars_sampled++) {
6015 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6016 }
6017
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006018 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00006019 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006020 0,
6021 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00006022 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006023 RegExpNode* node = captured_body;
whesse@chromium.org4a5224e2010-10-20 12:37:07 +00006024 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6025 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6026 int max_length = data->tree->max_match();
6027 if (!is_start_anchored) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006028 // Add a .*? at the beginning, outside the body capture, unless
6029 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00006030 RegExpNode* loop_node =
6031 RegExpQuantifier::ToNode(0,
6032 RegExpTree::kInfinity,
6033 false,
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006034 new(zone) RegExpCharacterClass('*'),
iposva@chromium.org245aa852009-02-10 00:49:54 +00006035 &compiler,
6036 captured_body,
6037 data->contains_anchor);
6038
6039 if (data->contains_anchor) {
6040 // Unroll loop once, to take care of the case that might start
6041 // at the start of input.
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006042 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
iposva@chromium.org245aa852009-02-10 00:49:54 +00006043 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6044 first_step_node->AddAlternative(GuardedAlternative(
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006045 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
iposva@chromium.org245aa852009-02-10 00:49:54 +00006046 node = first_step_node;
6047 } else {
6048 node = loop_node;
6049 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00006050 }
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006051 if (is_ascii) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00006052 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006053 // Do it again to propagate the new nodes to places where they were not
6054 // put because they had not been calculated yet.
jkummerow@chromium.org59297c72013-01-09 16:32:23 +00006055 if (node != NULL) {
6056 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6057 }
danno@chromium.orgb10deab2012-05-07 14:28:47 +00006058 }
danno@chromium.org1044a4d2012-04-30 12:34:39 +00006059
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006060 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00006061 data->node = node;
ager@chromium.org38e4c712009-11-11 09:11:58 +00006062 Analysis analysis(ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006063 analysis.EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00006064 if (analysis.has_failed()) {
6065 const char* error_message = analysis.error_message();
dslomov@chromium.orge97852d2013-09-12 09:02:59 +00006066 return CompilationResult(zone->isolate(), error_message);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00006067 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006068
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006069 // Create the correct assembler for the architecture.
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006070#ifndef V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006071 // Native regexp implementation.
6072
6073 NativeRegExpMacroAssembler::Mode mode =
6074 is_ascii ? NativeRegExpMacroAssembler::ASCII
6075 : NativeRegExpMacroAssembler::UC16;
6076
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006077#if V8_TARGET_ARCH_IA32
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006078 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6079 zone);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006080#elif V8_TARGET_ARCH_X64
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006081 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6082 zone);
ager@chromium.org18ad94b2009-09-02 08:22:29 +00006083#elif V8_TARGET_ARCH_ARM
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006084 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6085 zone);
lrn@chromium.org7516f052011-03-30 08:52:27 +00006086#elif V8_TARGET_ARCH_MIPS
mmassi@chromium.org7028c052012-06-13 11:51:58 +00006087 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6088 zone);
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006089#endif
6090
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006091#else // V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006092 // Interpreted regexp implementation.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006093 EmbeddedVector<byte, 1024> codes;
yangguo@chromium.org5a11aaf2012-06-20 11:29:00 +00006094 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
ricow@chromium.orgc9c80822010-04-21 08:22:37 +00006095#endif // V8_INTERPRETED_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00006096
whesse@chromium.org4a5224e2010-10-20 12:37:07 +00006097 // Inserted here, instead of in Assembler, because it depends on information
6098 // in the AST that isn't replicated in the Node structure.
6099 static const int kMaxBacksearchLimit = 1024;
6100 if (is_end_anchored &&
6101 !is_start_anchored &&
6102 max_length < kMaxBacksearchLimit) {
6103 macro_assembler.SetCurrentPositionFromEnd(max_length);
6104 }
6105
rossberg@chromium.org400388e2012-06-06 09:29:22 +00006106 if (is_global) {
6107 macro_assembler.set_global_mode(
6108 (data->tree->min_match() > 0)
6109 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6110 : RegExpMacroAssembler::GLOBAL);
6111 }
mstarzinger@chromium.org15613d02012-05-23 12:04:37 +00006112
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006113 return compiler.Assemble(&macro_assembler,
6114 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00006115 data->capture_count,
6116 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00006117}
6118
fschneider@chromium.org0c20e672010-01-14 15:28:53 +00006119
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00006120}} // namespace v8::internal