blob: 51fef71464d58b3a7395fee48d6373cfb9b0aea1 [file] [log] [blame]
Steve Blocka7e24c12009-10-30 11:49:00 +00001// Copyright 2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28
29#include <stdlib.h>
30
31#include "v8.h"
32
33#include "string-stream.h"
34#include "cctest.h"
35#include "zone-inl.h"
36#include "parser.h"
37#include "ast.h"
38#include "jsregexp.h"
39#include "regexp-macro-assembler.h"
40#include "regexp-macro-assembler-irregexp.h"
Steve Block6ded16b2010-05-10 14:33:55 +010041#ifdef V8_INTERPRETED_REGEXP
42#include "interpreter-irregexp.h"
43#else // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +000044#ifdef V8_TARGET_ARCH_ARM
45#include "arm/macro-assembler-arm.h"
46#include "arm/regexp-macro-assembler-arm.h"
47#endif
48#ifdef V8_TARGET_ARCH_X64
49#include "x64/macro-assembler-x64.h"
50#include "x64/regexp-macro-assembler-x64.h"
51#endif
52#ifdef V8_TARGET_ARCH_IA32
53#include "ia32/macro-assembler-ia32.h"
54#include "ia32/regexp-macro-assembler-ia32.h"
55#endif
Steve Block6ded16b2010-05-10 14:33:55 +010056#endif // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +000057
58using namespace v8::internal;
59
60
Leon Clarkee46be812010-01-19 14:06:41 +000061static bool CheckParse(const char* input) {
62 V8::Initialize(NULL);
63 v8::HandleScope scope;
64 ZoneScope zone_scope(DELETE_ON_EXIT);
65 FlatStringReader reader(CStrVector(input));
66 RegExpCompileData result;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080067 return v8::internal::RegExpParser::ParseRegExp(&reader, false, &result);
Leon Clarkee46be812010-01-19 14:06:41 +000068}
69
70
Steve Blocka7e24c12009-10-30 11:49:00 +000071static SmartPointer<const char> Parse(const char* input) {
72 V8::Initialize(NULL);
73 v8::HandleScope scope;
74 ZoneScope zone_scope(DELETE_ON_EXIT);
75 FlatStringReader reader(CStrVector(input));
76 RegExpCompileData result;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080077 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
Steve Blocka7e24c12009-10-30 11:49:00 +000078 CHECK(result.tree != NULL);
79 CHECK(result.error.is_null());
80 SmartPointer<const char> output = result.tree->ToString();
81 return output;
82}
83
84static bool CheckSimple(const char* input) {
85 V8::Initialize(NULL);
86 v8::HandleScope scope;
Steve Blockd0582a62009-12-15 09:54:21 +000087 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
Steve Blocka7e24c12009-10-30 11:49:00 +000088 ZoneScope zone_scope(DELETE_ON_EXIT);
89 FlatStringReader reader(CStrVector(input));
90 RegExpCompileData result;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080091 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
Steve Blocka7e24c12009-10-30 11:49:00 +000092 CHECK(result.tree != NULL);
93 CHECK(result.error.is_null());
94 return result.simple;
95}
96
97struct MinMaxPair {
98 int min_match;
99 int max_match;
100};
101
102static MinMaxPair CheckMinMaxMatch(const char* input) {
103 V8::Initialize(NULL);
104 v8::HandleScope scope;
Steve Blockd0582a62009-12-15 09:54:21 +0000105 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
Steve Blocka7e24c12009-10-30 11:49:00 +0000106 ZoneScope zone_scope(DELETE_ON_EXIT);
107 FlatStringReader reader(CStrVector(input));
108 RegExpCompileData result;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800109 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
Steve Blocka7e24c12009-10-30 11:49:00 +0000110 CHECK(result.tree != NULL);
111 CHECK(result.error.is_null());
112 int min_match = result.tree->min_match();
113 int max_match = result.tree->max_match();
114 MinMaxPair pair = { min_match, max_match };
115 return pair;
116}
117
118
Leon Clarkee46be812010-01-19 14:06:41 +0000119#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
Steve Blocka7e24c12009-10-30 11:49:00 +0000120#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
121#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
122#define CHECK_MIN_MAX(input, min, max) \
123 { MinMaxPair min_max = CheckMinMaxMatch(input); \
124 CHECK_EQ(min, min_max.min_match); \
125 CHECK_EQ(max, min_max.max_match); \
126 }
127
128TEST(Parser) {
129 V8::Initialize(NULL);
Leon Clarkee46be812010-01-19 14:06:41 +0000130
131 CHECK_PARSE_ERROR("?");
132
Steve Blocka7e24c12009-10-30 11:49:00 +0000133 CHECK_PARSE_EQ("abc", "'abc'");
134 CHECK_PARSE_EQ("", "%");
135 CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')");
136 CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
137 CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)");
138 CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
139 CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
140 CHECK_PARSE_EQ("a*", "(# 0 - g 'a')");
141 CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')");
142 CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))");
143 CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))");
144 CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))");
145 CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))");
146 CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
147 CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
148 CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
149 CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
150 CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
151 CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
152 CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
153 CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
154 CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
155 CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
156 CHECK_PARSE_EQ("(?:foo)", "'foo'");
157 CHECK_PARSE_EQ("(?: foo )", "' foo '");
158 CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
159 CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
160 CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
161 CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
162 CHECK_PARSE_EQ("()", "(^ %)");
163 CHECK_PARSE_EQ("(?=)", "(-> + %)");
164 CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows
165 CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252
166 CHECK_PARSE_EQ("[x]", "[x]");
167 CHECK_PARSE_EQ("[xyz]", "[x y z]");
168 CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
169 CHECK_PARSE_EQ("[-123]", "[- 1 2 3]");
170 CHECK_PARSE_EQ("[^123]", "^[1 2 3]");
171 CHECK_PARSE_EQ("]", "']'");
172 CHECK_PARSE_EQ("}", "'}'");
173 CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
174 CHECK_PARSE_EQ("[\\d]", "[0-9]");
175 CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
176 CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
177 CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
178 CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
Ben Murdoch086aeea2011-05-13 15:57:08 +0100179 // Control character outside character class.
Steve Blocka7e24c12009-10-30 11:49:00 +0000180 CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
181 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
Ben Murdoch086aeea2011-05-13 15:57:08 +0100182 CHECK_PARSE_EQ("\\c!", "'\\c!'");
183 CHECK_PARSE_EQ("\\c_", "'\\c_'");
184 CHECK_PARSE_EQ("\\c~", "'\\c~'");
185 CHECK_PARSE_EQ("\\c1", "'\\c1'");
186 // Control character inside character class.
187 CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
188 CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
189 CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
190 CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
191 CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
192 CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
193 CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
194 CHECK_PARSE_EQ("[\\c1]", "[\\x11]");
195
Steve Blocka7e24c12009-10-30 11:49:00 +0000196 CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
197 CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
198 CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]");
199 CHECK_PARSE_EQ("\\0", "'\\x00'");
200 CHECK_PARSE_EQ("\\8", "'8'");
201 CHECK_PARSE_EQ("\\9", "'9'");
202 CHECK_PARSE_EQ("\\11", "'\\x09'");
203 CHECK_PARSE_EQ("\\11a", "'\\x09a'");
204 CHECK_PARSE_EQ("\\011", "'\\x09'");
205 CHECK_PARSE_EQ("\\00011", "'\\x0011'");
206 CHECK_PARSE_EQ("\\118", "'\\x098'");
207 CHECK_PARSE_EQ("\\111", "'I'");
208 CHECK_PARSE_EQ("\\1111", "'I1'");
209 CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
210 CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
211 CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
212 CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
213 CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')"
214 " (# 0 - g (<- 1)))");
215 CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')"
216 " (# 0 - g (<- 2)))");
217 CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')"
218 " (# 0 - g (<- 3)))");
219 CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')"
220 " (# 0 - g '\\x04'))");
221 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
222 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
223 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
224 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
225 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
226 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
227 CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
228 CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
229 CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
230 CHECK_PARSE_EQ("(?=a)?a", "'a'");
231 CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
232 CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
233 CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
234 CHECK_PARSE_EQ("(?!a)?a", "'a'");
235 CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
236 CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
237 CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
238 CHECK_PARSE_EQ("[\\0]", "[\\x00]");
239 CHECK_PARSE_EQ("[\\11]", "[\\x09]");
240 CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
241 CHECK_PARSE_EQ("[\\011]", "[\\x09]");
242 CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]");
243 CHECK_PARSE_EQ("[\\118]", "[\\x09 8]");
244 CHECK_PARSE_EQ("[\\111]", "[I]");
245 CHECK_PARSE_EQ("[\\1111]", "[I 1]");
246 CHECK_PARSE_EQ("\\x34", "'\x34'");
247 CHECK_PARSE_EQ("\\x60", "'\x60'");
248 CHECK_PARSE_EQ("\\x3z", "'x3z'");
Ben Murdoch086aeea2011-05-13 15:57:08 +0100249 CHECK_PARSE_EQ("\\c", "'\\c'");
Steve Blocka7e24c12009-10-30 11:49:00 +0000250 CHECK_PARSE_EQ("\\u0034", "'\x34'");
251 CHECK_PARSE_EQ("\\u003z", "'u003z'");
252 CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
253
254 CHECK_SIMPLE("a", true);
255 CHECK_SIMPLE("a|b", false);
256 CHECK_SIMPLE("a\\n", false);
257 CHECK_SIMPLE("^a", false);
258 CHECK_SIMPLE("a$", false);
259 CHECK_SIMPLE("a\\b!", false);
260 CHECK_SIMPLE("a\\Bb", false);
261 CHECK_SIMPLE("a*", false);
262 CHECK_SIMPLE("a*?", false);
263 CHECK_SIMPLE("a?", false);
264 CHECK_SIMPLE("a??", false);
265 CHECK_SIMPLE("a{0,1}?", false);
266 CHECK_SIMPLE("a{1,1}?", false);
267 CHECK_SIMPLE("a{1,2}?", false);
268 CHECK_SIMPLE("a+?", false);
269 CHECK_SIMPLE("(a)", false);
270 CHECK_SIMPLE("(a)\\1", false);
271 CHECK_SIMPLE("(\\1a)", false);
272 CHECK_SIMPLE("\\1(a)", false);
273 CHECK_SIMPLE("a\\s", false);
274 CHECK_SIMPLE("a\\S", false);
275 CHECK_SIMPLE("a\\d", false);
276 CHECK_SIMPLE("a\\D", false);
277 CHECK_SIMPLE("a\\w", false);
278 CHECK_SIMPLE("a\\W", false);
279 CHECK_SIMPLE("a.", false);
280 CHECK_SIMPLE("a\\q", false);
281 CHECK_SIMPLE("a[a]", false);
282 CHECK_SIMPLE("a[^a]", false);
283 CHECK_SIMPLE("a[a-z]", false);
284 CHECK_SIMPLE("a[\\q]", false);
285 CHECK_SIMPLE("a(?:b)", false);
286 CHECK_SIMPLE("a(?=b)", false);
287 CHECK_SIMPLE("a(?!b)", false);
288 CHECK_SIMPLE("\\x60", false);
289 CHECK_SIMPLE("\\u0060", false);
290 CHECK_SIMPLE("\\cA", false);
291 CHECK_SIMPLE("\\q", false);
292 CHECK_SIMPLE("\\1112", false);
293 CHECK_SIMPLE("\\0", false);
294 CHECK_SIMPLE("(a)\\1", false);
295 CHECK_SIMPLE("(?=a)?a", false);
296 CHECK_SIMPLE("(?!a)?a\\1", false);
297 CHECK_SIMPLE("(?:(?=a))a\\1", false);
298
299 CHECK_PARSE_EQ("a{}", "'a{}'");
300 CHECK_PARSE_EQ("a{,}", "'a{,}'");
301 CHECK_PARSE_EQ("a{", "'a{'");
302 CHECK_PARSE_EQ("a{z}", "'a{z}'");
303 CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
304 CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
305 CHECK_PARSE_EQ("a{12,", "'a{12,'");
306 CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
307 CHECK_PARSE_EQ("{}", "'{}'");
308 CHECK_PARSE_EQ("{,}", "'{,}'");
309 CHECK_PARSE_EQ("{", "'{'");
310 CHECK_PARSE_EQ("{z}", "'{z}'");
311 CHECK_PARSE_EQ("{1z}", "'{1z}'");
312 CHECK_PARSE_EQ("{12z}", "'{12z}'");
313 CHECK_PARSE_EQ("{12,", "'{12,'");
314 CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
315
316 CHECK_MIN_MAX("a", 1, 1);
317 CHECK_MIN_MAX("abc", 3, 3);
318 CHECK_MIN_MAX("a[bc]d", 3, 3);
319 CHECK_MIN_MAX("a|bc", 1, 2);
320 CHECK_MIN_MAX("ab|c", 1, 2);
321 CHECK_MIN_MAX("a||bc", 0, 2);
322 CHECK_MIN_MAX("|", 0, 0);
323 CHECK_MIN_MAX("(?:ab)", 2, 2);
324 CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
325 CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
326 CHECK_MIN_MAX("(ab)", 2, 2);
327 CHECK_MIN_MAX("(ab|cde)", 2, 3);
328 CHECK_MIN_MAX("(ab)\\1", 2, 4);
329 CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
330 CHECK_MIN_MAX("(?:ab)?", 0, 2);
331 CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
332 CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
333 CHECK_MIN_MAX("a?", 0, 1);
334 CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
335 CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
336 CHECK_MIN_MAX("a??", 0, 1);
337 CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
338 CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
339 CHECK_MIN_MAX("(?:a?)?", 0, 1);
340 CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
341 CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
342 CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
343 CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
344 CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
345 CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
346 CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
347 CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
348 CHECK_MIN_MAX("a{0}", 0, 0);
349 CHECK_MIN_MAX("(?:a+){0}", 0, 0);
350 CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
351 CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
352 CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
353 CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
354 CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
355 CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
356 CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
357 CHECK_MIN_MAX("a\\bc", 2, 2);
358 CHECK_MIN_MAX("a\\Bc", 2, 2);
359 CHECK_MIN_MAX("a\\sc", 3, 3);
360 CHECK_MIN_MAX("a\\Sc", 3, 3);
361 CHECK_MIN_MAX("a(?=b)c", 2, 2);
362 CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
363 CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
364}
365
366TEST(ParserRegression) {
367 CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
368 CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
369 CHECK_PARSE_EQ("{", "'{'");
370 CHECK_PARSE_EQ("a|", "(| 'a' %)");
371}
372
373static void ExpectError(const char* input,
374 const char* expected) {
375 V8::Initialize(NULL);
376 v8::HandleScope scope;
377 ZoneScope zone_scope(DELETE_ON_EXIT);
378 FlatStringReader reader(CStrVector(input));
379 RegExpCompileData result;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800380 CHECK(!v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
Steve Blocka7e24c12009-10-30 11:49:00 +0000381 CHECK(result.tree == NULL);
382 CHECK(!result.error.is_null());
383 SmartPointer<char> str = result.error->ToCString(ALLOW_NULLS);
384 CHECK_EQ(expected, *str);
385}
386
387
388TEST(Errors) {
389 V8::Initialize(NULL);
390 const char* kEndBackslash = "\\ at end of pattern";
391 ExpectError("\\", kEndBackslash);
392 const char* kUnterminatedGroup = "Unterminated group";
393 ExpectError("(foo", kUnterminatedGroup);
394 const char* kInvalidGroup = "Invalid group";
395 ExpectError("(?", kInvalidGroup);
396 const char* kUnterminatedCharacterClass = "Unterminated character class";
397 ExpectError("[", kUnterminatedCharacterClass);
398 ExpectError("[a-", kUnterminatedCharacterClass);
399 const char* kNothingToRepeat = "Nothing to repeat";
400 ExpectError("*", kNothingToRepeat);
401 ExpectError("?", kNothingToRepeat);
402 ExpectError("+", kNothingToRepeat);
403 ExpectError("{1}", kNothingToRepeat);
404 ExpectError("{1,2}", kNothingToRepeat);
405 ExpectError("{1,}", kNothingToRepeat);
406
407 // Check that we don't allow more than kMaxCapture captures
408 const int kMaxCaptures = 1 << 16; // Must match RegExpParser::kMaxCaptures.
409 const char* kTooManyCaptures = "Too many captures";
410 HeapStringAllocator allocator;
411 StringStream accumulator(&allocator);
412 for (int i = 0; i <= kMaxCaptures; i++) {
413 accumulator.Add("()");
414 }
415 SmartPointer<const char> many_captures(accumulator.ToCString());
416 ExpectError(*many_captures, kTooManyCaptures);
417}
418
419
420static bool IsDigit(uc16 c) {
421 return ('0' <= c && c <= '9');
422}
423
424
425static bool NotDigit(uc16 c) {
426 return !IsDigit(c);
427}
428
429
430static bool IsWhiteSpace(uc16 c) {
431 switch (c) {
432 case 0x09:
433 case 0x0A:
434 case 0x0B:
435 case 0x0C:
436 case 0x0d:
437 case 0x20:
438 case 0xA0:
439 case 0x2028:
440 case 0x2029:
441 return true;
442 default:
443 return unibrow::Space::Is(c);
444 }
445}
446
447
448static bool NotWhiteSpace(uc16 c) {
449 return !IsWhiteSpace(c);
450}
451
452
453static bool NotWord(uc16 c) {
454 return !IsRegExpWord(c);
455}
456
457
458static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
459 ZoneScope scope(DELETE_ON_EXIT);
460 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
461 CharacterRange::AddClassEscape(c, ranges);
462 for (unsigned i = 0; i < (1 << 16); i++) {
463 bool in_class = false;
464 for (int j = 0; !in_class && j < ranges->length(); j++) {
465 CharacterRange& range = ranges->at(j);
466 in_class = (range.from() <= i && i <= range.to());
467 }
468 CHECK_EQ(pred(i), in_class);
469 }
470}
471
472
473TEST(CharacterClassEscapes) {
474 TestCharacterClassEscapes('.', IsRegExpNewline);
475 TestCharacterClassEscapes('d', IsDigit);
476 TestCharacterClassEscapes('D', NotDigit);
477 TestCharacterClassEscapes('s', IsWhiteSpace);
478 TestCharacterClassEscapes('S', NotWhiteSpace);
479 TestCharacterClassEscapes('w', IsRegExpWord);
480 TestCharacterClassEscapes('W', NotWord);
481}
482
483
484static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
485 V8::Initialize(NULL);
486 FlatStringReader reader(CStrVector(input));
487 RegExpCompileData compile_data;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800488 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline,
489 &compile_data))
Steve Blocka7e24c12009-10-30 11:49:00 +0000490 return NULL;
491 Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input));
492 RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
493 return compile_data.node;
494}
495
496
497static void Execute(const char* input,
498 bool multiline,
499 bool is_ascii,
500 bool dot_output = false) {
501 v8::HandleScope scope;
502 ZoneScope zone_scope(DELETE_ON_EXIT);
503 RegExpNode* node = Compile(input, multiline, is_ascii);
504 USE(node);
505#ifdef DEBUG
506 if (dot_output) {
507 RegExpEngine::DotPrint(input, node, false);
508 exit(0);
509 }
510#endif // DEBUG
511}
512
513
514class TestConfig {
515 public:
516 typedef int Key;
517 typedef int Value;
518 static const int kNoKey;
519 static const int kNoValue;
520 static inline int Compare(int a, int b) {
521 if (a < b)
522 return -1;
523 else if (a > b)
524 return 1;
525 else
526 return 0;
527 }
528};
529
530
531const int TestConfig::kNoKey = 0;
532const int TestConfig::kNoValue = 0;
533
534
535static unsigned PseudoRandom(int i, int j) {
536 return ~(~((i * 781) ^ (j * 329)));
537}
538
539
540TEST(SplayTreeSimple) {
541 static const unsigned kLimit = 1000;
542 ZoneScope zone_scope(DELETE_ON_EXIT);
543 ZoneSplayTree<TestConfig> tree;
544 bool seen[kLimit];
545 for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
546#define CHECK_MAPS_EQUAL() do { \
547 for (unsigned k = 0; k < kLimit; k++) \
548 CHECK_EQ(seen[k], tree.Find(k, &loc)); \
549 } while (false)
550 for (int i = 0; i < 50; i++) {
551 for (int j = 0; j < 50; j++) {
552 unsigned next = PseudoRandom(i, j) % kLimit;
553 if (seen[next]) {
554 // We've already seen this one. Check the value and remove
555 // it.
556 ZoneSplayTree<TestConfig>::Locator loc;
557 CHECK(tree.Find(next, &loc));
558 CHECK_EQ(next, loc.key());
559 CHECK_EQ(3 * next, loc.value());
560 tree.Remove(next);
561 seen[next] = false;
562 CHECK_MAPS_EQUAL();
563 } else {
564 // Check that it wasn't there already and then add it.
565 ZoneSplayTree<TestConfig>::Locator loc;
566 CHECK(!tree.Find(next, &loc));
567 CHECK(tree.Insert(next, &loc));
568 CHECK_EQ(next, loc.key());
569 loc.set_value(3 * next);
570 seen[next] = true;
571 CHECK_MAPS_EQUAL();
572 }
573 int val = PseudoRandom(j, i) % kLimit;
574 if (seen[val]) {
575 ZoneSplayTree<TestConfig>::Locator loc;
576 CHECK(tree.FindGreatestLessThan(val, &loc));
577 CHECK_EQ(loc.key(), val);
578 break;
579 }
580 val = PseudoRandom(i + j, i - j) % kLimit;
581 if (seen[val]) {
582 ZoneSplayTree<TestConfig>::Locator loc;
583 CHECK(tree.FindLeastGreaterThan(val, &loc));
584 CHECK_EQ(loc.key(), val);
585 break;
586 }
587 }
588 }
589}
590
591
592TEST(DispatchTableConstruction) {
593 // Initialize test data.
594 static const int kLimit = 1000;
595 static const int kRangeCount = 8;
596 static const int kRangeSize = 16;
597 uc16 ranges[kRangeCount][2 * kRangeSize];
598 for (int i = 0; i < kRangeCount; i++) {
599 Vector<uc16> range(ranges[i], 2 * kRangeSize);
600 for (int j = 0; j < 2 * kRangeSize; j++) {
601 range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
602 }
603 range.Sort();
604 for (int j = 1; j < 2 * kRangeSize; j++) {
605 CHECK(range[j-1] <= range[j]);
606 }
607 }
608 // Enter test data into dispatch table.
609 ZoneScope zone_scope(DELETE_ON_EXIT);
610 DispatchTable table;
611 for (int i = 0; i < kRangeCount; i++) {
612 uc16* range = ranges[i];
613 for (int j = 0; j < 2 * kRangeSize; j += 2)
614 table.AddRange(CharacterRange(range[j], range[j + 1]), i);
615 }
616 // Check that the table looks as we would expect
617 for (int p = 0; p < kLimit; p++) {
618 OutSet* outs = table.Get(p);
619 for (int j = 0; j < kRangeCount; j++) {
620 uc16* range = ranges[j];
621 bool is_on = false;
622 for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
623 is_on = (range[k] <= p && p <= range[k + 1]);
624 CHECK_EQ(is_on, outs->Get(j));
625 }
626 }
627}
628
Leon Clarkee46be812010-01-19 14:06:41 +0000629// Test of debug-only syntax.
630#ifdef DEBUG
631
632TEST(ParsePossessiveRepetition) {
633 bool old_flag_value = FLAG_regexp_possessive_quantifier;
634
635 // Enable possessive quantifier syntax.
636 FLAG_regexp_possessive_quantifier = true;
637
638 CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
639 CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
640 CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
641 CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
642 CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
643
644 // Disable possessive quantifier syntax.
645 FLAG_regexp_possessive_quantifier = false;
646
647 CHECK_PARSE_ERROR("a*+");
648 CHECK_PARSE_ERROR("a++");
649 CHECK_PARSE_ERROR("a?+");
650 CHECK_PARSE_ERROR("a{10,20}+");
651 CHECK_PARSE_ERROR("a{10,20}+b");
652
653 FLAG_regexp_possessive_quantifier = old_flag_value;
654}
655
656#endif
Steve Blocka7e24c12009-10-30 11:49:00 +0000657
658// Tests of interpreter.
659
660
Steve Block6ded16b2010-05-10 14:33:55 +0100661#ifndef V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +0000662
663#if V8_TARGET_ARCH_IA32
664typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
665#elif V8_TARGET_ARCH_X64
666typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
667#elif V8_TARGET_ARCH_ARM
668typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
Andrei Popescu31002712010-02-23 13:46:05 +0000669#elif V8_TARGET_ARCH_MIPS
670typedef RegExpMacroAssembler ArchRegExpMacroAssembler;
Steve Blocka7e24c12009-10-30 11:49:00 +0000671#endif
672
673class ContextInitializer {
674 public:
675 ContextInitializer()
676 : env_(), scope_(), zone_(DELETE_ON_EXIT), stack_guard_() {
677 env_ = v8::Context::New();
678 env_->Enter();
679 }
680 ~ContextInitializer() {
681 env_->Exit();
682 env_.Dispose();
683 }
684 private:
685 v8::Persistent<v8::Context> env_;
686 v8::HandleScope scope_;
687 v8::internal::ZoneScope zone_;
688 v8::internal::StackGuard stack_guard_;
689};
690
691
692static ArchRegExpMacroAssembler::Result Execute(Code* code,
693 String* input,
694 int start_offset,
695 const byte* input_start,
696 const byte* input_end,
Leon Clarked91b9f72010-01-27 17:25:45 +0000697 int* captures) {
Steve Blocka7e24c12009-10-30 11:49:00 +0000698 return NativeRegExpMacroAssembler::Execute(
699 code,
700 input,
701 start_offset,
702 input_start,
703 input_end,
Leon Clarked91b9f72010-01-27 17:25:45 +0000704 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000705}
706
707
708TEST(MacroAssemblerNativeSuccess) {
709 v8::V8::Initialize();
710 ContextInitializer initializer;
711
712 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
713
714 m.Succeed();
715
716 Handle<String> source = Factory::NewStringFromAscii(CStrVector(""));
717 Handle<Object> code_object = m.GetCode(source);
718 Handle<Code> code = Handle<Code>::cast(code_object);
719
720 int captures[4] = {42, 37, 87, 117};
721 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
722 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
723 const byte* start_adr =
724 reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
725
726 NativeRegExpMacroAssembler::Result result =
727 Execute(*code,
728 *input,
729 0,
730 start_adr,
731 start_adr + seq_input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000732 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000733
734 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
735 CHECK_EQ(-1, captures[0]);
736 CHECK_EQ(-1, captures[1]);
737 CHECK_EQ(-1, captures[2]);
738 CHECK_EQ(-1, captures[3]);
739}
740
741
742TEST(MacroAssemblerNativeSimple) {
743 v8::V8::Initialize();
744 ContextInitializer initializer;
745
746 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
747
748 uc16 foo_chars[3] = {'f', 'o', 'o'};
749 Vector<const uc16> foo(foo_chars, 3);
750
751 Label fail;
752 m.CheckCharacters(foo, 0, &fail, true);
753 m.WriteCurrentPositionToRegister(0, 0);
754 m.AdvanceCurrentPosition(3);
755 m.WriteCurrentPositionToRegister(1, 0);
756 m.Succeed();
757 m.Bind(&fail);
758 m.Fail();
759
760 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo"));
761 Handle<Object> code_object = m.GetCode(source);
762 Handle<Code> code = Handle<Code>::cast(code_object);
763
764 int captures[4] = {42, 37, 87, 117};
765 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
766 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
767 Address start_adr = seq_input->GetCharsAddress();
768
769 NativeRegExpMacroAssembler::Result result =
770 Execute(*code,
771 *input,
772 0,
773 start_adr,
774 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000775 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000776
777 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
778 CHECK_EQ(0, captures[0]);
779 CHECK_EQ(3, captures[1]);
780 CHECK_EQ(-1, captures[2]);
781 CHECK_EQ(-1, captures[3]);
782
783 input = Factory::NewStringFromAscii(CStrVector("barbarbar"));
784 seq_input = Handle<SeqAsciiString>::cast(input);
785 start_adr = seq_input->GetCharsAddress();
786
787 result = Execute(*code,
788 *input,
789 0,
790 start_adr,
791 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000792 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000793
794 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
795}
796
797
798TEST(MacroAssemblerNativeSimpleUC16) {
799 v8::V8::Initialize();
800 ContextInitializer initializer;
801
802 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
803
804 uc16 foo_chars[3] = {'f', 'o', 'o'};
805 Vector<const uc16> foo(foo_chars, 3);
806
807 Label fail;
808 m.CheckCharacters(foo, 0, &fail, true);
809 m.WriteCurrentPositionToRegister(0, 0);
810 m.AdvanceCurrentPosition(3);
811 m.WriteCurrentPositionToRegister(1, 0);
812 m.Succeed();
813 m.Bind(&fail);
814 m.Fail();
815
816 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo"));
817 Handle<Object> code_object = m.GetCode(source);
818 Handle<Code> code = Handle<Code>::cast(code_object);
819
820 int captures[4] = {42, 37, 87, 117};
821 const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', '\xa0'};
822 Handle<String> input =
823 Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
824 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
825 Address start_adr = seq_input->GetCharsAddress();
826
827 NativeRegExpMacroAssembler::Result result =
828 Execute(*code,
829 *input,
830 0,
831 start_adr,
832 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000833 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000834
835 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
836 CHECK_EQ(0, captures[0]);
837 CHECK_EQ(3, captures[1]);
838 CHECK_EQ(-1, captures[2]);
839 CHECK_EQ(-1, captures[3]);
840
841 const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', '\xa0'};
842 input = Factory::NewStringFromTwoByte(Vector<const uc16>(input_data2, 9));
843 seq_input = Handle<SeqTwoByteString>::cast(input);
844 start_adr = seq_input->GetCharsAddress();
845
846 result = Execute(*code,
847 *input,
848 0,
849 start_adr,
850 start_adr + input->length() * 2,
Leon Clarked91b9f72010-01-27 17:25:45 +0000851 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +0000852
853 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
854}
855
856
857TEST(MacroAssemblerNativeBacktrack) {
858 v8::V8::Initialize();
859 ContextInitializer initializer;
860
861 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
862
863 Label fail;
864 Label backtrack;
865 m.LoadCurrentCharacter(10, &fail);
866 m.Succeed();
867 m.Bind(&fail);
868 m.PushBacktrack(&backtrack);
869 m.LoadCurrentCharacter(10, NULL);
870 m.Succeed();
871 m.Bind(&backtrack);
872 m.Fail();
873
874 Handle<String> source = Factory::NewStringFromAscii(CStrVector(".........."));
875 Handle<Object> code_object = m.GetCode(source);
876 Handle<Code> code = Handle<Code>::cast(code_object);
877
878 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
879 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
880 Address start_adr = seq_input->GetCharsAddress();
881
882 NativeRegExpMacroAssembler::Result result =
883 Execute(*code,
884 *input,
885 0,
886 start_adr,
887 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000888 NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +0000889
890 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
891}
892
893
894TEST(MacroAssemblerNativeBackReferenceASCII) {
895 v8::V8::Initialize();
896 ContextInitializer initializer;
897
898 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
899
900 m.WriteCurrentPositionToRegister(0, 0);
901 m.AdvanceCurrentPosition(2);
902 m.WriteCurrentPositionToRegister(1, 0);
903 Label nomatch;
904 m.CheckNotBackReference(0, &nomatch);
905 m.Fail();
906 m.Bind(&nomatch);
907 m.AdvanceCurrentPosition(2);
908 Label missing_match;
909 m.CheckNotBackReference(0, &missing_match);
910 m.WriteCurrentPositionToRegister(2, 0);
911 m.Succeed();
912 m.Bind(&missing_match);
913 m.Fail();
914
915 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1"));
916 Handle<Object> code_object = m.GetCode(source);
917 Handle<Code> code = Handle<Code>::cast(code_object);
918
919 Handle<String> input = Factory::NewStringFromAscii(CStrVector("fooofo"));
920 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
921 Address start_adr = seq_input->GetCharsAddress();
922
923 int output[4];
924 NativeRegExpMacroAssembler::Result result =
925 Execute(*code,
926 *input,
927 0,
928 start_adr,
929 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +0000930 output);
Steve Blocka7e24c12009-10-30 11:49:00 +0000931
932 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
933 CHECK_EQ(0, output[0]);
934 CHECK_EQ(2, output[1]);
935 CHECK_EQ(6, output[2]);
936 CHECK_EQ(-1, output[3]);
937}
938
939
940TEST(MacroAssemblerNativeBackReferenceUC16) {
941 v8::V8::Initialize();
942 ContextInitializer initializer;
943
944 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
945
946 m.WriteCurrentPositionToRegister(0, 0);
947 m.AdvanceCurrentPosition(2);
948 m.WriteCurrentPositionToRegister(1, 0);
949 Label nomatch;
950 m.CheckNotBackReference(0, &nomatch);
951 m.Fail();
952 m.Bind(&nomatch);
953 m.AdvanceCurrentPosition(2);
954 Label missing_match;
955 m.CheckNotBackReference(0, &missing_match);
956 m.WriteCurrentPositionToRegister(2, 0);
957 m.Succeed();
958 m.Bind(&missing_match);
959 m.Fail();
960
961 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1"));
962 Handle<Object> code_object = m.GetCode(source);
963 Handle<Code> code = Handle<Code>::cast(code_object);
964
965 const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
966 Handle<String> input =
967 Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
968 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
969 Address start_adr = seq_input->GetCharsAddress();
970
971 int output[4];
972 NativeRegExpMacroAssembler::Result result =
973 Execute(*code,
974 *input,
975 0,
976 start_adr,
977 start_adr + input->length() * 2,
Leon Clarked91b9f72010-01-27 17:25:45 +0000978 output);
Steve Blocka7e24c12009-10-30 11:49:00 +0000979
980 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
981 CHECK_EQ(0, output[0]);
982 CHECK_EQ(2, output[1]);
983 CHECK_EQ(6, output[2]);
984 CHECK_EQ(-1, output[3]);
985}
986
987
988
989TEST(MacroAssemblernativeAtStart) {
990 v8::V8::Initialize();
991 ContextInitializer initializer;
992
993 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
994
995 Label not_at_start, newline, fail;
996 m.CheckNotAtStart(&not_at_start);
997 // Check that prevchar = '\n' and current = 'f'.
998 m.CheckCharacter('\n', &newline);
999 m.Bind(&fail);
1000 m.Fail();
1001 m.Bind(&newline);
1002 m.LoadCurrentCharacter(0, &fail);
1003 m.CheckNotCharacter('f', &fail);
1004 m.Succeed();
1005
1006 m.Bind(&not_at_start);
1007 // Check that prevchar = 'o' and current = 'b'.
1008 Label prevo;
1009 m.CheckCharacter('o', &prevo);
1010 m.Fail();
1011 m.Bind(&prevo);
1012 m.LoadCurrentCharacter(0, &fail);
1013 m.CheckNotCharacter('b', &fail);
1014 m.Succeed();
1015
1016 Handle<String> source = Factory::NewStringFromAscii(CStrVector("(^f|ob)"));
1017 Handle<Object> code_object = m.GetCode(source);
1018 Handle<Code> code = Handle<Code>::cast(code_object);
1019
1020 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foobar"));
1021 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1022 Address start_adr = seq_input->GetCharsAddress();
1023
1024 NativeRegExpMacroAssembler::Result result =
1025 Execute(*code,
1026 *input,
1027 0,
1028 start_adr,
1029 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001030 NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00001031
1032 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1033
1034 result = Execute(*code,
1035 *input,
1036 3,
1037 start_adr + 3,
1038 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001039 NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00001040
1041 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1042}
1043
1044
1045TEST(MacroAssemblerNativeBackRefNoCase) {
1046 v8::V8::Initialize();
1047 ContextInitializer initializer;
1048
1049 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
1050
1051 Label fail, succ;
1052
1053 m.WriteCurrentPositionToRegister(0, 0);
1054 m.WriteCurrentPositionToRegister(2, 0);
1055 m.AdvanceCurrentPosition(3);
1056 m.WriteCurrentPositionToRegister(3, 0);
1057 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "AbC".
1058 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "ABC".
1059 Label expected_fail;
1060 m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
1061 m.Bind(&fail);
1062 m.Fail();
1063
1064 m.Bind(&expected_fail);
1065 m.AdvanceCurrentPosition(3); // Skip "xYz"
1066 m.CheckNotBackReferenceIgnoreCase(2, &succ);
1067 m.Fail();
1068
1069 m.Bind(&succ);
1070 m.WriteCurrentPositionToRegister(1, 0);
1071 m.Succeed();
1072
1073 Handle<String> source =
1074 Factory::NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)"));
1075 Handle<Object> code_object = m.GetCode(source);
1076 Handle<Code> code = Handle<Code>::cast(code_object);
1077
1078 Handle<String> input =
1079 Factory::NewStringFromAscii(CStrVector("aBcAbCABCxYzab"));
1080 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1081 Address start_adr = seq_input->GetCharsAddress();
1082
1083 int output[4];
1084 NativeRegExpMacroAssembler::Result result =
1085 Execute(*code,
1086 *input,
1087 0,
1088 start_adr,
1089 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001090 output);
Steve Blocka7e24c12009-10-30 11:49:00 +00001091
1092 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1093 CHECK_EQ(0, output[0]);
1094 CHECK_EQ(12, output[1]);
1095 CHECK_EQ(0, output[2]);
1096 CHECK_EQ(3, output[3]);
1097}
1098
1099
1100
1101TEST(MacroAssemblerNativeRegisters) {
1102 v8::V8::Initialize();
1103 ContextInitializer initializer;
1104
1105 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6);
1106
1107 uc16 foo_chars[3] = {'f', 'o', 'o'};
1108 Vector<const uc16> foo(foo_chars, 3);
1109
1110 enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1111 Label fail;
1112 Label backtrack;
1113 m.WriteCurrentPositionToRegister(out1, 0); // Output: [0]
1114 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1115 m.PushBacktrack(&backtrack);
1116 m.WriteStackPointerToRegister(sp);
1117 // Fill stack and registers
1118 m.AdvanceCurrentPosition(2);
1119 m.WriteCurrentPositionToRegister(out1, 0);
1120 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1121 m.PushBacktrack(&fail);
1122 // Drop backtrack stack frames.
1123 m.ReadStackPointerFromRegister(sp);
1124 // And take the first backtrack (to &backtrack)
1125 m.Backtrack();
1126
1127 m.PushCurrentPosition();
1128 m.AdvanceCurrentPosition(2);
1129 m.PopCurrentPosition();
1130
1131 m.Bind(&backtrack);
1132 m.PopRegister(out1);
1133 m.ReadCurrentPositionFromRegister(out1);
1134 m.AdvanceCurrentPosition(3);
1135 m.WriteCurrentPositionToRegister(out2, 0); // [0,3]
1136
1137 Label loop;
1138 m.SetRegister(loop_cnt, 0); // loop counter
1139 m.Bind(&loop);
1140 m.AdvanceRegister(loop_cnt, 1);
1141 m.AdvanceCurrentPosition(1);
1142 m.IfRegisterLT(loop_cnt, 3, &loop);
1143 m.WriteCurrentPositionToRegister(out3, 0); // [0,3,6]
1144
1145 Label loop2;
1146 m.SetRegister(loop_cnt, 2); // loop counter
1147 m.Bind(&loop2);
1148 m.AdvanceRegister(loop_cnt, -1);
1149 m.AdvanceCurrentPosition(1);
1150 m.IfRegisterGE(loop_cnt, 0, &loop2);
1151 m.WriteCurrentPositionToRegister(out4, 0); // [0,3,6,9]
1152
1153 Label loop3;
1154 Label exit_loop3;
1155 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1156 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1157 m.ReadCurrentPositionFromRegister(out3);
1158 m.Bind(&loop3);
1159 m.AdvanceCurrentPosition(1);
1160 m.CheckGreedyLoop(&exit_loop3);
1161 m.GoTo(&loop3);
1162 m.Bind(&exit_loop3);
1163 m.PopCurrentPosition();
1164 m.WriteCurrentPositionToRegister(out5, 0); // [0,3,6,9,9,-1]
1165
1166 m.Succeed();
1167
1168 m.Bind(&fail);
1169 m.Fail();
1170
1171 Handle<String> source =
1172 Factory::NewStringFromAscii(CStrVector("<loop test>"));
1173 Handle<Object> code_object = m.GetCode(source);
1174 Handle<Code> code = Handle<Code>::cast(code_object);
1175
1176 // String long enough for test (content doesn't matter).
1177 Handle<String> input =
1178 Factory::NewStringFromAscii(CStrVector("foofoofoofoofoo"));
1179 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1180 Address start_adr = seq_input->GetCharsAddress();
1181
1182 int output[6];
1183 NativeRegExpMacroAssembler::Result result =
1184 Execute(*code,
1185 *input,
1186 0,
1187 start_adr,
1188 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001189 output);
Steve Blocka7e24c12009-10-30 11:49:00 +00001190
1191 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1192 CHECK_EQ(0, output[0]);
1193 CHECK_EQ(3, output[1]);
1194 CHECK_EQ(6, output[2]);
1195 CHECK_EQ(9, output[3]);
1196 CHECK_EQ(9, output[4]);
1197 CHECK_EQ(-1, output[5]);
1198}
1199
1200
1201TEST(MacroAssemblerStackOverflow) {
1202 v8::V8::Initialize();
1203 ContextInitializer initializer;
1204
1205 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
1206
1207 Label loop;
1208 m.Bind(&loop);
1209 m.PushBacktrack(&loop);
1210 m.GoTo(&loop);
1211
1212 Handle<String> source =
1213 Factory::NewStringFromAscii(CStrVector("<stack overflow test>"));
1214 Handle<Object> code_object = m.GetCode(source);
1215 Handle<Code> code = Handle<Code>::cast(code_object);
1216
1217 // String long enough for test (content doesn't matter).
1218 Handle<String> input =
1219 Factory::NewStringFromAscii(CStrVector("dummy"));
1220 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1221 Address start_adr = seq_input->GetCharsAddress();
1222
1223 NativeRegExpMacroAssembler::Result result =
1224 Execute(*code,
1225 *input,
1226 0,
1227 start_adr,
1228 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001229 NULL);
Steve Blocka7e24c12009-10-30 11:49:00 +00001230
1231 CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
1232 CHECK(Top::has_pending_exception());
1233 Top::clear_pending_exception();
1234}
1235
1236
1237TEST(MacroAssemblerNativeLotsOfRegisters) {
1238 v8::V8::Initialize();
1239 ContextInitializer initializer;
1240
1241 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2);
1242
1243 // At least 2048, to ensure the allocated space for registers
1244 // span one full page.
1245 const int large_number = 8000;
1246 m.WriteCurrentPositionToRegister(large_number, 42);
1247 m.WriteCurrentPositionToRegister(0, 0);
1248 m.WriteCurrentPositionToRegister(1, 1);
1249 Label done;
1250 m.CheckNotBackReference(0, &done); // Performs a system-stack push.
1251 m.Bind(&done);
1252 m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
1253 m.PopRegister(1);
1254 m.Succeed();
1255
1256 Handle<String> source =
1257 Factory::NewStringFromAscii(CStrVector("<huge register space test>"));
1258 Handle<Object> code_object = m.GetCode(source);
1259 Handle<Code> code = Handle<Code>::cast(code_object);
1260
1261 // String long enough for test (content doesn't matter).
1262 Handle<String> input =
1263 Factory::NewStringFromAscii(CStrVector("sample text"));
1264 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1265 Address start_adr = seq_input->GetCharsAddress();
1266
1267 int captures[2];
1268 NativeRegExpMacroAssembler::Result result =
1269 Execute(*code,
1270 *input,
1271 0,
1272 start_adr,
1273 start_adr + input->length(),
Leon Clarked91b9f72010-01-27 17:25:45 +00001274 captures);
Steve Blocka7e24c12009-10-30 11:49:00 +00001275
1276 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1277 CHECK_EQ(0, captures[0]);
1278 CHECK_EQ(42, captures[1]);
1279
1280 Top::clear_pending_exception();
1281}
1282
Steve Block6ded16b2010-05-10 14:33:55 +01001283#else // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +00001284
1285TEST(MacroAssembler) {
1286 V8::Initialize(NULL);
1287 byte codes[1024];
1288 RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024));
1289 // ^f(o)o.
1290 Label fail, fail2, start;
1291 uc16 foo_chars[3];
1292 foo_chars[0] = 'f';
1293 foo_chars[1] = 'o';
1294 foo_chars[2] = 'o';
1295 Vector<const uc16> foo(foo_chars, 3);
1296 m.SetRegister(4, 42);
1297 m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
1298 m.AdvanceRegister(4, 42);
1299 m.GoTo(&start);
1300 m.Fail();
1301 m.Bind(&start);
1302 m.PushBacktrack(&fail2);
1303 m.CheckCharacters(foo, 0, &fail, true);
1304 m.WriteCurrentPositionToRegister(0, 0);
1305 m.PushCurrentPosition();
1306 m.AdvanceCurrentPosition(3);
1307 m.WriteCurrentPositionToRegister(1, 0);
1308 m.PopCurrentPosition();
1309 m.AdvanceCurrentPosition(1);
1310 m.WriteCurrentPositionToRegister(2, 0);
1311 m.AdvanceCurrentPosition(1);
1312 m.WriteCurrentPositionToRegister(3, 0);
1313 m.Succeed();
1314
1315 m.Bind(&fail);
1316 m.Backtrack();
1317 m.Succeed();
1318
1319 m.Bind(&fail2);
1320 m.PopRegister(0);
1321 m.Fail();
1322
1323 v8::HandleScope scope;
1324
1325 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^f(o)o"));
1326 Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
1327 int captures[5];
1328
1329 const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1330 Handle<String> f1_16 =
1331 Factory::NewStringFromTwoByte(Vector<const uc16>(str1, 6));
1332
1333 CHECK(IrregexpInterpreter::Match(array, f1_16, captures, 0));
1334 CHECK_EQ(0, captures[0]);
1335 CHECK_EQ(3, captures[1]);
1336 CHECK_EQ(1, captures[2]);
1337 CHECK_EQ(2, captures[3]);
1338 CHECK_EQ(84, captures[4]);
1339
1340 const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1341 Handle<String> f2_16 =
1342 Factory::NewStringFromTwoByte(Vector<const uc16>(str2, 6));
1343
1344 CHECK(!IrregexpInterpreter::Match(array, f2_16, captures, 0));
1345 CHECK_EQ(42, captures[0]);
1346}
1347
Steve Block6ded16b2010-05-10 14:33:55 +01001348#endif // V8_INTERPRETED_REGEXP
Steve Blocka7e24c12009-10-30 11:49:00 +00001349
1350
1351TEST(AddInverseToTable) {
1352 static const int kLimit = 1000;
1353 static const int kRangeCount = 16;
1354 for (int t = 0; t < 10; t++) {
1355 ZoneScope zone_scope(DELETE_ON_EXIT);
1356 ZoneList<CharacterRange>* ranges =
1357 new ZoneList<CharacterRange>(kRangeCount);
1358 for (int i = 0; i < kRangeCount; i++) {
1359 int from = PseudoRandom(t + 87, i + 25) % kLimit;
1360 int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
1361 if (to > kLimit) to = kLimit;
1362 ranges->Add(CharacterRange(from, to));
1363 }
1364 DispatchTable table;
1365 DispatchTableConstructor cons(&table, false);
1366 cons.set_choice_index(0);
1367 cons.AddInverse(ranges);
1368 for (int i = 0; i < kLimit; i++) {
1369 bool is_on = false;
1370 for (int j = 0; !is_on && j < kRangeCount; j++)
1371 is_on = ranges->at(j).Contains(i);
1372 OutSet* set = table.Get(i);
1373 CHECK_EQ(is_on, set->Get(0) == false);
1374 }
1375 }
1376 ZoneScope zone_scope(DELETE_ON_EXIT);
1377 ZoneList<CharacterRange>* ranges =
1378 new ZoneList<CharacterRange>(1);
1379 ranges->Add(CharacterRange(0xFFF0, 0xFFFE));
1380 DispatchTable table;
1381 DispatchTableConstructor cons(&table, false);
1382 cons.set_choice_index(0);
1383 cons.AddInverse(ranges);
1384 CHECK(!table.Get(0xFFFE)->Get(0));
1385 CHECK(table.Get(0xFFFF)->Get(0));
1386}
1387
1388
1389static uc32 canonicalize(uc32 c) {
1390 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1391 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1392 if (count == 0) {
1393 return c;
1394 } else {
1395 CHECK_EQ(1, count);
1396 return canon[0];
1397 }
1398}
1399
1400
1401TEST(LatinCanonicalize) {
1402 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1403 for (char lower = 'a'; lower <= 'z'; lower++) {
1404 char upper = lower + ('A' - 'a');
1405 CHECK_EQ(canonicalize(lower), canonicalize(upper));
1406 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1407 int length = un_canonicalize.get(lower, '\0', uncanon);
1408 CHECK_EQ(2, length);
1409 CHECK_EQ(upper, uncanon[0]);
1410 CHECK_EQ(lower, uncanon[1]);
1411 }
1412 for (uc32 c = 128; c < (1 << 21); c++)
1413 CHECK_GE(canonicalize(c), 128);
1414 unibrow::Mapping<unibrow::ToUppercase> to_upper;
Ben Murdochbb769b22010-08-11 14:56:33 +01001415 // Canonicalization is only defined for the Basic Multilingual Plane.
1416 for (uc32 c = 0; c < (1 << 16); c++) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001417 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
1418 int length = to_upper.get(c, '\0', upper);
1419 if (length == 0) {
1420 length = 1;
1421 upper[0] = c;
1422 }
1423 uc32 u = upper[0];
1424 if (length > 1 || (c >= 128 && u < 128))
1425 u = c;
1426 CHECK_EQ(u, canonicalize(c));
1427 }
1428}
1429
1430
Ben Murdochbb769b22010-08-11 14:56:33 +01001431static uc32 CanonRangeEnd(uc32 c) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001432 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1433 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1434 if (count == 0) {
1435 return c;
1436 } else {
1437 CHECK_EQ(1, count);
1438 return canon[0];
1439 }
1440}
1441
1442
1443TEST(RangeCanonicalization) {
Steve Blocka7e24c12009-10-30 11:49:00 +00001444 // Check that we arrive at the same result when using the basic
1445 // range canonicalization primitives as when using immediate
1446 // canonicalization.
1447 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
Ben Murdochbb769b22010-08-11 14:56:33 +01001448 int block_start = 0;
1449 while (block_start <= 0xFFFF) {
1450 uc32 block_end = CanonRangeEnd(block_start);
1451 unsigned block_length = block_end - block_start + 1;
1452 if (block_length > 1) {
1453 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1454 int first_length = un_canonicalize.get(block_start, '\0', first);
1455 for (unsigned i = 1; i < block_length; i++) {
1456 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1457 int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
1458 CHECK_EQ(first_length, succ_length);
1459 for (int j = 0; j < succ_length; j++) {
1460 int calc = first[j] + i;
1461 int found = succ[j];
1462 CHECK_EQ(calc, found);
1463 }
Steve Blocka7e24c12009-10-30 11:49:00 +00001464 }
1465 }
Ben Murdochbb769b22010-08-11 14:56:33 +01001466 block_start = block_start + block_length;
Steve Blocka7e24c12009-10-30 11:49:00 +00001467 }
1468}
1469
1470
1471TEST(UncanonicalizeEquivalence) {
1472 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1473 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1474 for (int i = 0; i < (1 << 16); i++) {
1475 int length = un_canonicalize.get(i, '\0', chars);
1476 for (int j = 0; j < length; j++) {
1477 unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1478 int length2 = un_canonicalize.get(chars[j], '\0', chars2);
1479 CHECK_EQ(length, length2);
1480 for (int k = 0; k < length; k++)
1481 CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
1482 }
1483 }
1484}
1485
1486
1487static void TestRangeCaseIndependence(CharacterRange input,
1488 Vector<CharacterRange> expected) {
1489 ZoneScope zone_scope(DELETE_ON_EXIT);
1490 int count = expected.length();
1491 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count);
Steve Blockd0582a62009-12-15 09:54:21 +00001492 input.AddCaseEquivalents(list, false);
Steve Blocka7e24c12009-10-30 11:49:00 +00001493 CHECK_EQ(count, list->length());
1494 for (int i = 0; i < list->length(); i++) {
1495 CHECK_EQ(expected[i].from(), list->at(i).from());
1496 CHECK_EQ(expected[i].to(), list->at(i).to());
1497 }
1498}
1499
1500
1501static void TestSimpleRangeCaseIndependence(CharacterRange input,
1502 CharacterRange expected) {
1503 EmbeddedVector<CharacterRange, 1> vector;
1504 vector[0] = expected;
1505 TestRangeCaseIndependence(input, vector);
1506}
1507
1508
1509TEST(CharacterRangeCaseIndependence) {
1510 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
1511 CharacterRange::Singleton('A'));
1512 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
1513 CharacterRange::Singleton('Z'));
1514 TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'),
1515 CharacterRange('A', 'Z'));
1516 TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'),
1517 CharacterRange('C', 'F'));
1518 TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'),
1519 CharacterRange('A', 'B'));
1520 TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'),
1521 CharacterRange('Y', 'Z'));
1522 TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1),
1523 CharacterRange('A', 'Z'));
1524 TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'),
1525 CharacterRange('a', 'z'));
1526 TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'),
1527 CharacterRange('c', 'f'));
1528 TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1),
1529 CharacterRange('a', 'z'));
1530 // Here we need to add [l-z] to complete the case independence of
1531 // [A-Za-z] but we expect [a-z] to be added since we always add a
1532 // whole block at a time.
1533 TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'),
1534 CharacterRange('a', 'z'));
1535}
1536
1537
1538static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1539 if (ranges == NULL)
1540 return false;
1541 for (int i = 0; i < ranges->length(); i++) {
1542 CharacterRange range = ranges->at(i);
1543 if (range.from() <= c && c <= range.to())
1544 return true;
1545 }
1546 return false;
1547}
1548
1549
1550TEST(CharClassDifference) {
1551 ZoneScope zone_scope(DELETE_ON_EXIT);
1552 ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1);
1553 base->Add(CharacterRange::Everything());
1554 Vector<const uc16> overlay = CharacterRange::GetWordBounds();
1555 ZoneList<CharacterRange>* included = NULL;
1556 ZoneList<CharacterRange>* excluded = NULL;
1557 CharacterRange::Split(base, overlay, &included, &excluded);
1558 for (int i = 0; i < (1 << 16); i++) {
1559 bool in_base = InClass(i, base);
1560 if (in_base) {
1561 bool in_overlay = false;
1562 for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
1563 if (overlay[j] <= i && i <= overlay[j+1])
1564 in_overlay = true;
1565 }
1566 CHECK_EQ(in_overlay, InClass(i, included));
1567 CHECK_EQ(!in_overlay, InClass(i, excluded));
1568 } else {
1569 CHECK(!InClass(i, included));
1570 CHECK(!InClass(i, excluded));
1571 }
1572 }
1573}
1574
1575
Leon Clarkee46be812010-01-19 14:06:41 +00001576TEST(CanonicalizeCharacterSets) {
1577 ZoneScope scope(DELETE_ON_EXIT);
1578 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4);
1579 CharacterSet set(list);
1580
1581 list->Add(CharacterRange(10, 20));
1582 list->Add(CharacterRange(30, 40));
1583 list->Add(CharacterRange(50, 60));
1584 set.Canonicalize();
1585 ASSERT_EQ(3, list->length());
1586 ASSERT_EQ(10, list->at(0).from());
1587 ASSERT_EQ(20, list->at(0).to());
1588 ASSERT_EQ(30, list->at(1).from());
1589 ASSERT_EQ(40, list->at(1).to());
1590 ASSERT_EQ(50, list->at(2).from());
1591 ASSERT_EQ(60, list->at(2).to());
1592
1593 list->Rewind(0);
1594 list->Add(CharacterRange(10, 20));
1595 list->Add(CharacterRange(50, 60));
1596 list->Add(CharacterRange(30, 40));
1597 set.Canonicalize();
1598 ASSERT_EQ(3, list->length());
1599 ASSERT_EQ(10, list->at(0).from());
1600 ASSERT_EQ(20, list->at(0).to());
1601 ASSERT_EQ(30, list->at(1).from());
1602 ASSERT_EQ(40, list->at(1).to());
1603 ASSERT_EQ(50, list->at(2).from());
1604 ASSERT_EQ(60, list->at(2).to());
1605
1606 list->Rewind(0);
1607 list->Add(CharacterRange(30, 40));
1608 list->Add(CharacterRange(10, 20));
1609 list->Add(CharacterRange(25, 25));
1610 list->Add(CharacterRange(100, 100));
1611 list->Add(CharacterRange(1, 1));
1612 set.Canonicalize();
1613 ASSERT_EQ(5, list->length());
1614 ASSERT_EQ(1, list->at(0).from());
1615 ASSERT_EQ(1, list->at(0).to());
1616 ASSERT_EQ(10, list->at(1).from());
1617 ASSERT_EQ(20, list->at(1).to());
1618 ASSERT_EQ(25, list->at(2).from());
1619 ASSERT_EQ(25, list->at(2).to());
1620 ASSERT_EQ(30, list->at(3).from());
1621 ASSERT_EQ(40, list->at(3).to());
1622 ASSERT_EQ(100, list->at(4).from());
1623 ASSERT_EQ(100, list->at(4).to());
1624
1625 list->Rewind(0);
1626 list->Add(CharacterRange(10, 19));
1627 list->Add(CharacterRange(21, 30));
1628 list->Add(CharacterRange(20, 20));
1629 set.Canonicalize();
1630 ASSERT_EQ(1, list->length());
1631 ASSERT_EQ(10, list->at(0).from());
1632 ASSERT_EQ(30, list->at(0).to());
1633}
1634
Leon Clarked91b9f72010-01-27 17:25:45 +00001635// Checks whether a character is in the set represented by a list of ranges.
1636static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) {
1637 for (int i = 0; i < set->length(); i++) {
1638 CharacterRange range = set->at(i);
1639 if (range.from() <= value && value <= range.to()) {
1640 return true;
1641 }
1642 }
1643 return false;
1644}
1645
1646TEST(CharacterRangeMerge) {
1647 ZoneScope zone_scope(DELETE_ON_EXIT);
1648 ZoneList<CharacterRange> l1(4);
1649 ZoneList<CharacterRange> l2(4);
1650 // Create all combinations of intersections of ranges, both singletons and
1651 // longer.
1652
1653 int offset = 0;
1654
1655 // The five kinds of singleton intersections:
1656 // X
1657 // Y - outside before
1658 // Y - outside touching start
1659 // Y - overlap
1660 // Y - outside touching end
1661 // Y - outside after
1662
1663 for (int i = 0; i < 5; i++) {
1664 l1.Add(CharacterRange::Singleton(offset + 2));
1665 l2.Add(CharacterRange::Singleton(offset + i));
1666 offset += 6;
1667 }
1668
1669 // The seven kinds of singleton/non-singleton intersections:
1670 // XXX
1671 // Y - outside before
1672 // Y - outside touching start
1673 // Y - inside touching start
1674 // Y - entirely inside
1675 // Y - inside touching end
1676 // Y - outside touching end
1677 // Y - disjoint after
1678
1679 for (int i = 0; i < 7; i++) {
1680 l1.Add(CharacterRange::Range(offset + 2, offset + 4));
1681 l2.Add(CharacterRange::Singleton(offset + i));
1682 offset += 8;
1683 }
1684
1685 // The eleven kinds of non-singleton intersections:
1686 //
1687 // XXXXXXXX
1688 // YYYY - outside before.
1689 // YYYY - outside touching start.
1690 // YYYY - overlapping start
1691 // YYYY - inside touching start
1692 // YYYY - entirely inside
1693 // YYYY - inside touching end
1694 // YYYY - overlapping end
1695 // YYYY - outside touching end
1696 // YYYY - outside after
1697 // YYYYYYYY - identical
1698 // YYYYYYYYYYYY - containing entirely.
1699
1700 for (int i = 0; i < 9; i++) {
1701 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); // Length 8.
1702 l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3));
1703 offset += 22;
1704 }
1705 l1.Add(CharacterRange::Range(offset + 6, offset + 15));
1706 l2.Add(CharacterRange::Range(offset + 6, offset + 15));
1707 offset += 22;
1708 l1.Add(CharacterRange::Range(offset + 6, offset + 15));
1709 l2.Add(CharacterRange::Range(offset + 4, offset + 17));
1710 offset += 22;
1711
1712 // Different kinds of multi-range overlap:
1713 // XXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXX
1714 // YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y
1715
1716 l1.Add(CharacterRange::Range(offset, offset + 21));
1717 l1.Add(CharacterRange::Range(offset + 31, offset + 52));
1718 for (int i = 0; i < 6; i++) {
1719 l2.Add(CharacterRange::Range(offset + 2, offset + 5));
1720 l2.Add(CharacterRange::Singleton(offset + 8));
1721 offset += 9;
1722 }
1723
1724 ASSERT(CharacterRange::IsCanonical(&l1));
1725 ASSERT(CharacterRange::IsCanonical(&l2));
1726
1727 ZoneList<CharacterRange> first_only(4);
1728 ZoneList<CharacterRange> second_only(4);
1729 ZoneList<CharacterRange> both(4);
1730
1731 // Merge one direction.
1732 CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both);
1733
1734 CHECK(CharacterRange::IsCanonical(&first_only));
1735 CHECK(CharacterRange::IsCanonical(&second_only));
1736 CHECK(CharacterRange::IsCanonical(&both));
1737
1738 for (uc16 i = 0; i < offset; i++) {
1739 bool in_first = CharacterInSet(&l1, i);
1740 bool in_second = CharacterInSet(&l2, i);
1741 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
1742 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
1743 CHECK((in_first && in_second) == CharacterInSet(&both, i));
1744 }
1745
1746 first_only.Clear();
1747 second_only.Clear();
1748 both.Clear();
1749
1750 // Merge other direction.
1751 CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both);
1752
1753 CHECK(CharacterRange::IsCanonical(&first_only));
1754 CHECK(CharacterRange::IsCanonical(&second_only));
1755 CHECK(CharacterRange::IsCanonical(&both));
1756
1757 for (uc16 i = 0; i < offset; i++) {
1758 bool in_first = CharacterInSet(&l1, i);
1759 bool in_second = CharacterInSet(&l2, i);
1760 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
1761 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
1762 CHECK((in_first && in_second) == CharacterInSet(&both, i));
1763 }
1764
1765 first_only.Clear();
1766 second_only.Clear();
1767 both.Clear();
1768
1769 // Merge but don't record all combinations.
1770 CharacterRange::Merge(&l1, &l2, NULL, NULL, &both);
1771
1772 CHECK(CharacterRange::IsCanonical(&both));
1773
1774 for (uc16 i = 0; i < offset; i++) {
1775 bool in_first = CharacterInSet(&l1, i);
1776 bool in_second = CharacterInSet(&l2, i);
1777 CHECK((in_first && in_second) == CharacterInSet(&both, i));
1778 }
1779
1780 // Merge into same set.
1781 ZoneList<CharacterRange> all(4);
1782 CharacterRange::Merge(&l1, &l2, &all, &all, &all);
1783
1784 CHECK(CharacterRange::IsCanonical(&all));
1785
1786 for (uc16 i = 0; i < offset; i++) {
1787 bool in_first = CharacterInSet(&l1, i);
1788 bool in_second = CharacterInSet(&l2, i);
1789 CHECK((in_first || in_second) == CharacterInSet(&all, i));
1790 }
1791}
Leon Clarkee46be812010-01-19 14:06:41 +00001792
1793
Steve Blocka7e24c12009-10-30 11:49:00 +00001794TEST(Graph) {
1795 V8::Initialize(NULL);
Leon Clarkee46be812010-01-19 14:06:41 +00001796 Execute("\\b\\w+\\b", false, true, true);
Steve Blocka7e24c12009-10-30 11:49:00 +00001797}