blob: ea748e4e55de21fbb6dc9c61725828f052be76fe [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// A simple interpreter for the Irregexp byte code.
6
7#include "src/regexp/interpreter-irregexp.h"
8
9#include "src/ast/ast.h"
10#include "src/regexp/bytecodes-irregexp.h"
11#include "src/regexp/jsregexp.h"
12#include "src/regexp/regexp-macro-assembler.h"
13#include "src/unicode.h"
14#include "src/utils.h"
15
16namespace v8 {
17namespace internal {
18
19
20typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
21
22static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
23 int from,
24 int current,
25 int len,
26 Vector<const uc16> subject) {
27 for (int i = 0; i < len; i++) {
28 unibrow::uchar old_char = subject[from++];
29 unibrow::uchar new_char = subject[current++];
30 if (old_char == new_char) continue;
31 unibrow::uchar old_string[1] = { old_char };
32 unibrow::uchar new_string[1] = { new_char };
33 interp_canonicalize->get(old_char, '\0', old_string);
34 interp_canonicalize->get(new_char, '\0', new_string);
35 if (old_string[0] != new_string[0]) {
36 return false;
37 }
38 }
39 return true;
40}
41
42
43static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
44 int from,
45 int current,
46 int len,
47 Vector<const uint8_t> subject) {
48 for (int i = 0; i < len; i++) {
49 unsigned int old_char = subject[from++];
50 unsigned int new_char = subject[current++];
51 if (old_char == new_char) continue;
52 // Convert both characters to lower case.
53 old_char |= 0x20;
54 new_char |= 0x20;
55 if (old_char != new_char) return false;
56 // Not letters in the ASCII range and Latin-1 range.
57 if (!(old_char - 'a' <= 'z' - 'a') &&
58 !(old_char - 224 <= 254 - 224 && old_char != 247)) {
59 return false;
60 }
61 }
62 return true;
63}
64
65
66#ifdef DEBUG
67static void TraceInterpreter(const byte* code_base,
68 const byte* pc,
69 int stack_depth,
70 int current_position,
71 uint32_t current_char,
72 int bytecode_length,
73 const char* bytecode_name) {
74 if (FLAG_trace_regexp_bytecodes) {
75 bool printable = (current_char < 127 && current_char >= 32);
76 const char* format =
77 printable ?
78 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
79 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
80 PrintF(format,
81 pc - code_base,
82 stack_depth,
83 current_position,
84 current_char,
85 printable ? current_char : '.',
86 bytecode_name);
87 for (int i = 0; i < bytecode_length; i++) {
88 printf(", %02x", pc[i]);
89 }
90 printf(" ");
91 for (int i = 1; i < bytecode_length; i++) {
92 unsigned char b = pc[i];
93 if (b < 127 && b >= 32) {
94 printf("%c", b);
95 } else {
96 printf(".");
97 }
98 }
99 printf("\n");
100 }
101}
102
103
104#define BYTECODE(name) \
105 case BC_##name: \
106 TraceInterpreter(code_base, \
107 pc, \
108 static_cast<int>(backtrack_sp - backtrack_stack_base), \
109 current, \
110 current_char, \
111 BC_##name##_LENGTH, \
112 #name);
113#else
114#define BYTECODE(name) \
115 case BC_##name:
116#endif
117
118
119static int32_t Load32Aligned(const byte* pc) {
120 DCHECK((reinterpret_cast<intptr_t>(pc) & 3) == 0);
121 return *reinterpret_cast<const int32_t *>(pc);
122}
123
124
125static int32_t Load16Aligned(const byte* pc) {
126 DCHECK((reinterpret_cast<intptr_t>(pc) & 1) == 0);
127 return *reinterpret_cast<const uint16_t *>(pc);
128}
129
130
131// A simple abstraction over the backtracking stack used by the interpreter.
132// This backtracking stack does not grow automatically, but it ensures that the
133// the memory held by the stack is released or remembered in a cache if the
134// matching terminates.
135class BacktrackStack {
136 public:
137 BacktrackStack() { data_ = NewArray<int>(kBacktrackStackSize); }
138
139 ~BacktrackStack() {
140 DeleteArray(data_);
141 }
142
143 int* data() const { return data_; }
144
145 int max_size() const { return kBacktrackStackSize; }
146
147 private:
148 static const int kBacktrackStackSize = 10000;
149
150 int* data_;
151
152 DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
153};
154
155
156template <typename Char>
157static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
158 const byte* code_base,
159 Vector<const Char> subject,
160 int* registers,
161 int current,
162 uint32_t current_char) {
163 const byte* pc = code_base;
164 // BacktrackStack ensures that the memory allocated for the backtracking stack
165 // is returned to the system or cached if there is no stack being cached at
166 // the moment.
167 BacktrackStack backtrack_stack;
168 int* backtrack_stack_base = backtrack_stack.data();
169 int* backtrack_sp = backtrack_stack_base;
170 int backtrack_stack_space = backtrack_stack.max_size();
171#ifdef DEBUG
172 if (FLAG_trace_regexp_bytecodes) {
173 PrintF("\n\nStart bytecode interpreter\n\n");
174 }
175#endif
176 while (true) {
177 int32_t insn = Load32Aligned(pc);
178 switch (insn & BYTECODE_MASK) {
179 BYTECODE(BREAK)
180 UNREACHABLE();
181 return RegExpImpl::RE_FAILURE;
182 BYTECODE(PUSH_CP)
183 if (--backtrack_stack_space < 0) {
184 return RegExpImpl::RE_EXCEPTION;
185 }
186 *backtrack_sp++ = current;
187 pc += BC_PUSH_CP_LENGTH;
188 break;
189 BYTECODE(PUSH_BT)
190 if (--backtrack_stack_space < 0) {
191 return RegExpImpl::RE_EXCEPTION;
192 }
193 *backtrack_sp++ = Load32Aligned(pc + 4);
194 pc += BC_PUSH_BT_LENGTH;
195 break;
196 BYTECODE(PUSH_REGISTER)
197 if (--backtrack_stack_space < 0) {
198 return RegExpImpl::RE_EXCEPTION;
199 }
200 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
201 pc += BC_PUSH_REGISTER_LENGTH;
202 break;
203 BYTECODE(SET_REGISTER)
204 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
205 pc += BC_SET_REGISTER_LENGTH;
206 break;
207 BYTECODE(ADVANCE_REGISTER)
208 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
209 pc += BC_ADVANCE_REGISTER_LENGTH;
210 break;
211 BYTECODE(SET_REGISTER_TO_CP)
212 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
213 pc += BC_SET_REGISTER_TO_CP_LENGTH;
214 break;
215 BYTECODE(SET_CP_TO_REGISTER)
216 current = registers[insn >> BYTECODE_SHIFT];
217 pc += BC_SET_CP_TO_REGISTER_LENGTH;
218 break;
219 BYTECODE(SET_REGISTER_TO_SP)
220 registers[insn >> BYTECODE_SHIFT] =
221 static_cast<int>(backtrack_sp - backtrack_stack_base);
222 pc += BC_SET_REGISTER_TO_SP_LENGTH;
223 break;
224 BYTECODE(SET_SP_TO_REGISTER)
225 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
226 backtrack_stack_space = backtrack_stack.max_size() -
227 static_cast<int>(backtrack_sp - backtrack_stack_base);
228 pc += BC_SET_SP_TO_REGISTER_LENGTH;
229 break;
230 BYTECODE(POP_CP)
231 backtrack_stack_space++;
232 --backtrack_sp;
233 current = *backtrack_sp;
234 pc += BC_POP_CP_LENGTH;
235 break;
236 BYTECODE(POP_BT)
237 backtrack_stack_space++;
238 --backtrack_sp;
239 pc = code_base + *backtrack_sp;
240 break;
241 BYTECODE(POP_REGISTER)
242 backtrack_stack_space++;
243 --backtrack_sp;
244 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
245 pc += BC_POP_REGISTER_LENGTH;
246 break;
247 BYTECODE(FAIL)
248 return RegExpImpl::RE_FAILURE;
249 BYTECODE(SUCCEED)
250 return RegExpImpl::RE_SUCCESS;
251 BYTECODE(ADVANCE_CP)
252 current += insn >> BYTECODE_SHIFT;
253 pc += BC_ADVANCE_CP_LENGTH;
254 break;
255 BYTECODE(GOTO)
256 pc = code_base + Load32Aligned(pc + 4);
257 break;
258 BYTECODE(ADVANCE_CP_AND_GOTO)
259 current += insn >> BYTECODE_SHIFT;
260 pc = code_base + Load32Aligned(pc + 4);
261 break;
262 BYTECODE(CHECK_GREEDY)
263 if (current == backtrack_sp[-1]) {
264 backtrack_sp--;
265 backtrack_stack_space++;
266 pc = code_base + Load32Aligned(pc + 4);
267 } else {
268 pc += BC_CHECK_GREEDY_LENGTH;
269 }
270 break;
271 BYTECODE(LOAD_CURRENT_CHAR) {
272 int pos = current + (insn >> BYTECODE_SHIFT);
273 if (pos >= subject.length() || pos < 0) {
274 pc = code_base + Load32Aligned(pc + 4);
275 } else {
276 current_char = subject[pos];
277 pc += BC_LOAD_CURRENT_CHAR_LENGTH;
278 }
279 break;
280 }
281 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
282 int pos = current + (insn >> BYTECODE_SHIFT);
283 current_char = subject[pos];
284 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
285 break;
286 }
287 BYTECODE(LOAD_2_CURRENT_CHARS) {
288 int pos = current + (insn >> BYTECODE_SHIFT);
289 if (pos + 2 > subject.length() || pos < 0) {
290 pc = code_base + Load32Aligned(pc + 4);
291 } else {
292 Char next = subject[pos + 1];
293 current_char =
294 (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
295 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
296 }
297 break;
298 }
299 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
300 int pos = current + (insn >> BYTECODE_SHIFT);
301 Char next = subject[pos + 1];
302 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
303 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
304 break;
305 }
306 BYTECODE(LOAD_4_CURRENT_CHARS) {
307 DCHECK(sizeof(Char) == 1);
308 int pos = current + (insn >> BYTECODE_SHIFT);
309 if (pos + 4 > subject.length() || pos < 0) {
310 pc = code_base + Load32Aligned(pc + 4);
311 } else {
312 Char next1 = subject[pos + 1];
313 Char next2 = subject[pos + 2];
314 Char next3 = subject[pos + 3];
315 current_char = (subject[pos] |
316 (next1 << 8) |
317 (next2 << 16) |
318 (next3 << 24));
319 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
320 }
321 break;
322 }
323 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
324 DCHECK(sizeof(Char) == 1);
325 int pos = current + (insn >> BYTECODE_SHIFT);
326 Char next1 = subject[pos + 1];
327 Char next2 = subject[pos + 2];
328 Char next3 = subject[pos + 3];
329 current_char = (subject[pos] |
330 (next1 << 8) |
331 (next2 << 16) |
332 (next3 << 24));
333 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
334 break;
335 }
336 BYTECODE(CHECK_4_CHARS) {
337 uint32_t c = Load32Aligned(pc + 4);
338 if (c == current_char) {
339 pc = code_base + Load32Aligned(pc + 8);
340 } else {
341 pc += BC_CHECK_4_CHARS_LENGTH;
342 }
343 break;
344 }
345 BYTECODE(CHECK_CHAR) {
346 uint32_t c = (insn >> BYTECODE_SHIFT);
347 if (c == current_char) {
348 pc = code_base + Load32Aligned(pc + 4);
349 } else {
350 pc += BC_CHECK_CHAR_LENGTH;
351 }
352 break;
353 }
354 BYTECODE(CHECK_NOT_4_CHARS) {
355 uint32_t c = Load32Aligned(pc + 4);
356 if (c != current_char) {
357 pc = code_base + Load32Aligned(pc + 8);
358 } else {
359 pc += BC_CHECK_NOT_4_CHARS_LENGTH;
360 }
361 break;
362 }
363 BYTECODE(CHECK_NOT_CHAR) {
364 uint32_t c = (insn >> BYTECODE_SHIFT);
365 if (c != current_char) {
366 pc = code_base + Load32Aligned(pc + 4);
367 } else {
368 pc += BC_CHECK_NOT_CHAR_LENGTH;
369 }
370 break;
371 }
372 BYTECODE(AND_CHECK_4_CHARS) {
373 uint32_t c = Load32Aligned(pc + 4);
374 if (c == (current_char & Load32Aligned(pc + 8))) {
375 pc = code_base + Load32Aligned(pc + 12);
376 } else {
377 pc += BC_AND_CHECK_4_CHARS_LENGTH;
378 }
379 break;
380 }
381 BYTECODE(AND_CHECK_CHAR) {
382 uint32_t c = (insn >> BYTECODE_SHIFT);
383 if (c == (current_char & Load32Aligned(pc + 4))) {
384 pc = code_base + Load32Aligned(pc + 8);
385 } else {
386 pc += BC_AND_CHECK_CHAR_LENGTH;
387 }
388 break;
389 }
390 BYTECODE(AND_CHECK_NOT_4_CHARS) {
391 uint32_t c = Load32Aligned(pc + 4);
392 if (c != (current_char & Load32Aligned(pc + 8))) {
393 pc = code_base + Load32Aligned(pc + 12);
394 } else {
395 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
396 }
397 break;
398 }
399 BYTECODE(AND_CHECK_NOT_CHAR) {
400 uint32_t c = (insn >> BYTECODE_SHIFT);
401 if (c != (current_char & Load32Aligned(pc + 4))) {
402 pc = code_base + Load32Aligned(pc + 8);
403 } else {
404 pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
405 }
406 break;
407 }
408 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
409 uint32_t c = (insn >> BYTECODE_SHIFT);
410 uint32_t minus = Load16Aligned(pc + 4);
411 uint32_t mask = Load16Aligned(pc + 6);
412 if (c != ((current_char - minus) & mask)) {
413 pc = code_base + Load32Aligned(pc + 8);
414 } else {
415 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
416 }
417 break;
418 }
419 BYTECODE(CHECK_CHAR_IN_RANGE) {
420 uint32_t from = Load16Aligned(pc + 4);
421 uint32_t to = Load16Aligned(pc + 6);
422 if (from <= current_char && current_char <= to) {
423 pc = code_base + Load32Aligned(pc + 8);
424 } else {
425 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
426 }
427 break;
428 }
429 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
430 uint32_t from = Load16Aligned(pc + 4);
431 uint32_t to = Load16Aligned(pc + 6);
432 if (from > current_char || current_char > to) {
433 pc = code_base + Load32Aligned(pc + 8);
434 } else {
435 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
436 }
437 break;
438 }
439 BYTECODE(CHECK_BIT_IN_TABLE) {
440 int mask = RegExpMacroAssembler::kTableMask;
441 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
442 int bit = (current_char & (kBitsPerByte - 1));
443 if ((b & (1 << bit)) != 0) {
444 pc = code_base + Load32Aligned(pc + 4);
445 } else {
446 pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
447 }
448 break;
449 }
450 BYTECODE(CHECK_LT) {
451 uint32_t limit = (insn >> BYTECODE_SHIFT);
452 if (current_char < limit) {
453 pc = code_base + Load32Aligned(pc + 4);
454 } else {
455 pc += BC_CHECK_LT_LENGTH;
456 }
457 break;
458 }
459 BYTECODE(CHECK_GT) {
460 uint32_t limit = (insn >> BYTECODE_SHIFT);
461 if (current_char > limit) {
462 pc = code_base + Load32Aligned(pc + 4);
463 } else {
464 pc += BC_CHECK_GT_LENGTH;
465 }
466 break;
467 }
468 BYTECODE(CHECK_REGISTER_LT)
469 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
470 pc = code_base + Load32Aligned(pc + 8);
471 } else {
472 pc += BC_CHECK_REGISTER_LT_LENGTH;
473 }
474 break;
475 BYTECODE(CHECK_REGISTER_GE)
476 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
477 pc = code_base + Load32Aligned(pc + 8);
478 } else {
479 pc += BC_CHECK_REGISTER_GE_LENGTH;
480 }
481 break;
482 BYTECODE(CHECK_REGISTER_EQ_POS)
483 if (registers[insn >> BYTECODE_SHIFT] == current) {
484 pc = code_base + Load32Aligned(pc + 4);
485 } else {
486 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
487 }
488 break;
489 BYTECODE(CHECK_NOT_REGS_EQUAL)
490 if (registers[insn >> BYTECODE_SHIFT] ==
491 registers[Load32Aligned(pc + 4)]) {
492 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
493 } else {
494 pc = code_base + Load32Aligned(pc + 8);
495 }
496 break;
497 BYTECODE(CHECK_NOT_BACK_REF) {
498 int from = registers[insn >> BYTECODE_SHIFT];
499 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
500 if (from >= 0 && len > 0) {
501 if (current + len > subject.length() ||
502 CompareChars(&subject[from], &subject[current], len) != 0) {
503 pc = code_base + Load32Aligned(pc + 4);
504 break;
505 }
506 current += len;
507 }
508 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
509 break;
510 }
511 BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) {
512 int from = registers[insn >> BYTECODE_SHIFT];
513 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
514 if (from >= 0 && len > 0) {
515 if (current - len < 0 ||
516 CompareChars(&subject[from], &subject[current - len], len) != 0) {
517 pc = code_base + Load32Aligned(pc + 4);
518 break;
519 }
520 current -= len;
521 }
522 pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
523 break;
524 }
525 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
526 int from = registers[insn >> BYTECODE_SHIFT];
527 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
528 if (from >= 0 && len > 0) {
529 if (current + len > subject.length() ||
530 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
531 from, current, len, subject)) {
532 pc = code_base + Load32Aligned(pc + 4);
533 break;
534 }
535 current += len;
536 }
537 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
538 break;
539 }
540 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
541 int from = registers[insn >> BYTECODE_SHIFT];
542 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
543 if (from >= 0 && len > 0) {
544 if (current - len < 0 ||
545 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
546 from, current - len, len, subject)) {
547 pc = code_base + Load32Aligned(pc + 4);
548 break;
549 }
550 current -= len;
551 }
552 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;
553 break;
554 }
555 BYTECODE(CHECK_AT_START)
556 if (current == 0) {
557 pc = code_base + Load32Aligned(pc + 4);
558 } else {
559 pc += BC_CHECK_AT_START_LENGTH;
560 }
561 break;
562 BYTECODE(CHECK_NOT_AT_START)
563 if (current + (insn >> BYTECODE_SHIFT) == 0) {
564 pc += BC_CHECK_NOT_AT_START_LENGTH;
565 } else {
566 pc = code_base + Load32Aligned(pc + 4);
567 }
568 break;
569 BYTECODE(SET_CURRENT_POSITION_FROM_END) {
570 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
571 if (subject.length() - current > by) {
572 current = subject.length() - by;
573 current_char = subject[current - 1];
574 }
575 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
576 break;
577 }
578 default:
579 UNREACHABLE();
580 break;
581 }
582 }
583}
584
585
586RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
587 Isolate* isolate,
588 Handle<ByteArray> code_array,
589 Handle<String> subject,
590 int* registers,
591 int start_position) {
592 DCHECK(subject->IsFlat());
593
594 DisallowHeapAllocation no_gc;
595 const byte* code_base = code_array->GetDataStartAddress();
596 uc16 previous_char = '\n';
597 String::FlatContent subject_content = subject->GetFlatContent();
598 if (subject_content.IsOneByte()) {
599 Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
600 if (start_position != 0) previous_char = subject_vector[start_position - 1];
601 return RawMatch(isolate,
602 code_base,
603 subject_vector,
604 registers,
605 start_position,
606 previous_char);
607 } else {
608 DCHECK(subject_content.IsTwoByte());
609 Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
610 if (start_position != 0) previous_char = subject_vector[start_position - 1];
611 return RawMatch(isolate,
612 code_base,
613 subject_vector,
614 registers,
615 start_position,
616 previous_char);
617 }
618}
619
620} // namespace internal
621} // namespace v8