Revert "Revert "Upgrade to 5.0.71.48"" DO NOT MERGE
This reverts commit f2e3994fa5148cc3d9946666f0b0596290192b0e,
and updates the x64 makefile properly so it doesn't break that
build.
FPIIM-449
Change-Id: Ib83e35bfbae6af627451c926a9650ec57c045605
(cherry picked from commit 109988c7ccb6f3fd1a58574fa3dfb88beaef6632)
diff --git a/src/regexp/arm/regexp-macro-assembler-arm.cc b/src/regexp/arm/regexp-macro-assembler-arm.cc
index 6fafdfb..ce72188 100644
--- a/src/regexp/arm/regexp-macro-assembler-arm.cc
+++ b/src/regexp/arm/regexp-macro-assembler-arm.cc
@@ -210,7 +210,7 @@
void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ ldr(r0, register_location(start_reg)); // Index of start of capture
__ ldr(r1, register_location(start_reg + 1)); // Index of end of capture
@@ -302,7 +302,7 @@
// r0: Address byte_offset1 - Address captured substring's start.
// r1: Address byte_offset2 - Address of current character position.
// r2: size_t byte_length - length of capture in bytes(!)
- // r3: Isolate* isolate
+ // r3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ add(r0, r0, Operand(end_of_input_address()));
@@ -316,7 +316,14 @@
__ sub(r1, r1, r4);
}
// Isolate.
- __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ mov(r3, Operand(0));
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
+ }
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -798,9 +805,12 @@
__ cmp(current_input_offset(), Operand::Zero());
__ b(eq, &exit_label_);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
__ add(current_input_offset(),
current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1));
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ b(&load_char_start_regexp);
diff --git a/src/regexp/arm/regexp-macro-assembler-arm.h b/src/regexp/arm/regexp-macro-assembler-arm.h
index 233a98f..f808538 100644
--- a/src/regexp/arm/regexp-macro-assembler-arm.h
+++ b/src/regexp/arm/regexp-macro-assembler-arm.h
@@ -38,7 +38,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
diff --git a/src/regexp/arm64/regexp-macro-assembler-arm64.cc b/src/regexp/arm64/regexp-macro-assembler-arm64.cc
index 9948597..941ccea 100644
--- a/src/regexp/arm64/regexp-macro-assembler-arm64.cc
+++ b/src/regexp/arm64/regexp-macro-assembler-arm64.cc
@@ -274,7 +274,7 @@
void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
Register capture_start_offset = w10;
@@ -388,7 +388,7 @@
// x0: Address byte_offset1 - Address captured substring's start.
// x1: Address byte_offset2 - Address of current character position.
// w2: size_t byte_length - length of capture in bytes(!)
- // x3: Isolate* isolate
+ // x3: Isolate* isolate or 0 if unicode flag
// Address of start of capture.
__ Add(x0, input_end(), Operand(capture_start_offset, SXTW));
@@ -400,7 +400,14 @@
__ Sub(x1, x1, Operand(capture_length, SXTW));
}
// Isolate.
- __ Mov(x3, ExternalReference::isolate_address(isolate()));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ Mov(x3, Operand(0));
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ Mov(x3, ExternalReference::isolate_address(isolate()));
+ }
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -991,9 +998,12 @@
// Offset from the end is zero if we already reached the end.
__ Cbz(current_input_offset(), &return_w0);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
__ Add(current_input_offset(),
current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1));
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ B(&load_char_start_regexp);
diff --git a/src/regexp/arm64/regexp-macro-assembler-arm64.h b/src/regexp/arm64/regexp-macro-assembler-arm64.h
index d71f063..69624f6 100644
--- a/src/regexp/arm64/regexp-macro-assembler-arm64.h
+++ b/src/regexp/arm64/regexp-macro-assembler-arm64.h
@@ -43,7 +43,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
diff --git a/src/regexp/bytecodes-irregexp.h b/src/regexp/bytecodes-irregexp.h
index 2dbfbc0..3848f15 100644
--- a/src/regexp/bytecodes-irregexp.h
+++ b/src/regexp/bytecodes-irregexp.h
@@ -6,6 +6,8 @@
#ifndef V8_REGEXP_BYTECODES_IRREGEXP_H_
#define V8_REGEXP_BYTECODES_IRREGEXP_H_
+#ifdef V8_INTERPRETED_REGEXP
+
namespace v8 {
namespace internal {
@@ -18,56 +20,58 @@
const int BYTECODE_SHIFT = 8;
#define BYTECODE_ITERATOR(V) \
-V(BREAK, 0, 4) /* bc8 */ \
-V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
-V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
-V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
-V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
-V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
-V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
-V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
-V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
-V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
-V(POP_CP, 10, 4) /* bc8 pad24 */ \
-V(POP_BT, 11, 4) /* bc8 pad24 */ \
-V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
-V(FAIL, 13, 4) /* bc8 pad24 */ \
-V(SUCCEED, 14, 4) /* bc8 pad24 */ \
-V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
-V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
-V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
-V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
-V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
-V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
-V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
-V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
-V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
-V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
-V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
-V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
-V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
-V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
-V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
-V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
-V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
-V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
-V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
-V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
-V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
-V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
-V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
-V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
-V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \
-V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
-V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
-V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \
-V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \
-V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \
-V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \
-V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \
-V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \
-V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \
-V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */
+ V(BREAK, 0, 4) /* bc8 */ \
+ V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
+ V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
+ V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
+ V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
+ V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
+ V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
+ V(POP_CP, 10, 4) /* bc8 pad24 */ \
+ V(POP_BT, 11, 4) /* bc8 pad24 */ \
+ V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
+ V(FAIL, 13, 4) /* bc8 pad24 */ \
+ V(SUCCEED, 14, 4) /* bc8 pad24 */ \
+ V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
+ V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
+ V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
+ V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
+ V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
+ V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
+ V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
+ V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
+ V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
+ V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
+ V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
+ V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
+ V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
+ V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
+ V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
+ V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
+ V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
+ V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
+ V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
+ V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
+ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
+ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
+ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
+ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
+ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
+ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
+ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
+ V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
+ V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
+ V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
+ V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
+ V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */
#define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code;
@@ -82,4 +86,6 @@
} // namespace internal
} // namespace v8
+#endif // V8_INTERPRETED_REGEXP
+
#endif // V8_REGEXP_BYTECODES_IRREGEXP_H_
diff --git a/src/regexp/ia32/regexp-macro-assembler-ia32.cc b/src/regexp/ia32/regexp-macro-assembler-ia32.cc
index 6ef0f5f..4c22b43 100644
--- a/src/regexp/ia32/regexp-macro-assembler-ia32.cc
+++ b/src/regexp/ia32/regexp-macro-assembler-ia32.cc
@@ -189,7 +189,7 @@
void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ mov(edx, register_location(start_reg)); // Index of start of capture
__ mov(ebx, register_location(start_reg + 1)); // Index of end of capture
@@ -296,11 +296,18 @@
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
- // Isolate* isolate
+// Isolate* isolate or 0 if unicode flag.
// Set isolate.
- __ mov(Operand(esp, 3 * kPointerSize),
- Immediate(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ mov(Operand(esp, 3 * kPointerSize),
+ Immediate(ExternalReference::isolate_address(isolate())));
+ }
// Set byte_length.
__ mov(Operand(esp, 2 * kPointerSize), ebx);
// Set byte_offset2.
@@ -822,13 +829,15 @@
__ test(edi, edi);
__ j(zero, &exit_label_, Label::kNear);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
if (mode_ == UC16) {
__ add(edi, Immediate(2));
} else {
__ inc(edi);
}
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
-
__ jmp(&load_char_start_regexp);
} else {
__ mov(eax, Immediate(SUCCESS));
diff --git a/src/regexp/ia32/regexp-macro-assembler-ia32.h b/src/regexp/ia32/regexp-macro-assembler-ia32.h
index 1ef87ee..fa17413 100644
--- a/src/regexp/ia32/regexp-macro-assembler-ia32.h
+++ b/src/regexp/ia32/regexp-macro-assembler-ia32.h
@@ -37,7 +37,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/src/regexp/interpreter-irregexp.cc b/src/regexp/interpreter-irregexp.cc
index ea748e4..14834d5 100644
--- a/src/regexp/interpreter-irregexp.cc
+++ b/src/regexp/interpreter-irregexp.cc
@@ -4,6 +4,8 @@
// A simple interpreter for the Irregexp byte code.
+#ifdef V8_INTERPRETED_REGEXP
+
#include "src/regexp/interpreter-irregexp.h"
#include "src/ast/ast.h"
@@ -13,38 +15,32 @@
#include "src/unicode.h"
#include "src/utils.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uchar.h"
+#endif // V8_I18N_SUPPORT
+
namespace v8 {
namespace internal {
-
typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
-static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
- int from,
- int current,
- int len,
- Vector<const uc16> subject) {
- for (int i = 0; i < len; i++) {
- unibrow::uchar old_char = subject[from++];
- unibrow::uchar new_char = subject[current++];
- if (old_char == new_char) continue;
- unibrow::uchar old_string[1] = { old_char };
- unibrow::uchar new_string[1] = { new_char };
- interp_canonicalize->get(old_char, '\0', old_string);
- interp_canonicalize->get(new_char, '\0', new_string);
- if (old_string[0] != new_string[0]) {
- return false;
- }
- }
- return true;
+static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
+ int len, Vector<const uc16> subject,
+ bool unicode) {
+ Address offset_a =
+ reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
+ Address offset_b =
+ reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
+ size_t length = len * kUC16Size;
+ return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
+ offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
}
-static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
- int from,
- int current,
- int len,
- Vector<const uint8_t> subject) {
+static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
+ int len, Vector<const uint8_t> subject,
+ bool unicode) {
+ // For Latin1 characters the unicode flag makes no difference.
for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++];
unsigned int new_char = subject[current++];
@@ -522,13 +518,16 @@
pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
break;
}
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
+ bool unicode =
+ (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current + len > subject.length() ||
- !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
- from, current, len, subject)) {
+ !BackRefMatchesNoCase(isolate, from, current, len, subject,
+ unicode)) {
pc = code_base + Load32Aligned(pc + 4);
break;
}
@@ -537,13 +536,16 @@
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
break;
}
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
+ bool unicode = (insn & BYTECODE_MASK) ==
+ BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current - len < 0 ||
- !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
- from, current - len, len, subject)) {
+ !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
+ unicode)) {
pc = code_base + Load32Aligned(pc + 4);
break;
}
@@ -619,3 +621,5 @@
} // namespace internal
} // namespace v8
+
+#endif // V8_INTERPRETED_REGEXP
diff --git a/src/regexp/interpreter-irregexp.h b/src/regexp/interpreter-irregexp.h
index 244af99..887fab6 100644
--- a/src/regexp/interpreter-irregexp.h
+++ b/src/regexp/interpreter-irregexp.h
@@ -7,12 +7,13 @@
#ifndef V8_REGEXP_INTERPRETER_IRREGEXP_H_
#define V8_REGEXP_INTERPRETER_IRREGEXP_H_
+#ifdef V8_INTERPRETED_REGEXP
+
#include "src/regexp/jsregexp.h"
namespace v8 {
namespace internal {
-
class IrregexpInterpreter {
public:
static RegExpImpl::IrregexpResult Match(Isolate* isolate,
@@ -26,4 +27,6 @@
} // namespace internal
} // namespace v8
+#endif // V8_INTERPRETED_REGEXP
+
#endif // V8_REGEXP_INTERPRETER_IRREGEXP_H_
diff --git a/src/regexp/jsregexp-inl.h b/src/regexp/jsregexp-inl.h
index 3eb7c3c..ca7a9fe 100644
--- a/src/regexp/jsregexp-inl.h
+++ b/src/regexp/jsregexp-inl.h
@@ -47,7 +47,10 @@
register_array_size_);
} else {
int last_start_index = last_match[0];
- if (last_start_index == last_end_index) last_end_index++;
+ if (last_start_index == last_end_index) {
+ // Zero-length match. Advance by one code point.
+ last_end_index = AdvanceZeroLength(last_end_index);
+ }
if (last_end_index > subject_->length()) {
num_matches_ = 0; // Signal failed match.
return NULL;
diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc
index 34d20fe..80f48ca 100644
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@@ -25,6 +25,11 @@
#include "src/string-search.h"
#include "src/unicode-decoder.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#include "unicode/utypes.h"
+#endif // V8_I18N_SUPPORT
+
#ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32
#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
@@ -72,7 +77,7 @@
int ranges_length,
Interval new_range) {
DCHECK((ranges_length & 1) == 1);
- DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
+ DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
if (containment == kLatticeUnknown) return containment;
bool inside = false;
int last = 0;
@@ -145,9 +150,8 @@
PostponeInterruptsScope postpone(isolate);
RegExpCompileData parse_result;
FlatStringReader reader(isolate, pattern);
- if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,
- flags & JSRegExp::kMultiline,
- flags & JSRegExp::kUnicode, &parse_result)) {
+ if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
+ &parse_result)) {
// Throw an exception if we fail to parse the pattern.
return ThrowRegExpException(re, pattern, parse_result.error);
}
@@ -371,18 +375,16 @@
pattern = String::Flatten(pattern);
RegExpCompileData compile_data;
FlatStringReader reader(isolate, pattern);
- if (!RegExpParser::ParseRegExp(isolate, &zone, &reader,
- flags & JSRegExp::kMultiline,
- flags & JSRegExp::kUnicode, &compile_data)) {
+ if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
+ &compile_data)) {
// Throw an exception if we fail to parse the pattern.
// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
USE(ThrowRegExpException(re, pattern, compile_data.error));
return false;
}
- RegExpEngine::CompilationResult result = RegExpEngine::Compile(
- isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase,
- flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline,
- flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte);
+ RegExpEngine::CompilationResult result =
+ RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
+ sample_subject, is_one_byte);
if (result.error_message != NULL) {
// Unable to compile regexp.
Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
@@ -636,7 +638,6 @@
RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject,
- bool is_global,
Isolate* isolate)
: register_array_(NULL),
register_array_size_(0),
@@ -661,7 +662,8 @@
}
}
- if (is_global && !interpreted) {
+ DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
+ if (!interpreted) {
register_array_size_ =
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
max_matches_ = register_array_size_ / registers_per_match_;
@@ -690,6 +692,16 @@
last_match[1] = 0;
}
+int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
+ if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
+ last_index + 1 < subject_->length() &&
+ unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
+ unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
+ // Advance over the surrogate pair.
+ return last_index + 2;
+ }
+ return last_index + 1;
+}
// -------------------------------------------------------------------
// Implementation of the Irregexp regular expression engine.
@@ -945,7 +957,7 @@
class RegExpCompiler {
public:
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
- bool ignore_case, bool is_one_byte);
+ JSRegExp::Flags flags, bool is_one_byte);
int AllocateRegister() {
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
@@ -955,6 +967,22 @@
return next_register_++;
}
+ // Lookarounds to match lone surrogates for unicode character class matches
+ // are never nested. We can therefore reuse registers.
+ int UnicodeLookaroundStackRegister() {
+ if (unicode_lookaround_stack_register_ == kNoRegister) {
+ unicode_lookaround_stack_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_stack_register_;
+ }
+
+ int UnicodeLookaroundPositionRegister() {
+ if (unicode_lookaround_position_register_ == kNoRegister) {
+ unicode_lookaround_position_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_position_register_;
+ }
+
RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
RegExpNode* start,
int capture_count,
@@ -981,7 +1009,8 @@
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
- inline bool ignore_case() { return ignore_case_; }
+ inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+ inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
inline bool one_byte() { return one_byte_; }
inline bool optimize() { return optimize_; }
inline void set_optimize(bool value) { optimize_ = value; }
@@ -1006,10 +1035,12 @@
private:
EndNode* accept_;
int next_register_;
+ int unicode_lookaround_stack_register_;
+ int unicode_lookaround_position_register_;
List<RegExpNode*>* work_list_;
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
- bool ignore_case_;
+ JSRegExp::Flags flags_;
bool one_byte_;
bool reg_exp_too_big_;
bool limiting_recursion_;
@@ -1041,11 +1072,13 @@
// Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
- bool ignore_case, bool one_byte)
+ JSRegExp::Flags flags, bool one_byte)
: next_register_(2 * (capture_count + 1)),
+ unicode_lookaround_stack_register_(kNoRegister),
+ unicode_lookaround_position_register_(kNoRegister),
work_list_(NULL),
recursion_depth_(0),
- ignore_case_(ignore_case),
+ flags_(flags),
one_byte_(one_byte),
reg_exp_too_big_(false),
limiting_recursion_(false),
@@ -1941,15 +1974,13 @@
// know that the character is in the range of min_char to max_char inclusive.
// Either label can be NULL indicating backtracking. Either label can also be
// equal to the fall_through label.
-static void GenerateBranches(RegExpMacroAssembler* masm,
- ZoneList<int>* ranges,
- int start_index,
- int end_index,
- uc16 min_char,
- uc16 max_char,
- Label* fall_through,
- Label* even_label,
- Label* odd_label) {
+static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
+ int start_index, int end_index, uc32 min_char,
+ uc32 max_char, Label* fall_through,
+ Label* even_label, Label* odd_label) {
+ DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
+ DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
+
int first = ranges->at(start_index);
int last = ranges->at(end_index) - 1;
@@ -2098,9 +2129,7 @@
Label* on_failure, int cp_offset, bool check_offset,
bool preloaded, Zone* zone) {
ZoneList<CharacterRange>* ranges = cc->ranges(zone);
- if (!CharacterRange::IsCanonical(ranges)) {
- CharacterRange::Canonicalize(ranges);
- }
+ CharacterRange::Canonicalize(ranges);
int max_char;
if (one_byte) {
@@ -2142,23 +2171,14 @@
}
return;
}
- if (last_valid_range == 0 &&
- !cc->is_negated() &&
- ranges->at(0).IsEverything(max_char)) {
- // This is a common case hit by non-anchored expressions.
- if (check_offset) {
- macro_assembler->CheckPosition(cp_offset, on_failure);
- }
- return;
- }
if (!preloaded) {
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
}
if (cc->is_standard(zone) &&
- macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
- on_failure)) {
+ macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
+ on_failure)) {
return;
}
@@ -2470,12 +2490,14 @@
} else {
// For 2-character preloads in one-byte mode or 1-character preloads in
// two-byte mode we also use a 16 bit load with zero extend.
+ static const uint32_t kTwoByteMask = 0xffff;
+ static const uint32_t kFourByteMask = 0xffffffff;
if (details->characters() == 2 && compiler->one_byte()) {
- if ((mask & 0xffff) == 0xffff) need_mask = false;
+ if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
} else if (details->characters() == 1 && !compiler->one_byte()) {
- if ((mask & 0xffff) == 0xffff) need_mask = false;
+ if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
} else {
- if (mask == 0xffffffff) need_mask = false;
+ if (mask == kFourByteMask) need_mask = false;
}
}
@@ -2798,9 +2820,7 @@
DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
RegExpCharacterClass* cc = elm.char_class();
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
- if (!CharacterRange::IsCanonical(ranges)) {
- CharacterRange::Canonicalize(ranges);
- }
+ CharacterRange::Canonicalize(ranges);
// Now they are in order so we only need to look at the first.
int range_count = ranges->length();
if (cc->is_negated()) {
@@ -3289,6 +3309,36 @@
}
+TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool read_backward,
+ RegExpNode* on_success) {
+ DCHECK_NOT_NULL(ranges);
+ ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
+ elms->Add(
+ TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
+ zone);
+ return new (zone) TextNode(elms, read_backward, on_success);
+}
+
+
+TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
+ CharacterRange trail,
+ bool read_backward,
+ RegExpNode* on_success) {
+ ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
+ ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
+ ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
+ elms->Add(TextElement::CharClass(
+ new (zone) RegExpCharacterClass(lead_ranges, false)),
+ zone);
+ elms->Add(TextElement::CharClass(
+ new (zone) RegExpCharacterClass(trail_ranges, false)),
+ zone);
+ return new (zone) TextNode(elms, read_backward, on_success);
+}
+
+
// This generates the code to match a text node. A text node can contain
// straight character sequences (possibly to be matched in a case-independent
// way) and character classes. For efficiency we do not do this in a single
@@ -3385,10 +3435,7 @@
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
- int range_count = ranges->length();
- for (int j = 0; j < range_count; j++) {
- ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
- }
+ CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
}
}
}
@@ -3405,9 +3452,7 @@
if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
RegExpCharacterClass* node = elm.char_class();
ZoneList<CharacterRange>* ranges = node->ranges(zone());
- if (!CharacterRange::IsCanonical(ranges)) {
- CharacterRange::Canonicalize(ranges);
- }
+ CharacterRange::Canonicalize(ranges);
if (node->is_negated()) {
return ranges->length() == 0 ? on_success() : NULL;
}
@@ -3554,27 +3599,29 @@
};
+static const uc32 kRangeEndMarker = 0x110000;
+
// The '2' variant is has inclusive from and exclusive to.
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
- 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
- 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
- 0xFEFF, 0xFF00, 0x10000 };
+static const int kSpaceRanges[] = {
+ '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
+ 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
+ 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
static const int kSpaceRangeCount = arraysize(kSpaceRanges);
static const int kWordRanges[] = {
- '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
+ '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
static const int kWordRangeCount = arraysize(kWordRanges);
-static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
+static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
static const int kDigitRangeCount = arraysize(kDigitRanges);
-static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
+static const int kSurrogateRanges[] = {
+ kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
-static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
- 0x2028, 0x202A, 0x10000 };
+static const int kLineTerminatorRanges[] = {
+ 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
-
void BoyerMoorePositionInfo::Set(int character) {
SetInterval(Interval(character, character));
}
@@ -3916,6 +3963,11 @@
void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
int choice_count = alternatives_->length();
+ if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
+ alternatives_->at(0).node()->Emit(compiler, trace);
+ return;
+ }
+
AssertGuardsMentionRegisters(trace);
LimitResult limit_result = LimitVersions(compiler, trace);
@@ -4349,14 +4401,19 @@
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (compiler->ignore_case()) {
- assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
- trace->backtrack());
+ assembler->CheckNotBackReferenceIgnoreCase(
+ start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
}
// We are going to advance backward, so we may end up at the start.
if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
+
+ // Check that the back reference does not end inside a surrogate pair.
+ if (compiler->unicode() && !compiler->one_byte()) {
+ assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
+ }
on_success()->Emit(compiler, trace);
}
@@ -4732,8 +4789,8 @@
static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
const int* special_class,
int length) {
- length--; // Remove final 0x10000.
- DCHECK(special_class[length] == 0x10000);
+ length--; // Remove final marker.
+ DCHECK(special_class[length] == kRangeEndMarker);
DCHECK(ranges->length() != 0);
DCHECK(length != 0);
DCHECK(special_class[0] != 0);
@@ -4753,7 +4810,7 @@
return false;
}
}
- if (range.to() != 0xffff) {
+ if (range.to() != String::kMaxCodePoint) {
return false;
}
return true;
@@ -4763,8 +4820,8 @@
static bool CompareRanges(ZoneList<CharacterRange>* ranges,
const int* special_class,
int length) {
- length--; // Remove final 0x10000.
- DCHECK(special_class[length] == 0x10000);
+ length--; // Remove final marker.
+ DCHECK(special_class[length] == kRangeEndMarker);
if (ranges->length() * 2 != length) {
return false;
}
@@ -4820,10 +4877,303 @@
}
+UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
+ ZoneList<CharacterRange>* base)
+ : zone_(zone),
+ table_(zone),
+ bmp_(nullptr),
+ lead_surrogates_(nullptr),
+ trail_surrogates_(nullptr),
+ non_bmp_(nullptr) {
+ // The unicode range splitter categorizes given character ranges into:
+ // - Code points from the BMP representable by one code unit.
+ // - Code points outside the BMP that need to be split into surrogate pairs.
+ // - Lone lead surrogates.
+ // - Lone trail surrogates.
+ // Lone surrogates are valid code points, even though they are not actual
+ // characters. They need special matching so we do not split surrogate pairs.
+ // We use the dispatch table to accomplish this. The base ranges are split up
+ // by the table along the overlay ranges, and the Call callback is used to
+ // filter and collect ranges for each category.
+ for (int i = 0; i < base->length(); i++) {
+ table_.AddRange(base->at(i), kBase, zone_);
+ }
+ // Add overlay ranges; the BMP is covered in two pieces around the surrogates.
+ table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
+ kBmpCodePoints, zone_);
+ table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
+ kLeadSurrogates, zone_);
+ table_.AddRange(
+ CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+ kTrailSurrogates, zone_);
+ table_.AddRange(
+ CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
+ kBmpCodePoints, zone_);
+ table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
+ kNonBmpCodePoints, zone_);
+ table_.ForEach(this);  // Hands each resulting table entry to Call().
+}
+
+
+void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
+ OutSet* outset = entry.out_set();
+ if (!outset->Get(kBase)) return;  // Entry is not part of the input ranges.
+ ZoneList<CharacterRange>** target = NULL;
+ if (outset->Get(kBmpCodePoints)) {
+ target = &bmp_;
+ } else if (outset->Get(kLeadSurrogates)) {
+ target = &lead_surrogates_;
+ } else if (outset->Get(kTrailSurrogates)) {
+ target = &trail_surrogates_;
+ } else {
+ DCHECK(outset->Get(kNonBmpCodePoints));
+ target = &non_bmp_;  // Only remaining overlay category.
+ }
+ if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
+ (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
+}
+
+
+void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* bmp = splitter->bmp();
+ if (bmp == nullptr) return;  // No BMP ranges were collected.
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
+ compiler->zone(), bmp, compiler->read_backward(), on_success)));
+}
+
+
+void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
+ if (non_bmp == nullptr) return;  // No non-BMP ranges were collected.
+ DCHECK(compiler->unicode());
+ DCHECK(!compiler->one_byte());
+ Zone* zone = compiler->zone();
+ CharacterRange::Canonicalize(non_bmp);
+ for (int i = 0; i < non_bmp->length(); i++) {
+ // Match surrogate pair.
+ // E.g. [\u{10005}-\u{11005}] becomes
+ // \ud800[\udc05-\udfff]|
+ // [\ud801-\ud803][\udc00-\udfff]|
+ // \ud804[\udc00-\udc05]
+ uc32 from = non_bmp->at(i).from();
+ uc32 to = non_bmp->at(i).to();
+ uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
+ uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
+ uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
+ uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
+ if (from_l == to_l) {
+ // The lead surrogate is the same.
+ result->AddAlternative(
+ GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, CharacterRange::Singleton(from_l),
+ CharacterRange::Range(from_t, to_t), compiler->read_backward(),
+ on_success)));
+ } else {
+ if (from_t != kTrailSurrogateStart) {
+ // Add [from_l][from_t-\udfff]
+ result->AddAlternative(
+ GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, CharacterRange::Singleton(from_l),
+ CharacterRange::Range(from_t, kTrailSurrogateEnd),
+ compiler->read_backward(), on_success)));
+ from_l++;  // This lead is done; drop it from the full-trail range.
+ }
+ if (to_t != kTrailSurrogateEnd) {
+ // Add [to_l][\udc00-to_t]
+ result->AddAlternative(
+ GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, CharacterRange::Singleton(to_l),
+ CharacterRange::Range(kTrailSurrogateStart, to_t),
+ compiler->read_backward(), on_success)));
+ to_l--;  // This lead is done; drop it from the full-trail range.
+ }
+ if (from_l <= to_l) {
+ // Add [from_l-to_l][\udc00-\udfff]
+ result->AddAlternative(
+ GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, CharacterRange::Range(from_l, to_l),
+ CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+ compiler->read_backward(), on_success)));
+ }
+ }
+ }
+}
+
+
+RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
+ RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
+ ZoneList<CharacterRange>* match, RegExpNode* on_success,
+ bool read_backward) {
+ Zone* zone = compiler->zone();
+ RegExpNode* match_node = TextNode::CreateForCharacterRanges(
+ zone, match, read_backward, on_success);
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ RegExpLookaround::Builder lookaround(false, match_node, stack_register,
+ position_register);  // false: negative lookaround.
+ RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+ zone, lookbehind, !read_backward, lookaround.on_match_success());
+ return lookaround.ForMatch(negative_match);  // Lookaround runs before match.
+}
+
+
+RegExpNode* MatchAndNegativeLookaroundInReadDirection(
+ RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
+ ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
+ bool read_backward) {
+ Zone* zone = compiler->zone();
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ RegExpLookaround::Builder lookaround(false, on_success, stack_register,
+ position_register);  // false: negative lookaround.
+ RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+ zone, lookahead, read_backward, lookaround.on_match_success());
+ return TextNode::CreateForCharacterRanges(  // Match first, then lookaround.
+ zone, match, read_backward, lookaround.ForMatch(negative_match));
+}
+
+
+void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
+ if (lead_surrogates == nullptr) return;  // No lead surrogates collected.
+ Zone* zone = compiler->zone();
+ // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
+ ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+ RegExpNode* match;
+ if (compiler->read_backward()) {
+ // Reading backward. Assert that, reading forward, there is no trail
+ // surrogate, and then backward match the lead surrogate.
+ match = NegativeLookaroundAgainstReadDirectionAndMatch(
+ compiler, trail_surrogates, lead_surrogates, on_success, true);
+ } else {
+ // Reading forward. Forward match the lead surrogate and assert that
+ // no trail surrogate follows.
+ match = MatchAndNegativeLookaroundInReadDirection(
+ compiler, lead_surrogates, trail_surrogates, on_success, false);
+ }
+ result->AddAlternative(GuardedAlternative(match));
+}
+
+
+void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
+ if (trail_surrogates == nullptr) return;  // No trail surrogates collected.
+ Zone* zone = compiler->zone();
+ // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01.
+ ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+
+ RegExpNode* match;
+ if (compiler->read_backward()) {
+ // Reading backward. Backward match the trail surrogate and assert that no
+ // lead surrogate precedes it.
+ match = MatchAndNegativeLookaroundInReadDirection(
+ compiler, trail_surrogates, lead_surrogates, on_success, true);
+ } else {
+ // Reading forward. Assert that, reading backward, there is no lead
+ // surrogate, and then forward match the trail surrogate.
+ match = NegativeLookaroundAgainstReadDirectionAndMatch(
+ compiler, lead_surrogates, trail_surrogates, on_success, false);
+ }
+ result->AddAlternative(GuardedAlternative(match));
+}
+
+RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
+ DCHECK(!compiler->read_backward());
+ Zone* zone = compiler->zone();
+ // Advance by any one code unit. If that code unit is a lead surrogate and we
+ // end up in the middle of a surrogate pair, it still works out, as nothing
+ // will match from there. We then advance again, consuming the associated
+ // trail surrogate.
+ ZoneList<CharacterRange>* range = CharacterRange::List(
+ zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
+ return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
+}
+
+
+void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
+ ZoneList<CharacterRange>* ranges) {
+#ifdef V8_I18N_SUPPORT
+ // Use ICU to compute the case fold closure over the ranges.
+ DCHECK(compiler->unicode());
+ DCHECK(compiler->ignore_case());
+ USet* set = uset_openEmpty();
+ for (int i = 0; i < ranges->length(); i++) {
+ uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
+ }
+ ranges->Clear();  // Refilled below from the closed-over set.
+ uset_closeOver(set, USET_CASE_INSENSITIVE);
+ // Full case mappings map single characters to multiple characters.
+ // Those are represented as strings in the set. Remove them so that
+ // we end up with only simple and common case mappings.
+ uset_removeAllStrings(set);
+ int item_count = uset_getItemCount(set);
+ int item_result = 0;
+ UErrorCode ec = U_ZERO_ERROR;
+ Zone* zone = compiler->zone();
+ for (int i = 0; i < item_count; i++) {
+ uc32 start = 0;
+ uc32 end = 0;
+ item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+ ranges->Add(CharacterRange::Range(start, end), zone);
+ }
+ // No errors, and everything we collected has been a range.
+ DCHECK_EQ(U_ZERO_ERROR, ec);
+ DCHECK_EQ(0, item_result);
+ uset_close(set);
+#else
+ // Fallback if ICU is not included.
+ CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
+ ranges, compiler->one_byte());
+#endif // V8_I18N_SUPPORT
+ CharacterRange::Canonicalize(ranges);
+}
+
+
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return new (compiler->zone())
- TextNode(this, compiler->read_backward(), on_success);
+ set_.Canonicalize();
+ Zone* zone = compiler->zone();
+ ZoneList<CharacterRange>* ranges = this->ranges(zone);
+ if (compiler->unicode() && compiler->ignore_case()) {
+ AddUnicodeCaseEquivalents(compiler, ranges);
+ }
+ if (compiler->unicode() && !compiler->one_byte()) {
+ if (is_negated()) {
+ ZoneList<CharacterRange>* negated =
+ new (zone) ZoneList<CharacterRange>(2, zone);
+ CharacterRange::Negate(ranges, negated, zone);
+ ranges = negated;  // Work on the complement from here on.
+ }
+ if (ranges->length() == 0) {
+ // No matches possible.
+ return new (zone) EndNode(EndNode::BACKTRACK, zone);
+ }
+ if (standard_type() == '*') {
+ return UnanchoredAdvance(compiler, on_success);
+ } else {
+ ChoiceNode* result = new (zone) ChoiceNode(2, zone);
+ UnicodeRangeSplitter splitter(zone, ranges);  // Splits ranges by category.
+ AddBmpCharacters(compiler, result, on_success, &splitter);
+ AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
+ AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
+ AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
+ return result;
+ }
+ } else {  // !unicode() or one_byte(): emit a plain TextNode.
+ return new (zone) TextNode(this, compiler->read_backward(), on_success);
+ }
}
@@ -5338,6 +5688,47 @@
}
+RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
+ int stack_pointer_register,
+ int position_register,
+ int capture_register_count,
+ int capture_register_start)
+ : is_positive_(is_positive),
+ on_success_(on_success),
+ stack_pointer_register_(stack_pointer_register),
+ position_register_(position_register) {
+ if (is_positive_) {
+ on_match_success_ = ActionNode::PositiveSubmatchSuccess(
+ stack_pointer_register, position_register, capture_register_count,
+ capture_register_start, on_success_);  // Continues to on_success_.
+ } else {
+ Zone* zone = on_success_->zone();
+ on_match_success_ = new (zone) NegativeSubmatchSuccess(
+ stack_pointer_register, position_register, capture_register_count,
+ capture_register_start, zone);  // on_success_ is wired up in ForMatch.
+ }
+}
+
+
+RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
+ if (is_positive_) {
+ return ActionNode::BeginSubmatch(stack_pointer_register_,
+ position_register_, match);
+ } else {
+ Zone* zone = on_success_->zone();
+ // We use a ChoiceNode to represent the negative lookaround. The first
+ // alternative is the negative match. On success, the end node backtracks.
+ // On failure, the second alternative is tried and leads to success.
+ // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
+ // first exit when calculating quick checks.
+ ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
+ GuardedAlternative(match), GuardedAlternative(on_success_), zone);
+ return ActionNode::BeginSubmatch(stack_pointer_register_,
+ position_register_, choice_node);
+ }
+}
+
+
RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
int stack_pointer_register = compiler->AllocateRegister();
@@ -5352,35 +5743,10 @@
RegExpNode* result;
bool was_reading_backward = compiler->read_backward();
compiler->set_read_backward(type() == LOOKBEHIND);
- if (is_positive()) {
- result = ActionNode::BeginSubmatch(
- stack_pointer_register, position_register,
- body()->ToNode(compiler,
- ActionNode::PositiveSubmatchSuccess(
- stack_pointer_register, position_register,
- register_count, register_start, on_success)));
- } else {
- // We use a ChoiceNode for a negative lookahead because it has most of
- // the characteristics we need. It has the body of the lookahead as its
- // first alternative and the expression after the lookahead of the second
- // alternative. If the first alternative succeeds then the
- // NegativeSubmatchSuccess will unwind the stack including everything the
- // choice node set up and backtrack. If the first alternative fails then
- // the second alternative is tried, which is exactly the desired result
- // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
- // ChoiceNode that knows to ignore the first exit when calculating quick
- // checks.
- Zone* zone = compiler->zone();
-
- GuardedAlternative body_alt(
- body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(
- stack_pointer_register, position_register,
- register_count, register_start, zone)));
- ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
- body_alt, GuardedAlternative(on_success), zone);
- result = ActionNode::BeginSubmatch(stack_pointer_register,
- position_register, choice_node);
- }
+ Builder builder(is_positive(), on_success, stack_pointer_register,
+ position_register, register_count, register_start);
+ RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
+ result = builder.ForMatch(match);
compiler->set_read_backward(was_reading_backward);
return result;
}
@@ -5428,10 +5794,10 @@
ZoneList<CharacterRange>* ranges,
Zone* zone) {
elmc--;
- DCHECK(elmv[elmc] == 0x10000);
+ DCHECK(elmv[elmc] == kRangeEndMarker);
for (int i = 0; i < elmc; i += 2) {
DCHECK(elmv[i] < elmv[i + 1]);
- ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
+ ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
}
}
@@ -5441,17 +5807,17 @@
ZoneList<CharacterRange>* ranges,
Zone* zone) {
elmc--;
- DCHECK(elmv[elmc] == 0x10000);
+ DCHECK(elmv[elmc] == kRangeEndMarker);
DCHECK(elmv[0] != 0x0000);
- DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
+ DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
uc16 last = 0x0000;
for (int i = 0; i < elmc; i += 2) {
DCHECK(last <= elmv[i] - 1);
DCHECK(elmv[i] < elmv[i + 1]);
- ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
+ ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
last = elmv[i + 1];
}
- ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
+ ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
}
@@ -5508,115 +5874,73 @@
}
-class CharacterRangeSplitter {
- public:
- CharacterRangeSplitter(ZoneList<CharacterRange>** included,
- ZoneList<CharacterRange>** excluded,
- Zone* zone)
- : included_(included),
- excluded_(excluded),
- zone_(zone) { }
- void Call(uc16 from, DispatchTable::Entry entry);
-
- static const int kInBase = 0;
- static const int kInOverlay = 1;
-
- private:
- ZoneList<CharacterRange>** included_;
- ZoneList<CharacterRange>** excluded_;
- Zone* zone_;
-};
-
-
-void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
- if (!entry.out_set()->Get(kInBase)) return;
- ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
- ? included_
- : excluded_;
- if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
- (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
-}
-
-
-void CharacterRange::Split(ZoneList<CharacterRange>* base,
- Vector<const int> overlay,
- ZoneList<CharacterRange>** included,
- ZoneList<CharacterRange>** excluded,
- Zone* zone) {
- DCHECK_NULL(*included);
- DCHECK_NULL(*excluded);
- DispatchTable table(zone);
- for (int i = 0; i < base->length(); i++)
- table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
- for (int i = 0; i < overlay.length(); i += 2) {
- table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
- CharacterRangeSplitter::kInOverlay, zone);
- }
- CharacterRangeSplitter callback(included, excluded, zone);
- table.ForEach(&callback);
-}
-
-
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
bool is_one_byte) {
- uc16 bottom = from();
- uc16 top = to();
- if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
- if (bottom > String::kMaxOneByteCharCode) return;
- if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
- }
- unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- if (top == bottom) {
- // If this is a singleton we just expand the one character.
- int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
- for (int i = 0; i < length; i++) {
- uc32 chr = chars[i];
- if (chr != bottom) {
- ranges->Add(CharacterRange::Singleton(chars[i]), zone);
- }
+ int range_count = ranges->length();  // Ranges added below are skipped.
+ for (int i = 0; i < range_count; i++) {
+ CharacterRange range = ranges->at(i);
+ uc32 bottom = range.from();
+ if (bottom > String::kMaxUtf16CodeUnit) continue;  // Skip non-BMP ranges.
+ uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
+ // Nothing to be done for surrogates; later ranges still need processing.
+ if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
+ if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+ if (bottom > String::kMaxOneByteCharCode) continue;  // Not one-byte.
+ if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
- } else {
- // If this is a range we expand the characters block by block,
- // expanding contiguous subranges (blocks) one at a time.
- // The approach is as follows. For a given start character we
- // look up the remainder of the block that contains it (represented
- // by the end point), for instance we find 'z' if the character
- // is 'c'. A block is characterized by the property
- // that all characters uncanonicalize in the same way, except that
- // each entry in the result is incremented by the distance from the first
- // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
- // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
- // Once we've found the end point we look up its uncanonicalization
- // and produce a range for each element. For instance for [c-f]
- // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
- // add a range if it is not already contained in the input, so [c-f]
- // will be skipped but [C-F] will be added. If this range is not
- // completely contained in a block we do this for all the blocks
- // covered by the range (handling characters that is not in a block
- // as a "singleton block").
- unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int pos = bottom;
- while (pos <= top) {
- int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
- uc16 block_end;
- if (length == 0) {
- block_end = pos;
- } else {
- DCHECK_EQ(1, length);
- block_end = range[0];
- }
- int end = (block_end > top) ? top : block_end;
- length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ if (top == bottom) {
+ // If this is a singleton we just expand the one character.
+ int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
for (int i = 0; i < length; i++) {
- uc32 c = range[i];
- uc16 range_from = c - (block_end - pos);
- uc16 range_to = c - (block_end - end);
- if (!(bottom <= range_from && range_to <= top)) {
- ranges->Add(CharacterRange(range_from, range_to), zone);
+ uc32 chr = chars[i];
+ if (chr != bottom) {
+ ranges->Add(CharacterRange::Singleton(chars[i]), zone);
}
}
- pos = end + 1;
+ } else {
+ // If this is a range we expand the characters block by block, expanding
+ // contiguous subranges (blocks) one at a time. The approach is as
+ // follows. For a given start character we look up the remainder of the
+ // block that contains it (represented by the end point), for instance we
+ // find 'z' if the character is 'c'. A block is characterized by the
+ // property that all characters uncanonicalize in the same way, except
+ // that each entry in the result is incremented by the distance from the
+ // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
+ // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
+ // we've found the end point we look up its uncanonicalization and
+ // produce a range for each element. For instance for [c-f] we look up
+ // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
+ // it is not already contained in the input, so [c-f] will be skipped but
+ // [C-F] will be added. If this range is not completely contained in a
+ // block we do this for all the blocks covered by the range (handling
+ // characters that are not in a block as a "singleton block").
+ unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ int pos = bottom;
+ while (pos <= top) {
+ int length =
+ isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
+ uc32 block_end;
+ if (length == 0) {
+ block_end = pos;
+ } else {
+ DCHECK_EQ(1, length);
+ block_end = equivalents[0];
+ }
+ int end = (block_end > top) ? top : block_end;
+ length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
+ equivalents);
+ for (int i = 0; i < length; i++) {
+ uc32 c = equivalents[i];
+ uc32 range_from = c - (block_end - pos);
+ uc32 range_to = c - (block_end - end);
+ if (!(bottom <= range_from && range_to <= top)) {
+ ranges->Add(CharacterRange::Range(range_from, range_to), zone);
+ }
+ }
+ pos = end + 1;
+ }
}
}
}
@@ -5672,8 +5996,8 @@
// list[0..count] for the result. Returns the number of resulting
// canonicalized ranges. Inserting a range may collapse existing ranges into
// fewer ranges, so the return value can be anything in the range 1..count+1.
- uc16 from = insert.from();
- uc16 to = insert.to();
+ uc32 from = insert.from();
+ uc32 to = insert.to();
int start_pos = 0;
int end_pos = count;
for (int i = count - 1; i >= 0; i--) {
@@ -5706,7 +6030,7 @@
CharacterRange to_replace = list->at(start_pos);
int new_from = Min(to_replace.from(), from);
int new_to = Max(to_replace.to(), to);
- list->at(start_pos) = CharacterRange(new_from, new_to);
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
return count;
}
// Replace a number of existing ranges from start_pos to end_pos - 1.
@@ -5717,7 +6041,7 @@
if (end_pos < count) {
MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
}
- list->at(start_pos) = CharacterRange(new_from, new_to);
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
return count - (end_pos - start_pos) + 1;
}
@@ -5773,20 +6097,20 @@
DCHECK(CharacterRange::IsCanonical(ranges));
DCHECK_EQ(0, negated_ranges->length());
int range_count = ranges->length();
- uc16 from = 0;
+ uc32 from = 0;
int i = 0;
if (range_count > 0 && ranges->at(0).from() == 0) {
- from = ranges->at(0).to();
+ from = ranges->at(0).to() + 1;
i = 1;
}
while (i < range_count) {
CharacterRange range = ranges->at(i);
- negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
- from = range.to();
+ negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
+ from = range.to() + 1;
i++;
}
- if (from < String::kMaxUtf16CodeUnit) {
- negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
+ if (from < String::kMaxCodePoint) {
+ negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
zone);
}
}
@@ -5838,7 +6162,7 @@
}
-const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
+const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
void DispatchTable::AddRange(CharacterRange full_range, int value,
@@ -5866,8 +6190,9 @@
if (entry->from() < current.from() && entry->to() >= current.from()) {
// Snap the overlapping range in half around the start point of
// the range we're adding.
- CharacterRange left(entry->from(), current.from() - 1);
- CharacterRange right(current.from(), entry->to());
+ CharacterRange left =
+ CharacterRange::Range(entry->from(), current.from() - 1);
+ CharacterRange right = CharacterRange::Range(current.from(), entry->to());
// The left part of the overlapping range doesn't overlap.
// Truncate the whole entry to be just the left part.
entry->set_to(left.to());
@@ -5919,10 +6244,6 @@
// we're adding so we can just update it and move the start point
// of the range we're adding just past it.
entry->AddValue(value, zone);
- // Bail out if the last interval ended at 0xFFFF since otherwise
- // adding 1 will wrap around to 0.
- if (entry->to() == String::kMaxUtf16CodeUnit)
- break;
DCHECK(entry->to() + 1 > current.from());
current.set_from(entry->to() + 1);
} else {
@@ -5940,7 +6261,7 @@
}
-OutSet* DispatchTable::Get(uc16 value) {
+OutSet* DispatchTable::Get(uc32 value) {
ZoneSplayTree<Config>::Locator loc;
if (!tree()->FindGreatestLessThan(value, &loc))
return empty();
@@ -5990,7 +6311,7 @@
void Analysis::VisitText(TextNode* that) {
- if (ignore_case_) {
+ if (ignore_case()) {
that->MakeCaseIndependent(isolate(), is_one_byte_);
}
EnsureAnalyzed(that->on_success());
@@ -6173,8 +6494,7 @@
void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
- CharacterRange range(from, entry.to());
- constructor_->AddRange(range);
+ constructor_->AddRange(CharacterRange::Range(from, entry.to()));  // from..to
}
@@ -6212,16 +6532,16 @@
for (int i = 0; i < ranges->length(); i++) {
CharacterRange range = ranges->at(i);
if (last < range.from())
- AddRange(CharacterRange(last, range.from() - 1));
+ AddRange(CharacterRange::Range(last, range.from() - 1));
if (range.to() >= last) {
- if (range.to() == String::kMaxUtf16CodeUnit) {
+ if (range.to() == String::kMaxCodePoint) {
return;
} else {
last = range.to() + 1;
}
}
}
- AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
+ AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
}
@@ -6230,7 +6550,7 @@
switch (elm.text_type()) {
case TextElement::ATOM: {
uc16 c = elm.atom()->data()[0];
- AddRange(CharacterRange(c, c));
+ AddRange(CharacterRange::Range(c, c));
break;
}
case TextElement::CHAR_CLASS: {
@@ -6257,14 +6577,48 @@
}
+RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ // If the regexp matching starts within a surrogate pair, step back
+ // to the lead surrogate and start matching from there.
+ DCHECK(!compiler->read_backward());
+ Zone* zone = compiler->zone();
+ ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+ ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+ ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
+
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ RegExpNode* step_back = TextNode::CreateForCharacterRanges(
+ zone, lead_surrogates, true, on_success);  // true: read backward.
+ RegExpLookaround::Builder builder(true, step_back, stack_register,
+ position_register);  // true: positive lookaround.
+ RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
+ zone, trail_surrogates, false, builder.on_match_success());
+
+ optional_step_back->AddAlternative(
+ GuardedAlternative(builder.ForMatch(match_trail)));  // Step back if on trail.
+ optional_step_back->AddAlternative(GuardedAlternative(on_success));
+
+ return optional_step_back;
+}
+
+
RegExpEngine::CompilationResult RegExpEngine::Compile(
- Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,
- bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,
+ Isolate* isolate, Zone* zone, RegExpCompileData* data,
+ JSRegExp::Flags flags, Handle<String> pattern,
Handle<String> sample_subject, bool is_one_byte) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
return IrregexpRegExpTooBig(isolate);
}
- RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,
+ bool ignore_case = flags & JSRegExp::kIgnoreCase;
+ bool is_sticky = flags & JSRegExp::kSticky;
+ bool is_global = flags & JSRegExp::kGlobal;
+ bool is_unicode = flags & JSRegExp::kUnicode;
+ RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
is_one_byte);
if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
@@ -6316,11 +6670,13 @@
if (node != NULL) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
}
+ } else if (compiler.unicode() && (is_global || is_sticky)) {
+ node = OptionallyStepBackToLeadSurrogate(&compiler, node);
}
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
- Analysis analysis(isolate, ignore_case, is_one_byte);
+ Analysis analysis(isolate, flags, is_one_byte);
analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) {
const char* error_message = analysis.error_message();
@@ -6381,10 +6737,13 @@
}
if (is_global) {
- macro_assembler.set_global_mode(
- (data->tree->min_match() > 0)
- ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
- : RegExpMacroAssembler::GLOBAL);
+ RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
+ if (data->tree->min_match() > 0) {
+ mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
+ } else if (is_unicode) {
+ mode = RegExpMacroAssembler::GLOBAL_UNICODE;
+ }
+ macro_assembler.set_global_mode(mode);
}
return compiler.Assemble(&macro_assembler,
diff --git a/src/regexp/jsregexp.h b/src/regexp/jsregexp.h
index 0ad4b79..e55d650 100644
--- a/src/regexp/jsregexp.h
+++ b/src/regexp/jsregexp.h
@@ -8,6 +8,7 @@
#include "src/allocation.h"
#include "src/assembler.h"
#include "src/regexp/regexp-ast.h"
+#include "src/regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
@@ -121,7 +122,6 @@
public:
GlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject,
- bool is_global,
Isolate* isolate);
INLINE(~GlobalCache());
@@ -137,6 +137,8 @@
INLINE(bool HasException()) { return num_matches_ < 0; }
private:
+ int AdvanceZeroLength(int last_index);
+
int num_matches_;
int max_matches_;
int current_match_index_;
@@ -265,28 +267,30 @@
class Entry {
public:
Entry() : from_(0), to_(0), out_set_(NULL) { }
- Entry(uc16 from, uc16 to, OutSet* out_set)
- : from_(from), to_(to), out_set_(out_set) { }
- uc16 from() { return from_; }
- uc16 to() { return to_; }
- void set_to(uc16 value) { to_ = value; }
+ Entry(uc32 from, uc32 to, OutSet* out_set)
+ : from_(from), to_(to), out_set_(out_set) {
+ DCHECK(from <= to);
+ }
+ uc32 from() { return from_; }
+ uc32 to() { return to_; }
+ void set_to(uc32 value) { to_ = value; }
void AddValue(int value, Zone* zone) {
out_set_ = out_set_->Extend(value, zone);
}
OutSet* out_set() { return out_set_; }
private:
- uc16 from_;
- uc16 to_;
+ uc32 from_;
+ uc32 to_;
OutSet* out_set_;
};
class Config {
public:
- typedef uc16 Key;
+ typedef uc32 Key;
typedef Entry Value;
- static const uc16 kNoKey;
+ static const uc32 kNoKey;
static const Entry NoValue() { return Value(); }
- static inline int Compare(uc16 a, uc16 b) {
+ static inline int Compare(uc32 a, uc32 b) {
if (a == b)
return 0;
else if (a < b)
@@ -297,7 +301,7 @@
};
void AddRange(CharacterRange range, int value, Zone* zone);
- OutSet* Get(uc16 value);
+ OutSet* Get(uc32 value);
void Dump();
template <typename Callback>
@@ -315,6 +319,34 @@
};
+// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
+class UnicodeRangeSplitter {
+ public:
+ UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);  // NOTE(review): |base| is presumably the range list to categorize - confirm.
+ void Call(uc32 from, DispatchTable::Entry entry);  // NOTE(review): looks like a per-entry callback for iterating table_ - confirm.
+
+ ZoneList<CharacterRange>* bmp() { return bmp_; }  // Returns bmp_.
+ ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }  // Returns lead_surrogates_.
+ ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }  // Returns trail_surrogates_.
+ ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }  // Returns non_bmp_ (the only const accessor here).
+
+ private:
+ static const int kBase = 0;  // NOTE(review): presumably the tag for the input (base) ranges - confirm.
+ // Tags for the categories the ranges are separated into.
+ static const int kBmpCodePoints = 1;
+ static const int kLeadSurrogates = 2;
+ static const int kTrailSurrogates = 3;
+ static const int kNonBmpCodePoints = 4;
+
+ Zone* zone_;
+ DispatchTable table_;
+ ZoneList<CharacterRange>* bmp_;  // One result list per category, exposed by the accessors above.
+ ZoneList<CharacterRange>* lead_surrogates_;
+ ZoneList<CharacterRange>* trail_surrogates_;
+ ZoneList<CharacterRange>* non_bmp_;
+};
+
+
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
@@ -690,6 +722,17 @@
read_backward_(read_backward) {
elms_->Add(TextElement::CharClass(that), zone());
}
+ // Create TextNode for a single character class for the given ranges.
+ static TextNode* CreateForCharacterRanges(Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool read_backward,
+ RegExpNode* on_success);
+ // Create TextNode for a surrogate pair with a range given for the
+ // lead and the trail surrogate each.
+ static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
+ CharacterRange trail,
+ bool read_backward,
+ RegExpNode* on_success);
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
@@ -813,8 +856,7 @@
class EndNode: public RegExpNode {
public:
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
- explicit EndNode(Action action, Zone* zone)
- : RegExpNode(zone), action_(action) { }
+ EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find,
@@ -1440,9 +1482,9 @@
// +-------+ +------------+
class Analysis: public NodeVisitor {
public:
- Analysis(Isolate* isolate, bool ignore_case, bool is_one_byte)
+ Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
: isolate_(isolate),
- ignore_case_(ignore_case),
+ flags_(flags),
is_one_byte_(is_one_byte),
error_message_(NULL) {}
void EnsureAnalyzed(RegExpNode* node);
@@ -1464,9 +1506,12 @@
Isolate* isolate() const { return isolate_; }
+ bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+ bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
+
private:
Isolate* isolate_;
- bool ignore_case_;
+ JSRegExp::Flags flags_;
bool is_one_byte_;
const char* error_message_;
@@ -1505,8 +1550,8 @@
};
static CompilationResult Compile(Isolate* isolate, Zone* zone,
- RegExpCompileData* input, bool ignore_case,
- bool global, bool multiline, bool sticky,
+ RegExpCompileData* input,
+ JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte);
diff --git a/src/regexp/mips/regexp-macro-assembler-mips.cc b/src/regexp/mips/regexp-macro-assembler-mips.cc
index 9c59328..6197f45 100644
--- a/src/regexp/mips/regexp-macro-assembler-mips.cc
+++ b/src/regexp/mips/regexp-macro-assembler-mips.cc
@@ -215,7 +215,7 @@
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ lw(a0, register_location(start_reg)); // Index of start of capture.
__ lw(a1, register_location(start_reg + 1)); // Index of end of capture.
@@ -310,7 +310,7 @@
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
- // a3: Isolate* isolate.
+ // a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ Addu(a0, a0, Operand(end_of_input_address()));
@@ -324,7 +324,14 @@
__ Subu(a1, a1, Operand(s3));
}
// Isolate.
- __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ mov(a3, zero_reg);
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+ }
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -801,9 +808,12 @@
__ Branch(&exit_label_, eq, current_input_offset(),
Operand(zero_reg));
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
__ Addu(current_input_offset(),
current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1));
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ Branch(&load_char_start_regexp);
diff --git a/src/regexp/mips/regexp-macro-assembler-mips.h b/src/regexp/mips/regexp-macro-assembler-mips.h
index 902e220..6dedb1e 100644
--- a/src/regexp/mips/regexp-macro-assembler-mips.h
+++ b/src/regexp/mips/regexp-macro-assembler-mips.h
@@ -37,7 +37,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/src/regexp/mips64/regexp-macro-assembler-mips64.cc b/src/regexp/mips64/regexp-macro-assembler-mips64.cc
index 5153bd0..bf95a9c 100644
--- a/src/regexp/mips64/regexp-macro-assembler-mips64.cc
+++ b/src/regexp/mips64/regexp-macro-assembler-mips64.cc
@@ -251,7 +251,7 @@
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ ld(a0, register_location(start_reg)); // Index of start of capture.
__ ld(a1, register_location(start_reg + 1)); // Index of end of capture.
@@ -346,7 +346,7 @@
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
- // a3: Isolate* isolate.
+ // a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ Daddu(a0, a0, Operand(end_of_input_address()));
@@ -360,7 +360,14 @@
__ Dsubu(a1, a1, Operand(s3));
}
// Isolate.
- __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ mov(a3, zero_reg);
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+ }
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -664,10 +671,7 @@
s3.bit() | s4.bit() | s5.bit() | s6.bit() | s7.bit() | fp.bit();
RegList argument_registers = a0.bit() | a1.bit() | a2.bit() | a3.bit();
- if (kMipsAbi == kN64) {
- // TODO(plind): Should probably alias a4-a7, for clarity.
- argument_registers |= a4.bit() | a5.bit() | a6.bit() | a7.bit();
- }
+ argument_registers |= a4.bit() | a5.bit() | a6.bit() | a7.bit();
__ MultiPush(argument_registers | registers_to_retain | ra.bit());
// Set frame pointer in space for it if this is not a direct call
@@ -841,9 +845,12 @@
__ Branch(&exit_label_, eq, current_input_offset(),
Operand(zero_reg));
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
__ Daddu(current_input_offset(),
current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1));
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ Branch(&load_char_start_regexp);
diff --git a/src/regexp/mips64/regexp-macro-assembler-mips64.h b/src/regexp/mips64/regexp-macro-assembler-mips64.h
index 9a8ca17..df2c6c5 100644
--- a/src/regexp/mips64/regexp-macro-assembler-mips64.h
+++ b/src/regexp/mips64/regexp-macro-assembler-mips64.h
@@ -37,7 +37,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
@@ -96,7 +96,6 @@
void print_regexp_frame_constants();
private:
-#if defined(MIPS_ABI_N64)
// Offsets from frame_pointer() of function parameters and stored registers.
static const int kFramePointer = 0;
@@ -105,7 +104,7 @@
static const int kStoredRegisters = kFramePointer;
// Return address (stored from link register, read into pc on return).
-// TODO(plind): This 9 - is 8 s-regs (s0..s7) plus fp.
+ // TODO(plind): This 9 - is 8 s-regs (s0..s7) plus fp.
static const int kReturnAddress = kStoredRegisters + 9 * kPointerSize;
static const int kSecondaryReturnAddress = kReturnAddress + kPointerSize;
@@ -131,43 +130,6 @@
// First register address. Following registers are below it on the stack.
static const int kRegisterZero = kStringStartMinusOne - kPointerSize;
-#elif defined(MIPS_ABI_O32)
- // Offsets from frame_pointer() of function parameters and stored registers.
- static const int kFramePointer = 0;
-
- // Above the frame pointer - Stored registers and stack passed parameters.
- // Registers s0 to s7, fp, and ra.
- static const int kStoredRegisters = kFramePointer;
- // Return address (stored from link register, read into pc on return).
- static const int kReturnAddress = kStoredRegisters + 9 * kPointerSize;
- static const int kSecondaryReturnAddress = kReturnAddress + kPointerSize;
- // Stack frame header.
- static const int kStackFrameHeader = kReturnAddress + kPointerSize;
- // Stack parameters placed by caller.
- static const int kRegisterOutput =
- kStackFrameHeader + 4 * kPointerSize + kPointerSize;
- static const int kNumOutputRegisters = kRegisterOutput + kPointerSize;
- static const int kStackHighEnd = kNumOutputRegisters + kPointerSize;
- static const int kDirectCall = kStackHighEnd + kPointerSize;
- static const int kIsolate = kDirectCall + kPointerSize;
-
- // Below the frame pointer.
- // Register parameters stored by setup code.
- static const int kInputEnd = kFramePointer - kPointerSize;
- static const int kInputStart = kInputEnd - kPointerSize;
- static const int kStartIndex = kInputStart - kPointerSize;
- static const int kInputString = kStartIndex - kPointerSize;
- // When adding local variables remember to push space for them in
- // the frame in GetCode.
- static const int kSuccessfulCaptures = kInputString - kPointerSize;
- static const int kStringStartMinusOne = kSuccessfulCaptures - kPointerSize;
- // First register address. Following registers are below it on the stack.
- static const int kRegisterZero = kStringStartMinusOne - kPointerSize;
-
-#else
-# error "undefined MIPS ABI"
-#endif
-
// Initial size of code buffer.
static const size_t kRegExpCodeSize = 1024;
diff --git a/src/regexp/ppc/regexp-macro-assembler-ppc.cc b/src/regexp/ppc/regexp-macro-assembler-ppc.cc
index f3ddf7b..c05c580 100644
--- a/src/regexp/ppc/regexp-macro-assembler-ppc.cc
+++ b/src/regexp/ppc/regexp-macro-assembler-ppc.cc
@@ -225,9 +225,8 @@
BranchOrBacktrack(eq, on_equal);
}
-
void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ LoadP(r3, register_location(start_reg), r0); // Index of start of capture
__ LoadP(r4, register_location(start_reg + 1), r0); // Index of end
@@ -322,7 +321,7 @@
// r3: Address byte_offset1 - Address captured substring's start.
// r4: Address byte_offset2 - Address of current character position.
// r5: size_t byte_length - length of capture in bytes(!)
- // r6: Isolate* isolate
+ // r6: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ add(r3, r3, end_of_input_address());
@@ -336,7 +335,14 @@
__ sub(r4, r4, r25);
}
// Isolate.
- __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ li(r6, Operand::Zero());
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
+ }
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -845,8 +851,11 @@
__ cmpi(current_input_offset(), Operand::Zero());
__ beq(&exit_label_);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
__ addi(current_input_offset(), current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1));
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ b(&load_char_start_regexp);
diff --git a/src/regexp/ppc/regexp-macro-assembler-ppc.h b/src/regexp/ppc/regexp-macro-assembler-ppc.h
index 4d1836f..d281387 100644
--- a/src/regexp/ppc/regexp-macro-assembler-ppc.h
+++ b/src/regexp/ppc/regexp-macro-assembler-ppc.h
@@ -38,7 +38,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
diff --git a/src/regexp/regexp-ast.cc b/src/regexp/regexp-ast.cc
index 31c93b1..b5c2bb6 100644
--- a/src/regexp/regexp-ast.cc
+++ b/src/regexp/regexp-ast.cc
@@ -172,9 +172,9 @@
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
- os_ << AsUC16(that.from());
+ os_ << AsUC32(that.from());
if (!that.IsSingleton()) {
- os_ << "-" << AsUC16(that.to());
+ os_ << "-" << AsUC32(that.to());
}
}
diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
index f877785..0e718d3 100644
--- a/src/regexp/regexp-ast.h
+++ b/src/regexp/regexp-ast.h
@@ -5,6 +5,7 @@
#ifndef V8_REGEXP_REGEXP_AST_H_
#define V8_REGEXP_REGEXP_AST_H_
+#include "src/objects.h"
#include "src/utils.h"
#include "src/zone.h"
@@ -77,33 +78,38 @@
CharacterRange() : from_(0), to_(0) {}
// For compatibility with the CHECK_OK macro
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
- CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
Zone* zone);
static Vector<const int> GetWordBounds();
- static inline CharacterRange Singleton(uc16 value) {
+ static inline CharacterRange Singleton(uc32 value) {
return CharacterRange(value, value);
}
- static inline CharacterRange Range(uc16 from, uc16 to) {
- DCHECK(from <= to);
+ static inline CharacterRange Range(uc32 from, uc32 to) {
+ DCHECK(0 <= from && to <= String::kMaxCodePoint);
+ DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
return CharacterRange(from, to);
}
static inline CharacterRange Everything() {
- return CharacterRange(0, 0xFFFF);
+ return CharacterRange(0, String::kMaxCodePoint);
}
- bool Contains(uc16 i) { return from_ <= i && i <= to_; }
- uc16 from() const { return from_; }
- void set_from(uc16 value) { from_ = value; }
- uc16 to() const { return to_; }
- void set_to(uc16 value) { to_ = value; }
+ static inline ZoneList<CharacterRange>* List(Zone* zone,
+ CharacterRange range) {
+ ZoneList<CharacterRange>* list =
+ new (zone) ZoneList<CharacterRange>(1, zone);
+ list->Add(range, zone);
+ return list;
+ }
+ bool Contains(uc32 i) { return from_ <= i && i <= to_; }
+ uc32 from() const { return from_; }
+ void set_from(uc32 value) { from_ = value; }
+ uc32 to() const { return to_; }
+ void set_to(uc32 value) { to_ = value; }
bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
- void AddCaseEquivalents(Isolate* isolate, Zone* zone,
- ZoneList<CharacterRange>* ranges, bool is_one_byte);
- static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
- ZoneList<CharacterRange>** included,
- ZoneList<CharacterRange>** excluded, Zone* zone);
+ static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool is_one_byte);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
@@ -119,8 +125,10 @@
static const int kPayloadMask = (1 << 24) - 1;
private:
- uc16 from_;
- uc16 to_;
+ CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
+
+ uc32 from_;
+ uc32 to_;
};
@@ -303,8 +311,8 @@
// W : non-ASCII word character
// d : ASCII digit
// D : non-ASCII digit
- // . : non-unicode non-newline
- // * : All characters
+ // . : non-newline
+ // * : All characters, for advancing unanchored regexp
uc16 standard_type() { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
bool is_negated() { return is_negated_; }
@@ -451,6 +459,22 @@
int capture_from() { return capture_from_; }
Type type() { return type_; }
+ class Builder {  // Helper for building the node graph of a lookaround around a match node.
+ public:
+ Builder(bool is_positive, RegExpNode* on_success,  // NOTE(review): |on_success| is presumably where matching continues after the lookaround - confirm.
+ int stack_pointer_register, int position_register,
+ int capture_register_count = 0, int capture_register_start = 0);  // No members are declared for the capture arguments; their use is in the constructor, defined elsewhere.
+ RegExpNode* on_match_success() { return on_match_success_; }  // Node to pass as the on_success of the lookaround's match node.
+ RegExpNode* ForMatch(RegExpNode* match);  // Returns the node to use in place of |match|; defined elsewhere.
+
+ private:
+ bool is_positive_;
+ RegExpNode* on_match_success_;
+ RegExpNode* on_success_;
+ int stack_pointer_register_;
+ int position_register_;
+ };
+
private:
RegExpTree* body_;
bool is_positive_;
diff --git a/src/regexp/regexp-macro-assembler-irregexp-inl.h b/src/regexp/regexp-macro-assembler-irregexp-inl.h
index 4d0b1bc..a602129 100644
--- a/src/regexp/regexp-macro-assembler-irregexp-inl.h
+++ b/src/regexp/regexp-macro-assembler-irregexp-inl.h
@@ -5,14 +5,14 @@
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_
+#ifdef V8_INTERPRETED_REGEXP
+
#include "src/ast/ast.h"
#include "src/regexp/bytecodes-irregexp.h"
namespace v8 {
namespace internal {
-#ifdef V8_INTERPRETED_REGEXP
-
void RegExpMacroAssemblerIrregexp::Emit(uint32_t byte,
uint32_t twenty_four_bits) {
uint32_t word = ((twenty_four_bits << BYTECODE_SHIFT) | byte);
@@ -54,9 +54,9 @@
pc_ += 4;
}
-#endif // V8_INTERPRETED_REGEXP
-
} // namespace internal
} // namespace v8
+#endif // V8_INTERPRETED_REGEXP
+
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_
diff --git a/src/regexp/regexp-macro-assembler-irregexp.cc b/src/regexp/regexp-macro-assembler-irregexp.cc
index 751ee44..a0bb5e7 100644
--- a/src/regexp/regexp-macro-assembler-irregexp.cc
+++ b/src/regexp/regexp-macro-assembler-irregexp.cc
@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#ifdef V8_INTERPRETED_REGEXP
+
#include "src/regexp/regexp-macro-assembler-irregexp.h"
#include "src/ast/ast.h"
@@ -9,12 +11,9 @@
#include "src/regexp/regexp-macro-assembler.h"
#include "src/regexp/regexp-macro-assembler-irregexp-inl.h"
-
namespace v8 {
namespace internal {
-#ifdef V8_INTERPRETED_REGEXP
-
RegExpMacroAssemblerIrregexp::RegExpMacroAssemblerIrregexp(Isolate* isolate,
Vector<byte> buffer,
Zone* zone)
@@ -382,11 +381,13 @@
void RegExpMacroAssemblerIrregexp::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_not_equal) {
+ int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
DCHECK(start_reg >= 0);
DCHECK(start_reg <= kMaxRegister);
- Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
- : BC_CHECK_NOT_BACK_REF_NO_CASE,
+ Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
+ : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
+ : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
+ : BC_CHECK_NOT_BACK_REF_NO_CASE),
start_reg);
EmitOrLink(on_not_equal);
}
@@ -454,7 +455,7 @@
}
}
-#endif // V8_INTERPRETED_REGEXP
-
} // namespace internal
} // namespace v8
+
+#endif // V8_INTERPRETED_REGEXP
diff --git a/src/regexp/regexp-macro-assembler-irregexp.h b/src/regexp/regexp-macro-assembler-irregexp.h
index f1ace63..dad2e9a 100644
--- a/src/regexp/regexp-macro-assembler-irregexp.h
+++ b/src/regexp/regexp-macro-assembler-irregexp.h
@@ -5,13 +5,13 @@
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_
+#ifdef V8_INTERPRETED_REGEXP
+
#include "src/regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
-#ifdef V8_INTERPRETED_REGEXP
-
// A light-weight assembler for the Irregexp byte code.
class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
public:
@@ -85,7 +85,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
@@ -125,9 +125,9 @@
DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpMacroAssemblerIrregexp);
};
-#endif // V8_INTERPRETED_REGEXP
-
} // namespace internal
} // namespace v8
+#endif // V8_INTERPRETED_REGEXP
+
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_
diff --git a/src/regexp/regexp-macro-assembler-tracer.cc b/src/regexp/regexp-macro-assembler-tracer.cc
index 5301ead..ec86526 100644
--- a/src/regexp/regexp-macro-assembler-tracer.cc
+++ b/src/regexp/regexp-macro-assembler-tracer.cc
@@ -360,11 +360,11 @@
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
- PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
start_reg, read_backward ? "backward" : "forward",
- LabelToInt(on_no_match));
- assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
+ unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
+ assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
on_no_match);
}
diff --git a/src/regexp/regexp-macro-assembler-tracer.h b/src/regexp/regexp-macro-assembler-tracer.h
index 77377aa..8a9ebe3 100644
--- a/src/regexp/regexp-macro-assembler-tracer.h
+++ b/src/regexp/regexp-macro-assembler-tracer.h
@@ -34,7 +34,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
diff --git a/src/regexp/regexp-macro-assembler.cc b/src/regexp/regexp-macro-assembler.cc
index caf8b51..9bb5073 100644
--- a/src/regexp/regexp-macro-assembler.cc
+++ b/src/regexp/regexp-macro-assembler.cc
@@ -9,6 +9,10 @@
#include "src/regexp/regexp-stack.h"
#include "src/simulator.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uchar.h"
+#endif // V8_I18N_SUPPORT
+
namespace v8 {
namespace internal {
@@ -23,6 +27,80 @@
}
+// Compares two two-byte (UC16) substrings case-insensitively.
+// Called from generated RegExp code.
+//
+//   byte_offset1  Address of the captured substring's start.
+//   byte_offset2  Address of the current character position.
+//   byte_length   Length of the capture in bytes (must be even).
+//   isolate       The isolate, or nullptr when the regexp has the unicode
+//                 flag. With V8_I18N_SUPPORT a null isolate selects
+//                 code-point-wise comparison using u_foldCase.
+//
+// Returns 1 if the substrings are equal ignoring case, 0 otherwise.
+//
+// This function is not allowed to cause a garbage collection.
+// A GC might move the calling generated code and invalidate the
+// return address on the stack.
+int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
+                                                     Address byte_offset2,
+                                                     size_t byte_length,
+                                                     Isolate* isolate) {
+  DCHECK(byte_length % 2 == 0);
+  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
+  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
+  size_t length = byte_length >> 1;
+
+#ifdef V8_I18N_SUPPORT
+  if (isolate == nullptr) {
+    for (size_t i = 0; i < length; i++) {
+      uc32 c1 = substring1[i];
+      uc32 c2 = substring2[i];
+      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
+        // Non-BMP characters do not have case-equivalents in the BMP.
+        // Both have to be non-BMP for them to be able to match.
+        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
+        if (i + 1 < length) {
+          uc16 c1t = substring1[i + 1];
+          uc16 c2t = substring2[i + 1];
+          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
+              unibrow::Utf16::IsTrailSurrogate(c2t)) {
+            // Compare the full code points and skip the trail units.
+            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
+            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
+            i++;
+          }
+        }
+      }
+      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
+      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
+      if (c1 != c2) return 0;
+    }
+    return 1;
+  }
+#endif  // V8_I18N_SUPPORT
+  DCHECK_NOT_NULL(isolate);
+  // Fetch the canonicalization mapping only now that the isolate is known to
+  // be non-null: unicode callers pass a null isolate, and calling a member
+  // function through it (even one that is never used) is undefined behaviour.
+  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+      isolate->regexp_macro_assembler_canonicalize();
+  for (size_t i = 0; i < length; i++) {
+    unibrow::uchar c1 = substring1[i];
+    unibrow::uchar c2 = substring2[i];
+    if (c1 != c2) {
+      // Canonicalize lazily: first c1 alone, then c2 only if still unequal.
+      unibrow::uchar s1[1] = {c1};
+      canonicalize->get(c1, '\0', s1);
+      if (s1[0] != c2) {
+        unibrow::uchar s2[1] = {c2};
+        canonicalize->get(c2, '\0', s2);
+        if (s1[0] != s2[0]) {
+          return 0;
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+
+// Checks that the position at |cp_offset| is not in the middle of a surrogate
+// pair, i.e. not a trail surrogate directly preceded by a lead surrogate.
+// Control goes to |on_failure| when it is; otherwise execution continues after
+// the emitted sequence.
+void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
+                                                   Label* on_failure) {
+  Label not_in_pair;
+  // The character at |cp_offset| has to be a trail surrogate to be mid-pair.
+  // NOTE(review): the label given to LoadCurrentCharacter is presumably taken
+  // when the load would run off the input - confirm.
+  LoadCurrentCharacter(cp_offset, &not_in_pair);
+  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd,
+                           &not_in_pair);
+  // The character just before it has to be a lead surrogate as well.
+  LoadCurrentCharacter(cp_offset - 1, &not_in_pair);
+  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
+  Bind(&not_in_pair);
+}
+
+
#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
@@ -245,40 +323,6 @@
};
-int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
- Address byte_offset1,
- Address byte_offset2,
- size_t byte_length,
- Isolate* isolate) {
- unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
- isolate->regexp_macro_assembler_canonicalize();
- // This function is not allowed to cause a garbage collection.
- // A GC might move the calling generated code and invalidate the
- // return address on the stack.
- DCHECK(byte_length % 2 == 0);
- uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
- uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
- size_t length = byte_length >> 1;
-
- for (size_t i = 0; i < length; i++) {
- unibrow::uchar c1 = substring1[i];
- unibrow::uchar c2 = substring2[i];
- if (c1 != c2) {
- unibrow::uchar s1[1] = { c1 };
- canonicalize->get(c1, '\0', s1);
- if (s1[0] != c2) {
- unibrow::uchar s2[1] = { c2 };
- canonicalize->get(c2, '\0', s2);
- if (s1[0] != s2[0]) {
- return 0;
- }
- }
- }
- }
- return 1;
-}
-
-
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
Address* stack_base,
Isolate* isolate) {
diff --git a/src/regexp/regexp-macro-assembler.h b/src/regexp/regexp-macro-assembler.h
index 2059933..6f79a16 100644
--- a/src/regexp/regexp-macro-assembler.h
+++ b/src/regexp/regexp-macro-assembler.h
@@ -11,6 +11,13 @@
namespace v8 {
namespace internal {
+static const uc32 kLeadSurrogateStart = 0xd800;  // First UTF-16 lead (high) surrogate code unit.
+static const uc32 kLeadSurrogateEnd = 0xdbff;  // Last UTF-16 lead surrogate (inclusive).
+static const uc32 kTrailSurrogateStart = 0xdc00;  // First UTF-16 trail (low) surrogate code unit.
+static const uc32 kTrailSurrogateEnd = 0xdfff;  // Last UTF-16 trail surrogate (inclusive).
+static const uc32 kNonBmpStart = 0x10000;  // First code point above the BMP.
+static const uc32 kNonBmpEnd = 0x10ffff;  // Largest Unicode code point (inclusive).
+
struct DisjunctDecisionRow {
RegExpCharacterClass cc;
Label* on_match;
@@ -76,7 +83,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
@@ -146,25 +153,40 @@
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0;
+ // Compares two-byte strings case insensitively.
+ // Called from generated RegExp code.
+ static int CaseInsensitiveCompareUC16(Address byte_offset1,
+ Address byte_offset2,
+ size_t byte_length, Isolate* isolate);
+
+ // Check that we are not in the middle of a surrogate pair.
+ void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
+
// Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; }
- enum GlobalMode { NOT_GLOBAL, GLOBAL, GLOBAL_NO_ZERO_LENGTH_CHECK };
+ enum GlobalMode {
+ NOT_GLOBAL,
+ GLOBAL_NO_ZERO_LENGTH_CHECK,
+ GLOBAL,
+ GLOBAL_UNICODE
+ };
// Set whether the regular expression has the global flag. Exiting due to
// a failure in a global regexp may still mean success overall.
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
inline bool global() { return global_mode_ != NOT_GLOBAL; }
inline bool global_with_zero_length_check() {
- return global_mode_ == GLOBAL;
+ return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
}
+ inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
Isolate* isolate() const { return isolate_; }
Zone* zone() const { return zone_; }
private:
bool slow_safe_compiler_;
- bool global_mode_;
+ GlobalMode global_mode_;
Isolate* isolate_;
Zone* zone_;
};
@@ -199,13 +221,6 @@
int previous_index,
Isolate* isolate);
- // Compares two-byte strings case insensitively.
- // Called from generated RegExp code.
- static int CaseInsensitiveCompareUC16(Address byte_offset1,
- Address byte_offset2,
- size_t byte_length,
- Isolate* isolate);
-
// Called from RegExp if the backtrack stack limit is hit.
// Tries to expand the stack. Returns the new stack-pointer if
// successful, and updates the stack_top address, or returns 0 if unable
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index fa89003..46c593c 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -8,27 +8,32 @@
#include "src/factory.h"
#include "src/isolate.h"
#include "src/objects-inl.h"
+#include "src/ostreams.h"
#include "src/regexp/jsregexp.h"
#include "src/utils.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#endif // V8_I18N_SUPPORT
+
namespace v8 {
namespace internal {
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- bool multiline, bool unicode, Isolate* isolate,
- Zone* zone)
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
: isolate_(isolate),
zone_(zone),
error_(error),
captures_(NULL),
in_(in),
current_(kEndMarker),
+ ignore_case_(flags & JSRegExp::kIgnoreCase),
+ multiline_(flags & JSRegExp::kMultiline),
+ unicode_(flags & JSRegExp::kUnicode),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
- multiline_(multiline),
- unicode_(unicode),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
@@ -36,10 +41,28 @@
Advance();
}
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
+ int position = next_pos_;
+ uc32 c0 = in()->Get(position);
+ position++;
+ // Read the whole surrogate pair in case of unicode flag, if possible.
+ if (unicode() && position < in()->length() &&
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+ uc16 c1 = in()->Get(position);
+ if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+ position++;
+ }
+ }
+ if (update_position) next_pos_ = position;
+ return c0;
+}
+
uc32 RegExpParser::Next() {
if (has_next()) {
- return in()->Get(next_pos_);
+ return ReadNext<false>();
} else {
return kEndMarker;
}
@@ -47,25 +70,14 @@
void RegExpParser::Advance() {
- if (next_pos_ < in()->length()) {
+ if (has_next()) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
ReportError(CStrVector(Isolate::kStackOverflowMessage));
} else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large"));
} else {
- current_ = in()->Get(next_pos_);
- next_pos_++;
- // Read the whole surrogate pair in case of unicode flag, if possible.
- if (unicode_ && next_pos_ < in()->length() &&
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
- uc16 trail = in()->Get(next_pos_);
- if (unibrow::Utf16::IsTrailSurrogate(trail)) {
- current_ = unibrow::Utf16::CombineSurrogatePair(
- static_cast<uc16>(current_), trail);
- next_pos_++;
- }
- }
+ current_ = ReadNext<true>();
}
} else {
current_ = kEndMarker;
@@ -92,11 +104,28 @@
bool RegExpParser::simple() { return simple_; }
-
-bool RegExpParser::IsSyntaxCharacter(uc32 c) {
- return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
- c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
- c == '{' || c == '}' || c == '|';
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+ switch (c) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '/':
+ return true;
+ default:
+ break;
+ }
+ return false;
}
@@ -142,7 +171,7 @@
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
- zone());
+ ignore_case(), unicode(), zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@@ -151,14 +180,14 @@
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
- ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+ return ReportError(CStrVector("Unterminated group"));
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
- ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
+ return ReportError(CStrVector("Unmatched ')'"));
}
DCHECK_NE(INITIAL, state->group_type());
@@ -206,7 +235,7 @@
return ReportError(CStrVector("Nothing to repeat"));
case '^': {
Advance();
- if (multiline_) {
+ if (multiline()) {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
@@ -219,8 +248,8 @@
case '$': {
Advance();
RegExpAssertion::AssertionType assertion_type =
- multiline_ ? RegExpAssertion::END_OF_LINE
- : RegExpAssertion::END_OF_INPUT;
+ multiline() ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
continue;
}
@@ -230,8 +259,9 @@
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
break;
}
case '(': {
@@ -265,25 +295,25 @@
}
// Fall through.
default:
- ReportError(CStrVector("Invalid group") CHECK_FAILED);
- break;
+ return ReportError(CStrVector("Invalid group"));
}
Advance(2);
} else {
if (captures_started_ >= kMaxCaptures) {
- ReportError(CStrVector("Too many captures") CHECK_FAILED);
+ return ReportError(CStrVector("Too many captures"));
}
captures_started_++;
}
// Store current state and begin new disjunction parsing.
state = new (zone()) RegExpParserState(
- state, subexpr_type, lookaround_type, captures_started_, zone());
+ state, subexpr_type, lookaround_type, captures_started_,
+ ignore_case(), unicode(), zone());
builder = state->builder();
continue;
}
case '[': {
- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
- builder->AddAtom(atom);
+ RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
+ builder->AddCharacterClass(cc->AsCharacterClass());
break;
}
// Atom ::
@@ -318,8 +348,30 @@
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ case 'p':
+ case 'P': {
+ uc32 p = Next();
+ Advance(2);
+ if (unicode()) {
+ if (FLAG_harmony_regexp_property) {
+ ZoneList<CharacterRange>* ranges = ParsePropertyClass();
+ if (ranges == nullptr) {
+ return ReportError(CStrVector("Invalid property name"));
+ }
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, p == 'P');
+ builder->AddCharacterClass(cc);
+ } else {
+ return ReportError(CStrVector("Invalid escape"));
+ }
+ } else {
+ builder->AddCharacter(p);
+ }
break;
}
case '1':
@@ -332,7 +384,8 @@
case '8':
case '9': {
int index = 0;
- if (ParseBackReferenceIndex(&index)) {
+ bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+ if (is_backref) {
if (state->IsInsideCaptureGroup(index)) {
// The back reference is inside the capture group it refers to.
// Nothing can possibly have been captured yet, so we use empty
@@ -347,24 +400,25 @@
}
break;
}
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (unicode()) {
+ return ReportError(CStrVector("Invalid escape"));
+ }
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
- // If the 'u' flag is present, only syntax characters can be
- // escaped,
- // no other identity escapes are allowed. If the 'u' flag is not
- // present, all identity escapes are allowed.
- if (!unicode_) {
- builder->AddCharacter(first_digit);
- Advance(2);
- } else {
- return ReportError(CStrVector("Invalid escape"));
- }
+ builder->AddCharacter(first_digit);
+ Advance(2);
break;
}
}
// FALLTHROUGH
case '0': {
Advance();
+ if (unicode() && Next() >= '0' && Next() <= '9') {
+ // With /u, decimal escapes with a leading 0 are not parsed as octal.
+ return ReportError(CStrVector("Invalid decimal escape"));
+ }
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
break;
@@ -402,6 +456,10 @@
// This is outside the specification. We match JSC in
// reading the backslash as a literal character instead
// of as starting an escape.
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ return ReportError(CStrVector("Invalid unicode escape"));
+ }
builder->AddCharacter('\\');
} else {
Advance(2);
@@ -414,11 +472,10 @@
uc32 value;
if (ParseHexEscape(2, &value)) {
builder->AddCharacter(value);
- } else if (!unicode_) {
+ } else if (!unicode()) {
builder->AddCharacter('x');
} else {
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
+ // With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid escape"));
}
break;
@@ -427,24 +484,20 @@
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
- builder->AddUnicodeCharacter(value);
- } else if (!unicode_) {
+ builder->AddEscapedUnicodeCharacter(value);
+ } else if (!unicode()) {
builder->AddCharacter('u');
} else {
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
+ // With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
break;
}
default:
Advance();
- // If the 'u' flag is present, only syntax characters can be
- // escaped, no
- // other identity escapes are allowed. If the 'u' flag is not
- // present,
- // all identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(current())) {
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
builder->AddCharacter(current());
Advance();
} else {
@@ -456,10 +509,16 @@
case '{': {
int dummy;
if (ParseIntervalQuantifier(&dummy, &dummy)) {
- ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
+ return ReportError(CStrVector("Nothing to repeat"));
}
// fallthrough
}
+ case '}':
+ case ']':
+ if (unicode()) {
+ return ReportError(CStrVector("Lone quantifier brackets"));
+ }
+ // fallthrough
default:
builder->AddUnicodeCharacter(current());
Advance();
@@ -492,13 +551,15 @@
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
- ReportError(CStrVector("numbers out of order in {} quantifier.")
- CHECK_FAILED);
+ return ReportError(
+ CStrVector("numbers out of order in {} quantifier"));
}
break;
- } else {
- continue;
+ } else if (unicode()) {
+ // With /u, incomplete quantifiers are not allowed.
+ return ReportError(CStrVector("Incomplete quantifier"));
}
+ continue;
default:
continue;
}
@@ -511,7 +572,9 @@
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
- builder->AddQuantifierToAtom(min, max, quantifier_type);
+ if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+ return ReportError(CStrVector("Invalid quantifier"));
+ }
}
}
@@ -740,12 +803,12 @@
return true;
}
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
// arbitrary. \ and u have already been read.
- if (current() == '{' && unicode_) {
+ if (current() == '{' && unicode()) {
int start = position();
Advance();
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -758,9 +821,75 @@
return false;
}
// \u but no {, or \u{...} escapes not allowed.
- return ParseHexEscape(4, value);
+ bool result = ParseHexEscape(4, value);
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+ current() == '\\') {
+ // Attempt to read trail surrogate.
+ int start = position();
+ if (Next() == 'u') {
+ Advance(2);
+ uc32 trail;
+ if (ParseHexEscape(4, &trail) &&
+ unibrow::Utf16::IsTrailSurrogate(trail)) {
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+ static_cast<uc16>(trail));
+ return true;
+ }
+ }
+ Reset(start);
+ }
+ return result;
}
+ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
+#ifdef V8_I18N_SUPPORT
+ char property_name[3];
+ memset(property_name, 0, sizeof(property_name));
+ if (current() == '{') {
+ Advance();
+ if (current() < 'A' || current() > 'Z') return nullptr;
+ property_name[0] = static_cast<char>(current());
+ Advance();
+ if (current() >= 'a' && current() <= 'z') {
+ property_name[1] = static_cast<char>(current());
+ Advance();
+ }
+ if (current() != '}') return nullptr;
+ } else if (current() >= 'A' && current() <= 'Z') {
+ property_name[0] = static_cast<char>(current());
+ } else {
+ return nullptr;
+ }
+ Advance();
+
+ int32_t category =
+ u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
+ if (category == UCHAR_INVALID_CODE) return nullptr;
+
+ USet* set = uset_openEmpty();
+ UErrorCode ec = U_ZERO_ERROR;
+ uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
+ ZoneList<CharacterRange>* ranges = nullptr;
+ if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
+ uset_removeAllStrings(set);
+ int item_count = uset_getItemCount(set);
+ ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
+ int item_result = 0;
+ for (int i = 0; i < item_count; i++) {
+ uc32 start = 0;
+ uc32 end = 0;
+ item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+ ranges->Add(CharacterRange::Range(start, end), zone());
+ }
+ DCHECK_EQ(U_ZERO_ERROR, ec);
+ DCHECK_EQ(0, item_result);
+ }
+ uset_close(set);
+ return ranges;
+#else // V8_I18N_SUPPORT
+ return nullptr;
+#endif // V8_I18N_SUPPORT
+}
bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
uc32 x = 0;
@@ -809,20 +938,35 @@
case 'c': {
uc32 controlLetter = Next();
uc32 letter = controlLetter & ~('A' ^ 'a');
- // For compatibility with JSC, inside a character class
- // we also accept digits and underscore as control characters.
- if ((controlLetter >= '0' && controlLetter <= '9') ||
- controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
+ // For compatibility with JSC, inside a character class we also accept
+ // digits and underscore as control characters, unless with /u.
+ if (letter >= 'A' && letter <= 'Z') {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1f.
return controlLetter & 0x1f;
}
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_') {
+ Advance(2);
+ return controlLetter & 0x1f;
+ }
// We match JSC in reading the backslash as a literal
// character instead of as starting an escape.
return '\\';
}
case '0':
+ // With /u, \0 is interpreted as NUL if not followed by another digit.
+ if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+ Advance();
+ return 0;
+ }
+ // Fall through.
case '1':
case '2':
case '3':
@@ -833,43 +977,43 @@
// For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code.
+ if (unicode()) {
+ // With /u, decimal escape is not interpreted as octal character code.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
return ParseOctalLiteral();
case 'x': {
Advance();
uc32 value;
- if (ParseHexEscape(2, &value)) {
- return value;
+ if (ParseHexEscape(2, &value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
}
- if (!unicode_) {
- // If \x is not followed by a two-digit hexadecimal, treat it
- // as an identity escape.
- return 'x';
- }
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
- ReportError(CStrVector("Invalid escape"));
- return 0;
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
}
case 'u': {
Advance();
uc32 value;
- if (ParseUnicodeEscape(&value)) {
- return value;
+ if (ParseUnicodeEscape(&value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid unicode escape"));
+ return 0;
}
- if (!unicode_) {
- return 'u';
- }
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
- ReportError(CStrVector("Invalid unicode escape"));
- return 0;
+ // If \u is not followed by a four-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'u';
}
default: {
uc32 result = current();
- // If the 'u' flag is present, only syntax characters can be escaped, no
- // other identity escapes are allowed. If the 'u' flag is not present, all
- // identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(result)) {
+ // With /u, no identity escapes except for syntax characters and '-' are
+ // allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance();
return result;
}
@@ -899,13 +1043,13 @@
case kEndMarker:
return ReportError(CStrVector("\\ at end of pattern"));
default:
- uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
- return CharacterRange::Singleton(c);
+ first = ParseClassCharacterEscape(CHECK_FAILED);
}
} else {
Advance();
- return CharacterRange::Singleton(first);
}
+
+ return CharacterRange::Singleton(first);
}
@@ -927,6 +1071,7 @@
RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class";
+ static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
DCHECK_EQ(current(), '[');
@@ -956,13 +1101,18 @@
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
// Either end is an escaped character class. Treat the '-' verbatim.
+ if (unicode()) {
+ // ES2015 21.2.2.15.1 step 1.
+ return ReportError(CStrVector(kRangeInvalid));
+ }
AddRangeOrEscape(ranges, char_class, first, zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone());
continue;
}
+ // ES2015 21.2.2.15.1 step 6.
if (first.from() > next.to()) {
- return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
+ return ReportError(CStrVector(kRangeOutOfOrder));
}
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else {
@@ -970,7 +1120,7 @@
}
}
if (!has_more()) {
- return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
+ return ReportError(CStrVector(kUnterminated));
}
Advance();
if (ranges->length() == 0) {
@@ -985,10 +1135,10 @@
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
- FlatStringReader* input, bool multiline,
- bool unicode, RegExpCompileData* result) {
+ FlatStringReader* input, JSRegExp::Flags flags,
+ RegExpCompileData* result) {
DCHECK(result != NULL);
- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
+ RegExpParser parser(input, &result->error, flags, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == NULL);
@@ -1010,11 +1160,13 @@
return !parser.failed();
}
-
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
: zone_(zone),
pending_empty_(false),
+ ignore_case_(ignore_case),
+ unicode_(unicode),
characters_(NULL),
+ pending_surrogate_(kNoPendingSurrogate),
terms_(),
alternatives_()
#ifdef DEBUG
@@ -1025,7 +1177,51 @@
}
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ FlushPendingSurrogate();
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+ pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ uc16 lead_surrogate = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ uc32 combined =
+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+ if (NeedsDesugaringForIgnoreCase(combined)) {
+ AddCharacterClassForDesugaring(combined);
+ } else {
+ ZoneList<uc16> surrogate_pair(2, zone());
+ surrogate_pair.Add(lead_surrogate, zone());
+ surrogate_pair.Add(trail_surrogate, zone());
+ RegExpAtom* atom =
+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+ AddAtom(atom);
+ }
+ } else {
+ pending_surrogate_ = trail_surrogate;
+ FlushPendingSurrogate();
+ }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ DCHECK(unicode());
+ uc32 c = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ AddCharacterClassForDesugaring(c);
+ }
+}
+
+
void RegExpBuilder::FlushCharacters() {
+ FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1053,31 +1249,61 @@
void RegExpBuilder::AddCharacter(uc16 c) {
+ FlushPendingSurrogate();
pending_empty_ = false;
- if (characters_ == NULL) {
- characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ if (NeedsDesugaringForIgnoreCase(c)) {
+ AddCharacterClassForDesugaring(c);
+ } else {
+ if (characters_ == NULL) {
+ characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ }
+ characters_->Add(c, zone());
+ LAST(ADD_CHAR);
}
- characters_->Add(c, zone());
- LAST(ADD_CHAR);
}
void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
- ZoneList<uc16> surrogate_pair(2, zone());
- surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
- surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
- AddAtom(atom);
+ DCHECK(unicode());
+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+ AddLeadSurrogate(c);
+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+ AddTrailSurrogate(c);
} else {
AddCharacter(static_cast<uc16>(c));
}
}
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+ // A lead or trail surrogate parsed via escape sequence will not
+ // pair up with any preceding lead or following trail surrogate.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
+ if (NeedsDesugaringForUnicode(cc)) {
+ // With /u, character class needs to be desugared, so it
+ // must be a standalone term instead of being part of a RegExpText.
+ AddTerm(cc);
+ } else {
+ AddAtom(cc);
+ }
+}
+
+void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
+ AddTerm(new (zone()) RegExpCharacterClass(
+ CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
@@ -1094,6 +1320,13 @@
}
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+ FlushText();
+ terms_.Add(term, zone());
+ LAST(ADD_ATOM);
+}
+
+
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
FlushText();
terms_.Add(assert, zone());
@@ -1121,6 +1354,47 @@
}
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+ if (!unicode()) return false;
+ switch (cc->standard_type()) {
+ case 's': // white space
+ case 'w': // ASCII word character
+ case 'd': // ASCII digit
+ return false; // These characters do not need desugaring.
+ default:
+ break;
+ }
+ ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+ CharacterRange::Canonicalize(ranges);
+ for (int i = ranges->length() - 1; i >= 0; i--) {
+ uc32 from = ranges->at(i).from();
+ uc32 to = ranges->at(i).to();
+ // Check for non-BMP characters.
+ if (to >= kNonBmpStart) return true;
+ // Check for lone surrogates.
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+ }
+ return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_I18N_SUPPORT
+ if (unicode() && ignore_case()) {
+ USet* set = uset_open(c, c);
+ uset_closeOver(set, USET_CASE_INSENSITIVE);
+ uset_removeAllStrings(set);
+ bool result = uset_size(set) > 1;
+ uset_close(set);
+ return result;
+ }
+ // In the case where ICU is not included, we act as if the unicode flag is
+ // not set, and do not desugar.
+#endif // V8_I18N_SUPPORT
+ return false;
+}
+
+
RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms();
int num_alternatives = alternatives_.length();
@@ -1129,12 +1403,12 @@
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
}
-
-void RegExpBuilder::AddQuantifierToAtom(
+bool RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+ FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
- return;
+ return true;
}
RegExpTree* atom;
if (characters_ != NULL) {
@@ -1157,23 +1431,26 @@
} else if (terms_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
+ // With /u, lookarounds are not quantifiable.
+ if (unicode() && atom->IsLookaround()) return false;
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
- return;
+ return true;
}
terms_.Add(atom, zone());
- return;
+ return true;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
- return;
+ return false;
}
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
zone());
LAST(ADD_TERM);
+ return true;
}
} // namespace internal
diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h
index af9b765..acf783c 100644
--- a/src/regexp/regexp-parser.h
+++ b/src/regexp/regexp-parser.h
@@ -99,28 +99,43 @@
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
public:
- explicit RegExpBuilder(Zone* zone);
+ RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
+ void AddEscapedUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
+ void AddCharacterClass(RegExpCharacterClass* cc);
+ void AddCharacterClassForDesugaring(uc32 c);
void AddAtom(RegExpTree* tree);
+ void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
- void AddQuantifierToAtom(int min, int max,
+ bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
RegExpTree* ToRegExp();
private:
+ static const uc16 kNoPendingSurrogate = 0;
+ void AddLeadSurrogate(uc16 lead_surrogate);
+ void AddTrailSurrogate(uc16 trail_surrogate);
+ void FlushPendingSurrogate();
void FlushCharacters();
void FlushText();
void FlushTerms();
+ bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
+ bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; }
+ bool ignore_case() const { return ignore_case_; }
+ bool unicode() const { return unicode_; }
Zone* zone_;
bool pending_empty_;
+ bool ignore_case_;
+ bool unicode_;
ZoneList<uc16>* characters_;
+ uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
@@ -135,12 +150,11 @@
class RegExpParser BASE_EMBEDDED {
public:
- RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
- bool unicode, Isolate* isolate, Zone* zone);
+ RegExpParser(FlatStringReader* in, Handle<String>* error,
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
- bool multiline, bool unicode,
- RegExpCompileData* result);
+ JSRegExp::Flags flags, RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
@@ -160,6 +174,7 @@
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
+ ZoneList<CharacterRange>* ParsePropertyClass();
uc32 ParseOctalLiteral();
@@ -183,8 +198,11 @@
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
+ bool ignore_case() const { return ignore_case_; }
+ bool multiline() const { return multiline_; }
+ bool unicode() const { return unicode_; }
- static bool IsSyntaxCharacter(uc32 c);
+ static bool IsSyntaxCharacterOrSlash(uc32 c);
static const int kMaxCaptures = 1 << 16;
static const uc32 kEndMarker = (1 << 21);
@@ -203,9 +221,10 @@
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
- int disjunction_capture_index, Zone* zone)
+ int disjunction_capture_index, bool ignore_case,
+ bool unicode, Zone* zone)
: previous_state_(previous_state),
- builder_(new (zone) RegExpBuilder(zone)),
+ builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
@@ -249,6 +268,8 @@
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
+ template <bool update_position>
+ uc32 ReadNext();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
@@ -258,13 +279,14 @@
ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_;
uc32 current_;
+ bool ignore_case_;
+ bool multiline_;
+ bool unicode_;
int next_pos_;
int captures_started_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
bool has_more_;
- bool multiline_;
- bool unicode_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
diff --git a/src/regexp/x64/regexp-macro-assembler-x64.cc b/src/regexp/x64/regexp-macro-assembler-x64.cc
index 286f159..952034f 100644
--- a/src/regexp/x64/regexp-macro-assembler-x64.cc
+++ b/src/regexp/x64/regexp-macro-assembler-x64.cc
@@ -203,7 +203,7 @@
void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture
ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture
@@ -308,8 +308,10 @@
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
- // Isolate* isolate
+// Isolate* isolate or 0 if unicode flag.
#ifdef _WIN64
+ DCHECK(rcx.is(arg_reg_1));
+ DCHECK(rdx.is(arg_reg_2));
// Compute and set byte_offset1 (start of capture).
__ leap(rcx, Operand(rsi, rdx, times_1, 0));
// Set byte_offset2.
@@ -317,11 +319,9 @@
if (read_backward) {
__ subq(rdx, rbx);
}
- // Set byte_length.
- __ movp(r8, rbx);
- // Isolate.
- __ LoadAddress(r9, ExternalReference::isolate_address(isolate()));
#else // AMD64 calling convention
+ DCHECK(rdi.is(arg_reg_1));
+ DCHECK(rsi.is(arg_reg_2));
// Compute byte_offset2 (current position = rsi+rdi).
__ leap(rax, Operand(rsi, rdi, times_1, 0));
// Compute and set byte_offset1 (start of capture).
@@ -331,11 +331,19 @@
if (read_backward) {
__ subq(rsi, rbx);
}
+#endif // _WIN64
+
// Set byte_length.
- __ movp(rdx, rbx);
+ __ movp(arg_reg_3, rbx);
// Isolate.
- __ LoadAddress(rcx, ExternalReference::isolate_address(isolate()));
-#endif
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ movp(arg_reg_4, Immediate(0));
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
+ }
{ // NOLINT: Can't find a way to open this scope without confusing the
// linter.
@@ -869,11 +877,14 @@
__ testp(rdi, rdi);
__ j(zero, &exit_label_, Label::kNear);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
if (mode_ == UC16) {
__ addq(rdi, Immediate(2));
} else {
__ incq(rdi);
}
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
__ jmp(&load_char_start_regexp);
diff --git a/src/regexp/x64/regexp-macro-assembler-x64.h b/src/regexp/x64/regexp-macro-assembler-x64.h
index 2578047..4c37771 100644
--- a/src/regexp/x64/regexp-macro-assembler-x64.h
+++ b/src/regexp/x64/regexp-macro-assembler-x64.h
@@ -38,7 +38,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/src/regexp/x87/regexp-macro-assembler-x87.cc b/src/regexp/x87/regexp-macro-assembler-x87.cc
index 01d0b24..6e62092 100644
--- a/src/regexp/x87/regexp-macro-assembler-x87.cc
+++ b/src/regexp/x87/regexp-macro-assembler-x87.cc
@@ -187,9 +187,8 @@
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerX87::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, Label* on_no_match) {
+ int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ mov(edx, register_location(start_reg)); // Index of start of capture
__ mov(ebx, register_location(start_reg + 1)); // Index of end of capture
@@ -296,11 +295,18 @@
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
- // Isolate* isolate
+// Isolate* isolate or 0 if unicode flag.
// Set isolate.
- __ mov(Operand(esp, 3 * kPointerSize),
- Immediate(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+ if (unicode) {
+ __ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
+ } else // NOLINT
+#endif // V8_I18N_SUPPORT
+ {
+ __ mov(Operand(esp, 3 * kPointerSize),
+ Immediate(ExternalReference::isolate_address(isolate())));
+ }
// Set byte_length.
__ mov(Operand(esp, 2 * kPointerSize), ebx);
// Set byte_offset2.
@@ -822,13 +828,15 @@
__ test(edi, edi);
__ j(zero, &exit_label_, Label::kNear);
// Advance current position after a zero-length match.
+ Label advance;
+ __ bind(&advance);
if (mode_ == UC16) {
__ add(edi, Immediate(2));
} else {
__ inc(edi);
}
+ if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
}
-
__ jmp(&load_char_start_regexp);
} else {
__ mov(eax, Immediate(SUCCESS));
diff --git a/src/regexp/x87/regexp-macro-assembler-x87.h b/src/regexp/x87/regexp-macro-assembler-x87.h
index c955412..2f68961 100644
--- a/src/regexp/x87/regexp-macro-assembler-x87.h
+++ b/src/regexp/x87/regexp-macro-assembler-x87.h
@@ -37,7 +37,7 @@
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward,
+ bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,