Revert "Revert "Upgrade to 5.0.71.48"" DO NOT MERGE This reverts commit f2e3994fa5148cc3d9946666f0b0596290192b0e, and updates the x64 makefile properly so it doesn't break that build. FPIIM-449 Change-Id: Ib83e35bfbae6af627451c926a9650ec57c045605 (cherry picked from commit 109988c7ccb6f3fd1a58574fa3dfb88beaef6632)

commit: 097c5b25f8b57b92e6c94f25383a883ff242344f [log] [tgz]
author: Ben Murdoch <benm@google.com> Wed May 18 11:27:45 2016 +0100
committer: Dirk Vogt <dirk@fairphone.com> Fri Mar 17 16:05:59 2017 +0100
tree: 79bf5b2d1246e60291de69b04c80eef46ff43ee1
parent: 3942d2ab9eee35cb86018242521effc970f4e902 [diff]
diff --git a/src/regexp/arm/regexp-macro-assembler-arm.cc b/src/regexp/arm/regexp-macro-assembler-arm.cc
index 6fafdfb..ce72188 100644
--- a/src/regexp/arm/regexp-macro-assembler-arm.cc
+++ b/src/regexp/arm/regexp-macro-assembler-arm.cc

@@ -210,7 +210,7 @@
 
 
 void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ ldr(r0, register_location(start_reg));  // Index of start of capture
   __ ldr(r1, register_location(start_reg + 1));  // Index of end of capture
@@ -302,7 +302,7 @@
     //   r0: Address byte_offset1 - Address captured substring's start.
     //   r1: Address byte_offset2 - Address of current character position.
     //   r2: size_t byte_length - length of capture in bytes(!)
-    //   r3: Isolate* isolate
+    //   r3: Isolate* isolate or 0 if unicode flag.
 
     // Address of start of capture.
     __ add(r0, r0, Operand(end_of_input_address()));
@@ -316,7 +316,14 @@
       __ sub(r1, r1, r4);
     }
     // Isolate.
-    __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ mov(r3, Operand(0));
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
+    }
 
     {
       AllowExternalCallThatCantCauseGC scope(masm_);
@@ -798,9 +805,12 @@
         __ cmp(current_input_offset(), Operand::Zero());
         __ b(eq, &exit_label_);
         // Advance current position after a zero-length match.
+        Label advance;
+        __ bind(&advance);
         __ add(current_input_offset(),
                current_input_offset(),
                Operand((mode_ == UC16) ? 2 : 1));
+        if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
       }
 
       __ b(&load_char_start_regexp);

diff --git a/src/regexp/arm/regexp-macro-assembler-arm.h b/src/regexp/arm/regexp-macro-assembler-arm.h
index 233a98f..f808538 100644
--- a/src/regexp/arm/regexp-macro-assembler-arm.h
+++ b/src/regexp/arm/regexp-macro-assembler-arm.h

@@ -38,7 +38,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(unsigned c,

diff --git a/src/regexp/arm64/regexp-macro-assembler-arm64.cc b/src/regexp/arm64/regexp-macro-assembler-arm64.cc
index 9948597..941ccea 100644
--- a/src/regexp/arm64/regexp-macro-assembler-arm64.cc
+++ b/src/regexp/arm64/regexp-macro-assembler-arm64.cc

@@ -274,7 +274,7 @@
 
 
 void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
 
   Register capture_start_offset = w10;
@@ -388,7 +388,7 @@
     //   x0: Address byte_offset1 - Address captured substring's start.
     //   x1: Address byte_offset2 - Address of current character position.
     //   w2: size_t byte_length - length of capture in bytes(!)
-    //   x3: Isolate* isolate
+    //   x3: Isolate* isolate or 0 if unicode flag
 
     // Address of start of capture.
     __ Add(x0, input_end(), Operand(capture_start_offset, SXTW));
@@ -400,7 +400,14 @@
       __ Sub(x1, x1, Operand(capture_length, SXTW));
     }
     // Isolate.
-    __ Mov(x3, ExternalReference::isolate_address(isolate()));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ Mov(x3, Operand(0));
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ Mov(x3, ExternalReference::isolate_address(isolate()));
+    }
 
     {
       AllowExternalCallThatCantCauseGC scope(masm_);
@@ -991,9 +998,12 @@
         // Offset from the end is zero if we already reached the end.
         __ Cbz(current_input_offset(), &return_w0);
         // Advance current position after a zero-length match.
+        Label advance;
+        __ bind(&advance);
         __ Add(current_input_offset(),
                current_input_offset(),
                Operand((mode_ == UC16) ? 2 : 1));
+        if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
       }
 
       __ B(&load_char_start_regexp);

diff --git a/src/regexp/arm64/regexp-macro-assembler-arm64.h b/src/regexp/arm64/regexp-macro-assembler-arm64.h
index d71f063..69624f6 100644
--- a/src/regexp/arm64/regexp-macro-assembler-arm64.h
+++ b/src/regexp/arm64/regexp-macro-assembler-arm64.h

@@ -43,7 +43,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(unsigned c,

diff --git a/src/regexp/bytecodes-irregexp.h b/src/regexp/bytecodes-irregexp.h
index 2dbfbc0..3848f15 100644
--- a/src/regexp/bytecodes-irregexp.h
+++ b/src/regexp/bytecodes-irregexp.h

@@ -6,6 +6,8 @@
 #ifndef V8_REGEXP_BYTECODES_IRREGEXP_H_
 #define V8_REGEXP_BYTECODES_IRREGEXP_H_
 
+#ifdef V8_INTERPRETED_REGEXP
+
 namespace v8 {
 namespace internal {
 
@@ -18,56 +20,58 @@
 const int BYTECODE_SHIFT = 8;
 
 #define BYTECODE_ITERATOR(V)                                                   \
-V(BREAK,              0, 4)   /* bc8                                        */ \
-V(PUSH_CP,            1, 4)   /* bc8 pad24                                  */ \
-V(PUSH_BT,            2, 8)   /* bc8 pad24 offset32                         */ \
-V(PUSH_REGISTER,      3, 4)   /* bc8 reg_idx24                              */ \
-V(SET_REGISTER_TO_CP, 4, 8)   /* bc8 reg_idx24 offset32                     */ \
-V(SET_CP_TO_REGISTER, 5, 4)   /* bc8 reg_idx24                              */ \
-V(SET_REGISTER_TO_SP, 6, 4)   /* bc8 reg_idx24                              */ \
-V(SET_SP_TO_REGISTER, 7, 4)   /* bc8 reg_idx24                              */ \
-V(SET_REGISTER,       8, 8)   /* bc8 reg_idx24 value32                      */ \
-V(ADVANCE_REGISTER,   9, 8)   /* bc8 reg_idx24 value32                      */ \
-V(POP_CP,            10, 4)   /* bc8 pad24                                  */ \
-V(POP_BT,            11, 4)   /* bc8 pad24                                  */ \
-V(POP_REGISTER,      12, 4)   /* bc8 reg_idx24                              */ \
-V(FAIL,              13, 4)   /* bc8 pad24                                  */ \
-V(SUCCEED,           14, 4)   /* bc8 pad24                                  */ \
-V(ADVANCE_CP,        15, 4)   /* bc8 offset24                               */ \
-V(GOTO,              16, 8)   /* bc8 pad24 addr32                           */ \
-V(LOAD_CURRENT_CHAR, 17, 8)   /* bc8 offset24 addr32                        */ \
-V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24                       */ \
-V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32                       */ \
-V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24                    */ \
-V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32                       */ \
-V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24                    */ \
-V(CHECK_4_CHARS,     23, 12)  /* bc8 pad24 uint32 addr32                    */ \
-V(CHECK_CHAR,        24, 8)   /* bc8 pad8 uint16 addr32                     */ \
-V(CHECK_NOT_4_CHARS, 25, 12)  /* bc8 pad24 uint32 addr32                    */ \
-V(CHECK_NOT_CHAR,    26, 8)   /* bc8 pad8 uint16 addr32                     */ \
-V(AND_CHECK_4_CHARS, 27, 16)  /* bc8 pad24 uint32 uint32 addr32             */ \
-V(AND_CHECK_CHAR,    28, 12)  /* bc8 pad8 uint16 uint32 addr32              */ \
-V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32          */ \
-V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32              */ \
-V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32       */ \
-V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32                */ \
-V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32            */ \
-V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128                   */ \
-V(CHECK_LT,          35, 8)   /* bc8 pad8 uc16 addr32                       */ \
-V(CHECK_GT,          36, 8)   /* bc8 pad8 uc16 addr32                       */ \
-V(CHECK_NOT_BACK_REF, 37, 8)  /* bc8 reg_idx24 addr32                       */ \
-V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32                */ \
-V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8)         /* bc8 reg_idx24 addr32       */ \
-V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32       */ \
-V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32            */ \
-V(CHECK_REGISTER_LT, 42, 12)  /* bc8 reg_idx24 value32 addr32               */ \
-V(CHECK_REGISTER_GE, 43, 12)  /* bc8 reg_idx24 value32 addr32               */ \
-V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32                     */ \
-V(CHECK_AT_START,    45, 8)   /* bc8 pad24 addr32                           */ \
-V(CHECK_NOT_AT_START, 46, 8)  /* bc8 offset24 addr32                        */ \
-V(CHECK_GREEDY,      47, 8)   /* bc8 pad24 addr32                           */ \
-V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32                        */ \
-V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24                        */
+  V(BREAK, 0, 4)              /* bc8                                        */ \
+  V(PUSH_CP, 1, 4)            /* bc8 pad24                                  */ \
+  V(PUSH_BT, 2, 8)            /* bc8 pad24 offset32                         */ \
+  V(PUSH_REGISTER, 3, 4)      /* bc8 reg_idx24                              */ \
+  V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32                     */ \
+  V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24                              */ \
+  V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24                              */ \
+  V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24                              */ \
+  V(SET_REGISTER, 8, 8)       /* bc8 reg_idx24 value32                      */ \
+  V(ADVANCE_REGISTER, 9, 8)   /* bc8 reg_idx24 value32                      */ \
+  V(POP_CP, 10, 4)            /* bc8 pad24                                  */ \
+  V(POP_BT, 11, 4)            /* bc8 pad24                                  */ \
+  V(POP_REGISTER, 12, 4)      /* bc8 reg_idx24                              */ \
+  V(FAIL, 13, 4)              /* bc8 pad24                                  */ \
+  V(SUCCEED, 14, 4)           /* bc8 pad24                                  */ \
+  V(ADVANCE_CP, 15, 4)        /* bc8 offset24                               */ \
+  V(GOTO, 16, 8)              /* bc8 pad24 addr32                           */ \
+  V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32                        */ \
+  V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4)    /* bc8 offset24 */                  \
+  V(LOAD_2_CURRENT_CHARS, 19, 8)           /* bc8 offset24 addr32 */           \
+  V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */                  \
+  V(LOAD_4_CURRENT_CHARS, 21, 8)           /* bc8 offset24 addr32 */           \
+  V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */                  \
+  V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32                    */    \
+  V(CHECK_CHAR, 24, 8)     /* bc8 pad8 uint16 addr32                     */    \
+  V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */                   \
+  V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32                     */    \
+  V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */            \
+  V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32              */   \
+  V(AND_CHECK_NOT_4_CHARS, 29, 16)    /* bc8 pad24 uint32 uint32 addr32 */     \
+  V(AND_CHECK_NOT_CHAR, 30, 12)       /* bc8 pad8 uint16 uint32 addr32 */      \
+  V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */     \
+  V(CHECK_CHAR_IN_RANGE, 32, 12)      /* bc8 pad24 uc16 uc16 addr32 */         \
+  V(CHECK_CHAR_NOT_IN_RANGE, 33, 12)  /* bc8 pad24 uc16 uc16 addr32 */         \
+  V(CHECK_BIT_IN_TABLE, 34, 24)       /* bc8 pad24 addr32 bits128 */           \
+  V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32                       */          \
+  V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32                       */          \
+  V(CHECK_NOT_BACK_REF, 37, 8)         /* bc8 reg_idx24 addr32 */              \
+  V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */              \
+  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)                                 \
+  V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8)         /* bc8 reg_idx24 addr32 */     \
+  V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */     \
+  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8)                        \
+  V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */          \
+  V(CHECK_REGISTER_LT, 44, 12)    /* bc8 reg_idx24 value32 addr32 */           \
+  V(CHECK_REGISTER_GE, 45, 12)    /* bc8 reg_idx24 value32 addr32 */           \
+  V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */                   \
+  V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32                           */    \
+  V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */                       \
+  V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32                           */      \
+  V(ADVANCE_CP_AND_GOTO, 50, 8)           /* bc8 offset24 addr32 */            \
+  V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */
 
 #define DECLARE_BYTECODES(name, code, length) \
   static const int BC_##name = code;
@@ -82,4 +86,6 @@
 }  // namespace internal
 }  // namespace v8
 
+#endif  // V8_INTERPRETED_REGEXP
+
 #endif  // V8_REGEXP_BYTECODES_IRREGEXP_H_

diff --git a/src/regexp/ia32/regexp-macro-assembler-ia32.cc b/src/regexp/ia32/regexp-macro-assembler-ia32.cc
index 6ef0f5f..4c22b43 100644
--- a/src/regexp/ia32/regexp-macro-assembler-ia32.cc
+++ b/src/regexp/ia32/regexp-macro-assembler-ia32.cc

@@ -189,7 +189,7 @@
 
 
 void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ mov(edx, register_location(start_reg));  // Index of start of capture
   __ mov(ebx, register_location(start_reg + 1));  // Index of end of capture
@@ -296,11 +296,18 @@
     //   Address byte_offset1 - Address captured substring's start.
     //   Address byte_offset2 - Address of current character position.
     //   size_t byte_length - length of capture in bytes(!)
-    //   Isolate* isolate
+//   Isolate* isolate or 0 if unicode flag.
 
     // Set isolate.
-    __ mov(Operand(esp, 3 * kPointerSize),
-           Immediate(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ mov(Operand(esp, 3 * kPointerSize),
+             Immediate(ExternalReference::isolate_address(isolate())));
+    }
     // Set byte_length.
     __ mov(Operand(esp, 2 * kPointerSize), ebx);
     // Set byte_offset2.
@@ -822,13 +829,15 @@
         __ test(edi, edi);
         __ j(zero, &exit_label_, Label::kNear);
         // Advance current position after a zero-length match.
+        Label advance;
+        __ bind(&advance);
         if (mode_ == UC16) {
           __ add(edi, Immediate(2));
         } else {
           __ inc(edi);
         }
+        if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
       }
-
       __ jmp(&load_char_start_regexp);
     } else {
       __ mov(eax, Immediate(SUCCESS));

diff --git a/src/regexp/ia32/regexp-macro-assembler-ia32.h b/src/regexp/ia32/regexp-macro-assembler-ia32.h
index 1ef87ee..fa17413 100644
--- a/src/regexp/ia32/regexp-macro-assembler-ia32.h
+++ b/src/regexp/ia32/regexp-macro-assembler-ia32.h

@@ -37,7 +37,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,

diff --git a/src/regexp/interpreter-irregexp.cc b/src/regexp/interpreter-irregexp.cc
index ea748e4..14834d5 100644
--- a/src/regexp/interpreter-irregexp.cc
+++ b/src/regexp/interpreter-irregexp.cc

@@ -4,6 +4,8 @@
 
 // A simple interpreter for the Irregexp byte code.
 
+#ifdef V8_INTERPRETED_REGEXP
+
 #include "src/regexp/interpreter-irregexp.h"
 
 #include "src/ast/ast.h"
@@ -13,38 +15,32 @@
 #include "src/unicode.h"
 #include "src/utils.h"
 
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uchar.h"
+#endif  // V8_I18N_SUPPORT
+
 namespace v8 {
 namespace internal {
 
-
 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
 
-static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
-                                 int from,
-                                 int current,
-                                 int len,
-                                 Vector<const uc16> subject) {
-  for (int i = 0; i < len; i++) {
-    unibrow::uchar old_char = subject[from++];
-    unibrow::uchar new_char = subject[current++];
-    if (old_char == new_char) continue;
-    unibrow::uchar old_string[1] = { old_char };
-    unibrow::uchar new_string[1] = { new_char };
-    interp_canonicalize->get(old_char, '\0', old_string);
-    interp_canonicalize->get(new_char, '\0', new_string);
-    if (old_string[0] != new_string[0]) {
-      return false;
-    }
-  }
-  return true;
+static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
+                                 int len, Vector<const uc16> subject,
+                                 bool unicode) {
+  Address offset_a =
+      reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
+  Address offset_b =
+      reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
+  size_t length = len * kUC16Size;
+  return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
+             offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
 }
 
 
-static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
-                                 int from,
-                                 int current,
-                                 int len,
-                                 Vector<const uint8_t> subject) {
+static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
+                                 int len, Vector<const uint8_t> subject,
+                                 bool unicode) {
+  // For Latin1 characters the unicode flag makes no difference.
   for (int i = 0; i < len; i++) {
     unsigned int old_char = subject[from++];
     unsigned int new_char = subject[current++];
@@ -522,13 +518,16 @@
         pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
         break;
       }
+      BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)
       BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
+        bool unicode =
+            (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;
         int from = registers[insn >> BYTECODE_SHIFT];
         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
         if (from >= 0 && len > 0) {
           if (current + len > subject.length() ||
-              !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
-                                    from, current, len, subject)) {
+              !BackRefMatchesNoCase(isolate, from, current, len, subject,
+                                    unicode)) {
             pc = code_base + Load32Aligned(pc + 4);
             break;
           }
@@ -537,13 +536,16 @@
         pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
         break;
       }
+      BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)
       BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
+        bool unicode = (insn & BYTECODE_MASK) ==
+                       BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;
         int from = registers[insn >> BYTECODE_SHIFT];
         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
         if (from >= 0 && len > 0) {
           if (current - len < 0 ||
-              !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
-                                    from, current - len, len, subject)) {
+              !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
+                                    unicode)) {
             pc = code_base + Load32Aligned(pc + 4);
             break;
           }
@@ -619,3 +621,5 @@
 
 }  // namespace internal
 }  // namespace v8
+
+#endif  // V8_INTERPRETED_REGEXP

diff --git a/src/regexp/interpreter-irregexp.h b/src/regexp/interpreter-irregexp.h
index 244af99..887fab6 100644
--- a/src/regexp/interpreter-irregexp.h
+++ b/src/regexp/interpreter-irregexp.h

@@ -7,12 +7,13 @@
 #ifndef V8_REGEXP_INTERPRETER_IRREGEXP_H_
 #define V8_REGEXP_INTERPRETER_IRREGEXP_H_
 
+#ifdef V8_INTERPRETED_REGEXP
+
 #include "src/regexp/jsregexp.h"
 
 namespace v8 {
 namespace internal {
 
-
 class IrregexpInterpreter {
  public:
   static RegExpImpl::IrregexpResult Match(Isolate* isolate,
@@ -26,4 +27,6 @@
 }  // namespace internal
 }  // namespace v8
 
+#endif  // V8_INTERPRETED_REGEXP
+
 #endif  // V8_REGEXP_INTERPRETER_IRREGEXP_H_

diff --git a/src/regexp/jsregexp-inl.h b/src/regexp/jsregexp-inl.h
index 3eb7c3c..ca7a9fe 100644
--- a/src/regexp/jsregexp-inl.h
+++ b/src/regexp/jsregexp-inl.h

@@ -47,7 +47,10 @@
                                              register_array_size_);
     } else {
       int last_start_index = last_match[0];
-      if (last_start_index == last_end_index) last_end_index++;
+      if (last_start_index == last_end_index) {
+        // Zero-length match. Advance by one code point.
+        last_end_index = AdvanceZeroLength(last_end_index);
+      }
       if (last_end_index > subject_->length()) {
         num_matches_ = 0;  // Signal failed match.
         return NULL;

diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc
index 34d20fe..80f48ca 100644
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc

@@ -25,6 +25,11 @@
 #include "src/string-search.h"
 #include "src/unicode-decoder.h"
 
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#include "unicode/utypes.h"
+#endif  // V8_I18N_SUPPORT
+
 #ifndef V8_INTERPRETED_REGEXP
 #if V8_TARGET_ARCH_IA32
 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
@@ -72,7 +77,7 @@
                             int ranges_length,
                             Interval new_range) {
   DCHECK((ranges_length & 1) == 1);
-  DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
+  DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
   if (containment == kLatticeUnknown) return containment;
   bool inside = false;
   int last = 0;
@@ -145,9 +150,8 @@
   PostponeInterruptsScope postpone(isolate);
   RegExpCompileData parse_result;
   FlatStringReader reader(isolate, pattern);
-  if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,
-                                 flags & JSRegExp::kMultiline,
-                                 flags & JSRegExp::kUnicode, &parse_result)) {
+  if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
+                                 &parse_result)) {
     // Throw an exception if we fail to parse the pattern.
     return ThrowRegExpException(re, pattern, parse_result.error);
   }
@@ -371,18 +375,16 @@
   pattern = String::Flatten(pattern);
   RegExpCompileData compile_data;
   FlatStringReader reader(isolate, pattern);
-  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader,
-                                 flags & JSRegExp::kMultiline,
-                                 flags & JSRegExp::kUnicode, &compile_data)) {
+  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
+                                 &compile_data)) {
     // Throw an exception if we fail to parse the pattern.
     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
     USE(ThrowRegExpException(re, pattern, compile_data.error));
     return false;
   }
-  RegExpEngine::CompilationResult result = RegExpEngine::Compile(
-      isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase,
-      flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline,
-      flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte);
+  RegExpEngine::CompilationResult result =
+      RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
+                            sample_subject, is_one_byte);
   if (result.error_message != NULL) {
     // Unable to compile regexp.
     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
@@ -636,7 +638,6 @@
 
 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
                                      Handle<String> subject,
-                                     bool is_global,
                                      Isolate* isolate)
   : register_array_(NULL),
     register_array_size_(0),
@@ -661,7 +662,8 @@
     }
   }
 
-  if (is_global && !interpreted) {
+  DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
+  if (!interpreted) {
     register_array_size_ =
         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
     max_matches_ = register_array_size_ / registers_per_match_;
@@ -690,6 +692,16 @@
   last_match[1] = 0;
 }
 
+int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
+  if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
+      last_index + 1 < subject_->length() &&
+      unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
+      unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
+    // Advance over the surrogate pair.
+    return last_index + 2;
+  }
+  return last_index + 1;
+}
 
 // -------------------------------------------------------------------
 // Implementation of the Irregexp regular expression engine.
@@ -945,7 +957,7 @@
 class RegExpCompiler {
  public:
   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
-                 bool ignore_case, bool is_one_byte);
+                 JSRegExp::Flags flags, bool is_one_byte);
 
   int AllocateRegister() {
     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
@@ -955,6 +967,22 @@
     return next_register_++;
   }
 
+  // Lookarounds to match lone surrogates for unicode character class matches
+  // are never nested. We can therefore reuse registers.
+  int UnicodeLookaroundStackRegister() {
+    if (unicode_lookaround_stack_register_ == kNoRegister) {
+      unicode_lookaround_stack_register_ = AllocateRegister();
+    }
+    return unicode_lookaround_stack_register_;
+  }
+
+  int UnicodeLookaroundPositionRegister() {
+    if (unicode_lookaround_position_register_ == kNoRegister) {
+      unicode_lookaround_position_register_ = AllocateRegister();
+    }
+    return unicode_lookaround_position_register_;
+  }
+
   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
                                            RegExpNode* start,
                                            int capture_count,
@@ -981,7 +1009,8 @@
 
   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
 
-  inline bool ignore_case() { return ignore_case_; }
+  inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+  inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
   inline bool one_byte() { return one_byte_; }
   inline bool optimize() { return optimize_; }
   inline void set_optimize(bool value) { optimize_ = value; }
@@ -1006,10 +1035,12 @@
  private:
   EndNode* accept_;
   int next_register_;
+  int unicode_lookaround_stack_register_;
+  int unicode_lookaround_position_register_;
   List<RegExpNode*>* work_list_;
   int recursion_depth_;
   RegExpMacroAssembler* macro_assembler_;
-  bool ignore_case_;
+  JSRegExp::Flags flags_;
   bool one_byte_;
   bool reg_exp_too_big_;
   bool limiting_recursion_;
@@ -1041,11 +1072,13 @@
 // Attempts to compile the regexp using an Irregexp code generator.  Returns
 // a fixed array or a null handle depending on whether it succeeded.
 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
-                               bool ignore_case, bool one_byte)
+                               JSRegExp::Flags flags, bool one_byte)
     : next_register_(2 * (capture_count + 1)),
+      unicode_lookaround_stack_register_(kNoRegister),
+      unicode_lookaround_position_register_(kNoRegister),
       work_list_(NULL),
       recursion_depth_(0),
-      ignore_case_(ignore_case),
+      flags_(flags),
       one_byte_(one_byte),
       reg_exp_too_big_(false),
       limiting_recursion_(false),
@@ -1941,15 +1974,13 @@
 // know that the character is in the range of min_char to max_char inclusive.
 // Either label can be NULL indicating backtracking.  Either label can also be
 // equal to the fall_through label.
-static void GenerateBranches(RegExpMacroAssembler* masm,
-                             ZoneList<int>* ranges,
-                             int start_index,
-                             int end_index,
-                             uc16 min_char,
-                             uc16 max_char,
-                             Label* fall_through,
-                             Label* even_label,
-                             Label* odd_label) {
+static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
+                             int start_index, int end_index, uc32 min_char,
+                             uc32 max_char, Label* fall_through,
+                             Label* even_label, Label* odd_label) {
+  DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
+  DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
+
   int first = ranges->at(start_index);
   int last = ranges->at(end_index) - 1;
 
@@ -2098,9 +2129,7 @@
                           Label* on_failure, int cp_offset, bool check_offset,
                           bool preloaded, Zone* zone) {
   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
-  if (!CharacterRange::IsCanonical(ranges)) {
-    CharacterRange::Canonicalize(ranges);
-  }
+  CharacterRange::Canonicalize(ranges);
 
   int max_char;
   if (one_byte) {
@@ -2142,23 +2171,14 @@
     }
     return;
   }
-  if (last_valid_range == 0 &&
-      !cc->is_negated() &&
-      ranges->at(0).IsEverything(max_char)) {
-    // This is a common case hit by non-anchored expressions.
-    if (check_offset) {
-      macro_assembler->CheckPosition(cp_offset, on_failure);
-    }
-    return;
-  }
 
   if (!preloaded) {
     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   }
 
   if (cc->is_standard(zone) &&
-        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
-                                                    on_failure)) {
+      macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
+                                                  on_failure)) {
       return;
   }
 
@@ -2470,12 +2490,14 @@
   } else {
     // For 2-character preloads in one-byte mode or 1-character preloads in
     // two-byte mode we also use a 16 bit load with zero extend.
+    static const uint32_t kTwoByteMask = 0xffff;
+    static const uint32_t kFourByteMask = 0xffffffff;
     if (details->characters() == 2 && compiler->one_byte()) {
-      if ((mask & 0xffff) == 0xffff) need_mask = false;
+      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
     } else if (details->characters() == 1 && !compiler->one_byte()) {
-      if ((mask & 0xffff) == 0xffff) need_mask = false;
+      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
     } else {
-      if (mask == 0xffffffff) need_mask = false;
+      if (mask == kFourByteMask) need_mask = false;
     }
   }
 
@@ -2798,9 +2820,7 @@
       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
       RegExpCharacterClass* cc = elm.char_class();
       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
-      if (!CharacterRange::IsCanonical(ranges)) {
-        CharacterRange::Canonicalize(ranges);
-      }
+      CharacterRange::Canonicalize(ranges);
       // Now they are in order so we only need to look at the first.
       int range_count = ranges->length();
       if (cc->is_negated()) {
@@ -3289,6 +3309,36 @@
 }
 
 
+TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
+                                             ZoneList<CharacterRange>* ranges,
+                                             bool read_backward,
+                                             RegExpNode* on_success) {
+  DCHECK_NOT_NULL(ranges);
+  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
+  elms->Add(
+      TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
+      zone);
+  return new (zone) TextNode(elms, read_backward, on_success);
+}
+
+
+TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
+                                           CharacterRange trail,
+                                           bool read_backward,
+                                           RegExpNode* on_success) {
+  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
+  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
+  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
+  elms->Add(TextElement::CharClass(
+                new (zone) RegExpCharacterClass(lead_ranges, false)),
+            zone);
+  elms->Add(TextElement::CharClass(
+                new (zone) RegExpCharacterClass(trail_ranges, false)),
+            zone);
+  return new (zone) TextNode(elms, read_backward, on_success);
+}
+
+
 // This generates the code to match a text node.  A text node can contain
 // straight character sequences (possibly to be matched in a case-independent
 // way) and character classes.  For efficiency we do not do this in a single
@@ -3385,10 +3435,7 @@
       // independent case and it slows us down if we don't know that.
       if (cc->is_standard(zone())) continue;
       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
-      int range_count = ranges->length();
-      for (int j = 0; j < range_count; j++) {
-        ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
-      }
+      CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
     }
   }
 }
@@ -3405,9 +3452,7 @@
   if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
   RegExpCharacterClass* node = elm.char_class();
   ZoneList<CharacterRange>* ranges = node->ranges(zone());
-  if (!CharacterRange::IsCanonical(ranges)) {
-    CharacterRange::Canonicalize(ranges);
-  }
+  CharacterRange::Canonicalize(ranges);
   if (node->is_negated()) {
     return ranges->length() == 0 ? on_success() : NULL;
   }
@@ -3554,27 +3599,29 @@
 };
 
 
+static const uc32 kRangeEndMarker = 0x110000;
+
 // The '2' variant is has inclusive from and exclusive to.
 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
-    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
-    0xFEFF, 0xFF00, 0x10000 };
+static const int kSpaceRanges[] = {
+    '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,         0x1681,
+    0x180E, 0x180F,   0x2000, 0x200B,  0x2028, 0x202A, 0x202F,         0x2030,
+    0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
 
 static const int kWordRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
+    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
 static const int kWordRangeCount = arraysize(kWordRanges);
-static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
+static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
 static const int kDigitRangeCount = arraysize(kDigitRanges);
-static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
+static const int kSurrogateRanges[] = {
+    kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
-static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
-    0x2028, 0x202A, 0x10000 };
+static const int kLineTerminatorRanges[] = {
+    0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
 
-
 void BoyerMoorePositionInfo::Set(int character) {
   SetInterval(Interval(character, character));
 }
@@ -3916,6 +3963,11 @@
 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   int choice_count = alternatives_->length();
 
+  if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
+    alternatives_->at(0).node()->Emit(compiler, trace);
+    return;
+  }
+
   AssertGuardsMentionRegisters(trace);
 
   LimitResult limit_result = LimitVersions(compiler, trace);
@@ -4349,14 +4401,19 @@
 
   DCHECK_EQ(start_reg_ + 1, end_reg_);
   if (compiler->ignore_case()) {
-    assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
-                                               trace->backtrack());
+    assembler->CheckNotBackReferenceIgnoreCase(
+        start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
   } else {
     assembler->CheckNotBackReference(start_reg_, read_backward(),
                                      trace->backtrack());
   }
   // We are going to advance backward, so we may end up at the start.
   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
+
+  // Check that the back reference does not end inside a surrogate pair.
+  if (compiler->unicode() && !compiler->one_byte()) {
+    assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
+  }
   on_success()->Emit(compiler, trace);
 }
 
@@ -4732,8 +4789,8 @@
 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
                                  const int* special_class,
                                  int length) {
-  length--;  // Remove final 0x10000.
-  DCHECK(special_class[length] == 0x10000);
+  length--;  // Remove final marker.
+  DCHECK(special_class[length] == kRangeEndMarker);
   DCHECK(ranges->length() != 0);
   DCHECK(length != 0);
   DCHECK(special_class[0] != 0);
@@ -4753,7 +4810,7 @@
       return false;
     }
   }
-  if (range.to() != 0xffff) {
+  if (range.to() != String::kMaxCodePoint) {
     return false;
   }
   return true;
@@ -4763,8 +4820,8 @@
 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
                           const int* special_class,
                           int length) {
-  length--;  // Remove final 0x10000.
-  DCHECK(special_class[length] == 0x10000);
+  length--;  // Remove final marker.
+  DCHECK(special_class[length] == kRangeEndMarker);
   if (ranges->length() * 2 != length) {
     return false;
   }
@@ -4820,10 +4877,303 @@
 }
 
 
+UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
+                                           ZoneList<CharacterRange>* base)
+    : zone_(zone),
+      table_(zone),
+      bmp_(nullptr),
+      lead_surrogates_(nullptr),
+      trail_surrogates_(nullptr),
+      non_bmp_(nullptr) {
+  // The unicode range splitter categorizes given character ranges into:
+  // - Code points from the BMP representable by one code unit.
+  // - Code points outside the BMP that need to be split into surrogate pairs.
+  // - Lone lead surrogates.
+  // - Lone trail surrogates.
+  // Lone surrogates are valid code points, even though no actual characters.
+  // They require special matching to make sure we do not split surrogate pairs.
+  // We use the dispatch table to accomplish this. The base range is split up
+  // by the table by the overlay ranges, and the Call callback is used to
+  // filter and collect ranges for each category.
+  for (int i = 0; i < base->length(); i++) {
+    table_.AddRange(base->at(i), kBase, zone_);
+  }
+  // Add overlay ranges.
+  table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
+                  kBmpCodePoints, zone_);
+  table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
+                  kLeadSurrogates, zone_);
+  table_.AddRange(
+      CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+      kTrailSurrogates, zone_);
+  table_.AddRange(
+      CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
+      kBmpCodePoints, zone_);
+  table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
+                  kNonBmpCodePoints, zone_);
+  table_.ForEach(this);
+}
+
+
+void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
+  OutSet* outset = entry.out_set();
+  if (!outset->Get(kBase)) return;
+  ZoneList<CharacterRange>** target = NULL;
+  if (outset->Get(kBmpCodePoints)) {
+    target = &bmp_;
+  } else if (outset->Get(kLeadSurrogates)) {
+    target = &lead_surrogates_;
+  } else if (outset->Get(kTrailSurrogates)) {
+    target = &trail_surrogates_;
+  } else {
+    DCHECK(outset->Get(kNonBmpCodePoints));
+    target = &non_bmp_;
+  }
+  if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
+  (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
+}
+
+
+void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
+                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
+  ZoneList<CharacterRange>* bmp = splitter->bmp();
+  if (bmp == nullptr) return;
+  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
+      compiler->zone(), bmp, compiler->read_backward(), on_success)));
+}
+
+
+void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
+                             RegExpNode* on_success,
+                             UnicodeRangeSplitter* splitter) {
+  ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
+  if (non_bmp == nullptr) return;
+  DCHECK(compiler->unicode());
+  DCHECK(!compiler->one_byte());
+  Zone* zone = compiler->zone();
+  CharacterRange::Canonicalize(non_bmp);
+  for (int i = 0; i < non_bmp->length(); i++) {
+    // Match surrogate pair.
+    // E.g. [\u10005-\u11005] becomes
+    //      \ud800[\udc05-\udfff]|
+    //      [\ud801-\ud803][\udc00-\udfff]|
+    //      \ud804[\udc00-\udc05]
+    uc32 from = non_bmp->at(i).from();
+    uc32 to = non_bmp->at(i).to();
+    uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
+    uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
+    uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
+    uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
+    if (from_l == to_l) {
+      // The lead surrogate is the same.
+      result->AddAlternative(
+          GuardedAlternative(TextNode::CreateForSurrogatePair(
+              zone, CharacterRange::Singleton(from_l),
+              CharacterRange::Range(from_t, to_t), compiler->read_backward(),
+              on_success)));
+    } else {
+      if (from_t != kTrailSurrogateStart) {
+        // Add [from_l][from_t-\udfff]
+        result->AddAlternative(
+            GuardedAlternative(TextNode::CreateForSurrogatePair(
+                zone, CharacterRange::Singleton(from_l),
+                CharacterRange::Range(from_t, kTrailSurrogateEnd),
+                compiler->read_backward(), on_success)));
+        from_l++;
+      }
+      if (to_t != kTrailSurrogateEnd) {
+        // Add [to_l][\udc00-to_t]
+        result->AddAlternative(
+            GuardedAlternative(TextNode::CreateForSurrogatePair(
+                zone, CharacterRange::Singleton(to_l),
+                CharacterRange::Range(kTrailSurrogateStart, to_t),
+                compiler->read_backward(), on_success)));
+        to_l--;
+      }
+      if (from_l <= to_l) {
+        // Add [from_l-to_l][\udc00-\udfff]
+        result->AddAlternative(
+            GuardedAlternative(TextNode::CreateForSurrogatePair(
+                zone, CharacterRange::Range(from_l, to_l),
+                CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+                compiler->read_backward(), on_success)));
+      }
+    }
+  }
+}
+
+
+RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
+    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
+    ZoneList<CharacterRange>* match, RegExpNode* on_success,
+    bool read_backward) {
+  Zone* zone = compiler->zone();
+  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
+      zone, match, read_backward, on_success);
+  int stack_register = compiler->UnicodeLookaroundStackRegister();
+  int position_register = compiler->UnicodeLookaroundPositionRegister();
+  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
+                                       position_register);
+  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+      zone, lookbehind, !read_backward, lookaround.on_match_success());
+  return lookaround.ForMatch(negative_match);
+}
+
+
+RegExpNode* MatchAndNegativeLookaroundInReadDirection(
+    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
+    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
+    bool read_backward) {
+  Zone* zone = compiler->zone();
+  int stack_register = compiler->UnicodeLookaroundStackRegister();
+  int position_register = compiler->UnicodeLookaroundPositionRegister();
+  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
+                                       position_register);
+  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+      zone, lookahead, read_backward, lookaround.on_match_success());
+  return TextNode::CreateForCharacterRanges(
+      zone, match, read_backward, lookaround.ForMatch(negative_match));
+}
+
+
+void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+                           RegExpNode* on_success,
+                           UnicodeRangeSplitter* splitter) {
+  ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
+  if (lead_surrogates == nullptr) return;
+  Zone* zone = compiler->zone();
+  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
+  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
+      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+  RegExpNode* match;
+  if (compiler->read_backward()) {
+    // Reading backward. Assert that reading forward, there is no trail
+    // surrogate, and then backward match the lead surrogate.
+    match = NegativeLookaroundAgainstReadDirectionAndMatch(
+        compiler, trail_surrogates, lead_surrogates, on_success, true);
+  } else {
+    // Reading forward. Forward match the lead surrogate and assert that
+    // no trail surrogate follows.
+    match = MatchAndNegativeLookaroundInReadDirection(
+        compiler, lead_surrogates, trail_surrogates, on_success, false);
+  }
+  result->AddAlternative(GuardedAlternative(match));
+}
+
+
+void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+                            RegExpNode* on_success,
+                            UnicodeRangeSplitter* splitter) {
+  ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
+  if (trail_surrogates == nullptr) return;
+  Zone* zone = compiler->zone();
+  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
+  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
+      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+
+  RegExpNode* match;
+  if (compiler->read_backward()) {
+    // Reading backward. Backward match the trail surrogate and assert that no
+    // lead surrogate precedes it.
+    match = MatchAndNegativeLookaroundInReadDirection(
+        compiler, trail_surrogates, lead_surrogates, on_success, true);
+  } else {
+    // Reading forward. Assert that reading backward, there is no lead
+    // surrogate, and then forward match the trail surrogate.
+    match = NegativeLookaroundAgainstReadDirectionAndMatch(
+        compiler, lead_surrogates, trail_surrogates, on_success, false);
+  }
+  result->AddAlternative(GuardedAlternative(match));
+}
+
+RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
+                              RegExpNode* on_success) {
+  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
+  DCHECK(!compiler->read_backward());
+  Zone* zone = compiler->zone();
+  // Advance any character. If the character happens to be a lead surrogate and
+  // we advanced into the middle of a surrogate pair, it will work out, as
+  // nothing will match from there. We will have to advance again, consuming
+  // the associated trail surrogate.
+  ZoneList<CharacterRange>* range = CharacterRange::List(
+      zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
+  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
+}
+
+
+void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
+                               ZoneList<CharacterRange>* ranges) {
+#ifdef V8_I18N_SUPPORT
+  // Use ICU to compute the case fold closure over the ranges.
+  DCHECK(compiler->unicode());
+  DCHECK(compiler->ignore_case());
+  USet* set = uset_openEmpty();
+  for (int i = 0; i < ranges->length(); i++) {
+    uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
+  }
+  ranges->Clear();
+  uset_closeOver(set, USET_CASE_INSENSITIVE);
+  // Full case mapping map single characters to multiple characters.
+  // Those are represented as strings in the set. Remove them so that
+  // we end up with only simple and common case mappings.
+  uset_removeAllStrings(set);
+  int item_count = uset_getItemCount(set);
+  int item_result = 0;
+  UErrorCode ec = U_ZERO_ERROR;
+  Zone* zone = compiler->zone();
+  for (int i = 0; i < item_count; i++) {
+    uc32 start = 0;
+    uc32 end = 0;
+    item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+    ranges->Add(CharacterRange::Range(start, end), zone);
+  }
+  // No errors and everything we collected have been ranges.
+  DCHECK_EQ(U_ZERO_ERROR, ec);
+  DCHECK_EQ(0, item_result);
+  uset_close(set);
+#else
+  // Fallback if ICU is not included.
+  CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
+                                     ranges, compiler->one_byte());
+#endif  // V8_I18N_SUPPORT
+  CharacterRange::Canonicalize(ranges);
+}
+
+
 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success) {
-  return new (compiler->zone())
-      TextNode(this, compiler->read_backward(), on_success);
+  set_.Canonicalize();
+  Zone* zone = compiler->zone();
+  ZoneList<CharacterRange>* ranges = this->ranges(zone);
+  if (compiler->unicode() && compiler->ignore_case()) {
+    AddUnicodeCaseEquivalents(compiler, ranges);
+  }
+  if (compiler->unicode() && !compiler->one_byte()) {
+    if (is_negated()) {
+      ZoneList<CharacterRange>* negated =
+          new (zone) ZoneList<CharacterRange>(2, zone);
+      CharacterRange::Negate(ranges, negated, zone);
+      ranges = negated;
+    }
+    if (ranges->length() == 0) {
+      // No matches possible.
+      return new (zone) EndNode(EndNode::BACKTRACK, zone);
+    }
+    if (standard_type() == '*') {
+      return UnanchoredAdvance(compiler, on_success);
+    } else {
+      ChoiceNode* result = new (zone) ChoiceNode(2, zone);
+      UnicodeRangeSplitter splitter(zone, ranges);
+      AddBmpCharacters(compiler, result, on_success, &splitter);
+      AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
+      AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
+      AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
+      return result;
+    }
+  } else {
+    return new (zone) TextNode(this, compiler->read_backward(), on_success);
+  }
 }
 
 
@@ -5338,6 +5688,47 @@
 }
 
 
+RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
+                                   int stack_pointer_register,
+                                   int position_register,
+                                   int capture_register_count,
+                                   int capture_register_start)
+    : is_positive_(is_positive),
+      on_success_(on_success),
+      stack_pointer_register_(stack_pointer_register),
+      position_register_(position_register) {
+  if (is_positive_) {
+    on_match_success_ = ActionNode::PositiveSubmatchSuccess(
+        stack_pointer_register, position_register, capture_register_count,
+        capture_register_start, on_success_);
+  } else {
+    Zone* zone = on_success_->zone();
+    on_match_success_ = new (zone) NegativeSubmatchSuccess(
+        stack_pointer_register, position_register, capture_register_count,
+        capture_register_start, zone);
+  }
+}
+
+
+RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
+  if (is_positive_) {
+    return ActionNode::BeginSubmatch(stack_pointer_register_,
+                                     position_register_, match);
+  } else {
+    Zone* zone = on_success_->zone();
+    // We use a ChoiceNode to represent the negative lookaround. The first
+    // alternative is the negative match. On success, the end node backtracks.
+    // On failure, the second alternative is tried and leads to success.
+    // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
+    // first exit when calculating quick checks.
+    ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
+        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
+    return ActionNode::BeginSubmatch(stack_pointer_register_,
+                                     position_register_, choice_node);
+  }
+}
+
+
 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
   int stack_pointer_register = compiler->AllocateRegister();
@@ -5352,35 +5743,10 @@
   RegExpNode* result;
   bool was_reading_backward = compiler->read_backward();
   compiler->set_read_backward(type() == LOOKBEHIND);
-  if (is_positive()) {
-    result = ActionNode::BeginSubmatch(
-        stack_pointer_register, position_register,
-        body()->ToNode(compiler,
-                       ActionNode::PositiveSubmatchSuccess(
-                           stack_pointer_register, position_register,
-                           register_count, register_start, on_success)));
-  } else {
-    // We use a ChoiceNode for a negative lookahead because it has most of
-    // the characteristics we need.  It has the body of the lookahead as its
-    // first alternative and the expression after the lookahead of the second
-    // alternative.  If the first alternative succeeds then the
-    // NegativeSubmatchSuccess will unwind the stack including everything the
-    // choice node set up and backtrack.  If the first alternative fails then
-    // the second alternative is tried, which is exactly the desired result
-    // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
-    // ChoiceNode that knows to ignore the first exit when calculating quick
-    // checks.
-    Zone* zone = compiler->zone();
-
-    GuardedAlternative body_alt(
-        body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(
-                                     stack_pointer_register, position_register,
-                                     register_count, register_start, zone)));
-    ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
-        body_alt, GuardedAlternative(on_success), zone);
-    result = ActionNode::BeginSubmatch(stack_pointer_register,
-                                       position_register, choice_node);
-  }
+  Builder builder(is_positive(), on_success, stack_pointer_register,
+                  position_register, register_count, register_start);
+  RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
+  result = builder.ForMatch(match);
   compiler->set_read_backward(was_reading_backward);
   return result;
 }
@@ -5428,10 +5794,10 @@
                      ZoneList<CharacterRange>* ranges,
                      Zone* zone) {
   elmc--;
-  DCHECK(elmv[elmc] == 0x10000);
+  DCHECK(elmv[elmc] == kRangeEndMarker);
   for (int i = 0; i < elmc; i += 2) {
     DCHECK(elmv[i] < elmv[i + 1]);
-    ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
+    ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
   }
 }
 
@@ -5441,17 +5807,17 @@
                             ZoneList<CharacterRange>* ranges,
                             Zone* zone) {
   elmc--;
-  DCHECK(elmv[elmc] == 0x10000);
+  DCHECK(elmv[elmc] == kRangeEndMarker);
   DCHECK(elmv[0] != 0x0000);
-  DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
+  DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
   uc16 last = 0x0000;
   for (int i = 0; i < elmc; i += 2) {
     DCHECK(last <= elmv[i] - 1);
     DCHECK(elmv[i] < elmv[i + 1]);
-    ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
+    ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
     last = elmv[i + 1];
   }
-  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
+  ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
 }
 
 
@@ -5508,115 +5874,73 @@
 }
 
 
-class CharacterRangeSplitter {
- public:
-  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
-                         ZoneList<CharacterRange>** excluded,
-                         Zone* zone)
-      : included_(included),
-        excluded_(excluded),
-        zone_(zone) { }
-  void Call(uc16 from, DispatchTable::Entry entry);
-
-  static const int kInBase = 0;
-  static const int kInOverlay = 1;
-
- private:
-  ZoneList<CharacterRange>** included_;
-  ZoneList<CharacterRange>** excluded_;
-  Zone* zone_;
-};
-
-
-void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
-  if (!entry.out_set()->Get(kInBase)) return;
-  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
-    ? included_
-    : excluded_;
-  if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
-  (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
-}
-
-
-void CharacterRange::Split(ZoneList<CharacterRange>* base,
-                           Vector<const int> overlay,
-                           ZoneList<CharacterRange>** included,
-                           ZoneList<CharacterRange>** excluded,
-                           Zone* zone) {
-  DCHECK_NULL(*included);
-  DCHECK_NULL(*excluded);
-  DispatchTable table(zone);
-  for (int i = 0; i < base->length(); i++)
-    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
-  for (int i = 0; i < overlay.length(); i += 2) {
-    table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
-                   CharacterRangeSplitter::kInOverlay, zone);
-  }
-  CharacterRangeSplitter callback(included, excluded, zone);
-  table.ForEach(&callback);
-}
-
-
 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                         ZoneList<CharacterRange>* ranges,
                                         bool is_one_byte) {
-  uc16 bottom = from();
-  uc16 top = to();
-  if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
-    if (bottom > String::kMaxOneByteCharCode) return;
-    if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
-  }
-  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  if (top == bottom) {
-    // If this is a singleton we just expand the one character.
-    int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
-    for (int i = 0; i < length; i++) {
-      uc32 chr = chars[i];
-      if (chr != bottom) {
-        ranges->Add(CharacterRange::Singleton(chars[i]), zone);
-      }
+  int range_count = ranges->length();
+  for (int i = 0; i < range_count; i++) {
+    CharacterRange range = ranges->at(i);
+    uc32 bottom = range.from();
+    if (bottom > String::kMaxUtf16CodeUnit) return;
+    uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
+    // Nothing to be done for surrogates.
+    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
+    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+      if (bottom > String::kMaxOneByteCharCode) return;
+      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
     }
-  } else {
-    // If this is a range we expand the characters block by block,
-    // expanding contiguous subranges (blocks) one at a time.
-    // The approach is as follows.  For a given start character we
-    // look up the remainder of the block that contains it (represented
-    // by the end point), for instance we find 'z' if the character
-    // is 'c'.  A block is characterized by the property
-    // that all characters uncanonicalize in the same way, except that
-    // each entry in the result is incremented by the distance from the first
-    // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
-    // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
-    // Once we've found the end point we look up its uncanonicalization
-    // and produce a range for each element.  For instance for [c-f]
-    // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
-    // add a range if it is not already contained in the input, so [c-f]
-    // will be skipped but [C-F] will be added.  If this range is not
-    // completely contained in a block we do this for all the blocks
-    // covered by the range (handling characters that is not in a block
-    // as a "singleton block").
-    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-    int pos = bottom;
-    while (pos <= top) {
-      int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
-      uc16 block_end;
-      if (length == 0) {
-        block_end = pos;
-      } else {
-        DCHECK_EQ(1, length);
-        block_end = range[0];
-      }
-      int end = (block_end > top) ? top : block_end;
-      length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
+    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+    if (top == bottom) {
+      // If this is a singleton we just expand the one character.
+      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
       for (int i = 0; i < length; i++) {
-        uc32 c = range[i];
-        uc16 range_from = c - (block_end - pos);
-        uc16 range_to = c - (block_end - end);
-        if (!(bottom <= range_from && range_to <= top)) {
-          ranges->Add(CharacterRange(range_from, range_to), zone);
+        uc32 chr = chars[i];
+        if (chr != bottom) {
+          ranges->Add(CharacterRange::Singleton(chars[i]), zone);
         }
       }
-      pos = end + 1;
+    } else {
+      // If this is a range we expand the characters block by block, expanding
+      // contiguous subranges (blocks) one at a time.  The approach is as
+      // follows.  For a given start character we look up the remainder of the
+      // block that contains it (represented by the end point), for instance we
+      // find 'z' if the character is 'c'.  A block is characterized by the
+      // property that all characters uncanonicalize in the same way, except
+      // that each entry in the result is incremented by the distance from the
+      // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
+      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
+      // we've found the end point we look up its uncanonicalization and
+      // produce a range for each element.  For instance for [c-f] we look up
+      // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
+      // it is not already contained in the input, so [c-f] will be skipped but
+      // [C-F] will be added.  If this range is not completely contained in a
+      // block we do this for all the blocks covered by the range (handling
+      // characters that is not in a block as a "singleton block").
+      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+      int pos = bottom;
+      while (pos <= top) {
+        int length =
+            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
+        uc32 block_end;
+        if (length == 0) {
+          block_end = pos;
+        } else {
+          DCHECK_EQ(1, length);
+          block_end = equivalents[0];
+        }
+        int end = (block_end > top) ? top : block_end;
+        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
+                                                         equivalents);
+        for (int i = 0; i < length; i++) {
+          uc32 c = equivalents[i];
+          uc32 range_from = c - (block_end - pos);
+          uc32 range_to = c - (block_end - end);
+          if (!(bottom <= range_from && range_to <= top)) {
+            ranges->Add(CharacterRange::Range(range_from, range_to), zone);
+          }
+        }
+        pos = end + 1;
+      }
     }
   }
 }
@@ -5672,8 +5996,8 @@
   // list[0..count] for the result. Returns the number of resulting
   // canonicalized ranges. Inserting a range may collapse existing ranges into
   // fewer ranges, so the return value can be anything in the range 1..count+1.
-  uc16 from = insert.from();
-  uc16 to = insert.to();
+  uc32 from = insert.from();
+  uc32 to = insert.to();
   int start_pos = 0;
   int end_pos = count;
   for (int i = count - 1; i >= 0; i--) {
@@ -5706,7 +6030,7 @@
     CharacterRange to_replace = list->at(start_pos);
     int new_from = Min(to_replace.from(), from);
     int new_to = Max(to_replace.to(), to);
-    list->at(start_pos) = CharacterRange(new_from, new_to);
+    list->at(start_pos) = CharacterRange::Range(new_from, new_to);
     return count;
   }
   // Replace a number of existing ranges from start_pos to end_pos - 1.
@@ -5717,7 +6041,7 @@
   if (end_pos < count) {
     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
   }
-  list->at(start_pos) = CharacterRange(new_from, new_to);
+  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
   return count - (end_pos - start_pos) + 1;
 }
 
@@ -5773,20 +6097,20 @@
   DCHECK(CharacterRange::IsCanonical(ranges));
   DCHECK_EQ(0, negated_ranges->length());
   int range_count = ranges->length();
-  uc16 from = 0;
+  uc32 from = 0;
   int i = 0;
   if (range_count > 0 && ranges->at(0).from() == 0) {
-    from = ranges->at(0).to();
+    from = ranges->at(0).to() + 1;
     i = 1;
   }
   while (i < range_count) {
     CharacterRange range = ranges->at(i);
-    negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
-    from = range.to();
+    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
+    from = range.to() + 1;
     i++;
   }
-  if (from < String::kMaxUtf16CodeUnit) {
-    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
+  if (from < String::kMaxCodePoint) {
+    negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
                         zone);
   }
 }
@@ -5838,7 +6162,7 @@
 }
 
 
-const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
+const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
 
 
 void DispatchTable::AddRange(CharacterRange full_range, int value,
@@ -5866,8 +6190,9 @@
     if (entry->from() < current.from() && entry->to() >= current.from()) {
       // Snap the overlapping range in half around the start point of
       // the range we're adding.
-      CharacterRange left(entry->from(), current.from() - 1);
-      CharacterRange right(current.from(), entry->to());
+      CharacterRange left =
+          CharacterRange::Range(entry->from(), current.from() - 1);
+      CharacterRange right = CharacterRange::Range(current.from(), entry->to());
       // The left part of the overlapping range doesn't overlap.
       // Truncate the whole entry to be just the left part.
       entry->set_to(left.to());
@@ -5919,10 +6244,6 @@
       // we're adding so we can just update it and move the start point
       // of the range we're adding just past it.
       entry->AddValue(value, zone);
-      // Bail out if the last interval ended at 0xFFFF since otherwise
-      // adding 1 will wrap around to 0.
-      if (entry->to() == String::kMaxUtf16CodeUnit)
-        break;
       DCHECK(entry->to() + 1 > current.from());
       current.set_from(entry->to() + 1);
     } else {
@@ -5940,7 +6261,7 @@
 }
 
 
-OutSet* DispatchTable::Get(uc16 value) {
+OutSet* DispatchTable::Get(uc32 value) {
   ZoneSplayTree<Config>::Locator loc;
   if (!tree()->FindGreatestLessThan(value, &loc))
     return empty();
@@ -5990,7 +6311,7 @@
 
 
 void Analysis::VisitText(TextNode* that) {
-  if (ignore_case_) {
+  if (ignore_case()) {
     that->MakeCaseIndependent(isolate(), is_one_byte_);
   }
   EnsureAnalyzed(that->on_success());
@@ -6173,8 +6494,7 @@
 
 
 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
-  CharacterRange range(from, entry.to());
-  constructor_->AddRange(range);
+  constructor_->AddRange(CharacterRange::Range(from, entry.to()));
 }
 
 
@@ -6212,16 +6532,16 @@
   for (int i = 0; i < ranges->length(); i++) {
     CharacterRange range = ranges->at(i);
     if (last < range.from())
-      AddRange(CharacterRange(last, range.from() - 1));
+      AddRange(CharacterRange::Range(last, range.from() - 1));
     if (range.to() >= last) {
-      if (range.to() == String::kMaxUtf16CodeUnit) {
+      if (range.to() == String::kMaxCodePoint) {
         return;
       } else {
         last = range.to() + 1;
       }
     }
   }
-  AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
+  AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
 }
 
 
@@ -6230,7 +6550,7 @@
   switch (elm.text_type()) {
     case TextElement::ATOM: {
       uc16 c = elm.atom()->data()[0];
-      AddRange(CharacterRange(c, c));
+      AddRange(CharacterRange::Range(c, c));
       break;
     }
     case TextElement::CHAR_CLASS: {
@@ -6257,14 +6577,48 @@
 }
 
 
+RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
+                                              RegExpNode* on_success) {
+  // If the regexp matching starts within a surrogate pair, step back
+  // to the lead surrogate and start matching from there.
+  DCHECK(!compiler->read_backward());
+  Zone* zone = compiler->zone();
+  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
+      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
+      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+  ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
+
+  int stack_register = compiler->UnicodeLookaroundStackRegister();
+  int position_register = compiler->UnicodeLookaroundPositionRegister();
+  RegExpNode* step_back = TextNode::CreateForCharacterRanges(
+      zone, lead_surrogates, true, on_success);
+  RegExpLookaround::Builder builder(true, step_back, stack_register,
+                                    position_register);
+  RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
+      zone, trail_surrogates, false, builder.on_match_success());
+
+  optional_step_back->AddAlternative(
+      GuardedAlternative(builder.ForMatch(match_trail)));
+  optional_step_back->AddAlternative(GuardedAlternative(on_success));
+
+  return optional_step_back;
+}
+
+
 RegExpEngine::CompilationResult RegExpEngine::Compile(
-    Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,
-    bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,
+    Isolate* isolate, Zone* zone, RegExpCompileData* data,
+    JSRegExp::Flags flags, Handle<String> pattern,
     Handle<String> sample_subject, bool is_one_byte) {
   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
     return IrregexpRegExpTooBig(isolate);
   }
-  RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,
+  bool ignore_case = flags & JSRegExp::kIgnoreCase;
+  bool is_sticky = flags & JSRegExp::kSticky;
+  bool is_global = flags & JSRegExp::kGlobal;
+  bool is_unicode = flags & JSRegExp::kUnicode;
+  RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
                           is_one_byte);
 
   if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
@@ -6316,11 +6670,13 @@
     if (node != NULL) {
       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
     }
+  } else if (compiler.unicode() && (is_global || is_sticky)) {
+    node = OptionallyStepBackToLeadSurrogate(&compiler, node);
   }
 
   if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
   data->node = node;
-  Analysis analysis(isolate, ignore_case, is_one_byte);
+  Analysis analysis(isolate, flags, is_one_byte);
   analysis.EnsureAnalyzed(node);
   if (analysis.has_failed()) {
     const char* error_message = analysis.error_message();
@@ -6381,10 +6737,13 @@
   }
 
   if (is_global) {
-    macro_assembler.set_global_mode(
-        (data->tree->min_match() > 0)
-            ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
-            : RegExpMacroAssembler::GLOBAL);
+    RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
+    if (data->tree->min_match() > 0) {
+      mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
+    } else if (is_unicode) {
+      mode = RegExpMacroAssembler::GLOBAL_UNICODE;
+    }
+    macro_assembler.set_global_mode(mode);
   }
 
   return compiler.Assemble(&macro_assembler,

diff --git a/src/regexp/jsregexp.h b/src/regexp/jsregexp.h
index 0ad4b79..e55d650 100644
--- a/src/regexp/jsregexp.h
+++ b/src/regexp/jsregexp.h

@@ -8,6 +8,7 @@
 #include "src/allocation.h"
 #include "src/assembler.h"
 #include "src/regexp/regexp-ast.h"
+#include "src/regexp/regexp-macro-assembler.h"
 
 namespace v8 {
 namespace internal {
@@ -121,7 +122,6 @@
    public:
     GlobalCache(Handle<JSRegExp> regexp,
                 Handle<String> subject,
-                bool is_global,
                 Isolate* isolate);
 
     INLINE(~GlobalCache());
@@ -137,6 +137,8 @@
     INLINE(bool HasException()) { return num_matches_ < 0; }
 
    private:
+    int AdvanceZeroLength(int last_index);
+
     int num_matches_;
     int max_matches_;
     int current_match_index_;
@@ -265,28 +267,30 @@
   class Entry {
    public:
     Entry() : from_(0), to_(0), out_set_(NULL) { }
-    Entry(uc16 from, uc16 to, OutSet* out_set)
-        : from_(from), to_(to), out_set_(out_set) { }
-    uc16 from() { return from_; }
-    uc16 to() { return to_; }
-    void set_to(uc16 value) { to_ = value; }
+    Entry(uc32 from, uc32 to, OutSet* out_set)
+        : from_(from), to_(to), out_set_(out_set) {
+      DCHECK(from <= to);
+    }
+    uc32 from() { return from_; }
+    uc32 to() { return to_; }
+    void set_to(uc32 value) { to_ = value; }
     void AddValue(int value, Zone* zone) {
       out_set_ = out_set_->Extend(value, zone);
     }
     OutSet* out_set() { return out_set_; }
    private:
-    uc16 from_;
-    uc16 to_;
+    uc32 from_;
+    uc32 to_;
     OutSet* out_set_;
   };
 
   class Config {
    public:
-    typedef uc16 Key;
+    typedef uc32 Key;
     typedef Entry Value;
-    static const uc16 kNoKey;
+    static const uc32 kNoKey;
     static const Entry NoValue() { return Value(); }
-    static inline int Compare(uc16 a, uc16 b) {
+    static inline int Compare(uc32 a, uc32 b) {
       if (a == b)
         return 0;
       else if (a < b)
@@ -297,7 +301,7 @@
   };
 
   void AddRange(CharacterRange range, int value, Zone* zone);
-  OutSet* Get(uc16 value);
+  OutSet* Get(uc32 value);
   void Dump();
 
   template <typename Callback>
@@ -315,6 +319,34 @@
 };
 
 
+// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
+class UnicodeRangeSplitter {
+ public:
+  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
+  void Call(uc32 from, DispatchTable::Entry entry);
+
+  ZoneList<CharacterRange>* bmp() { return bmp_; }
+  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
+  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
+  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
+
+ private:
+  static const int kBase = 0;
+  // Separate ranges into
+  static const int kBmpCodePoints = 1;
+  static const int kLeadSurrogates = 2;
+  static const int kTrailSurrogates = 3;
+  static const int kNonBmpCodePoints = 4;
+
+  Zone* zone_;
+  DispatchTable table_;
+  ZoneList<CharacterRange>* bmp_;
+  ZoneList<CharacterRange>* lead_surrogates_;
+  ZoneList<CharacterRange>* trail_surrogates_;
+  ZoneList<CharacterRange>* non_bmp_;
+};
+
+
 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
   VISIT(End)                                                         \
   VISIT(Action)                                                      \
@@ -690,6 +722,17 @@
         read_backward_(read_backward) {
     elms_->Add(TextElement::CharClass(that), zone());
   }
+  // Create TextNode for a single character class for the given ranges.
+  static TextNode* CreateForCharacterRanges(Zone* zone,
+                                            ZoneList<CharacterRange>* ranges,
+                                            bool read_backward,
+                                            RegExpNode* on_success);
+  // Create TextNode for a surrogate pair with a range given for the
+  // lead and the trail surrogate each.
+  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
+                                          CharacterRange trail,
+                                          bool read_backward,
+                                          RegExpNode* on_success);
   virtual void Accept(NodeVisitor* visitor);
   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
@@ -813,8 +856,7 @@
 class EndNode: public RegExpNode {
  public:
   enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
-  explicit EndNode(Action action, Zone* zone)
-      : RegExpNode(zone), action_(action) { }
+  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
   virtual void Accept(NodeVisitor* visitor);
   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
   virtual int EatsAtLeast(int still_to_find,
@@ -1440,9 +1482,9 @@
 //   +-------+        +------------+
 class Analysis: public NodeVisitor {
  public:
-  Analysis(Isolate* isolate, bool ignore_case, bool is_one_byte)
+  Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
       : isolate_(isolate),
-        ignore_case_(ignore_case),
+        flags_(flags),
         is_one_byte_(is_one_byte),
         error_message_(NULL) {}
   void EnsureAnalyzed(RegExpNode* node);
@@ -1464,9 +1506,12 @@
 
   Isolate* isolate() const { return isolate_; }
 
+  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
+
  private:
   Isolate* isolate_;
-  bool ignore_case_;
+  JSRegExp::Flags flags_;
   bool is_one_byte_;
   const char* error_message_;
 
@@ -1505,8 +1550,8 @@
   };
 
   static CompilationResult Compile(Isolate* isolate, Zone* zone,
-                                   RegExpCompileData* input, bool ignore_case,
-                                   bool global, bool multiline, bool sticky,
+                                   RegExpCompileData* input,
+                                   JSRegExp::Flags flags,
                                    Handle<String> pattern,
                                    Handle<String> sample_subject,
                                    bool is_one_byte);

diff --git a/src/regexp/mips/regexp-macro-assembler-mips.cc b/src/regexp/mips/regexp-macro-assembler-mips.cc
index 9c59328..6197f45 100644
--- a/src/regexp/mips/regexp-macro-assembler-mips.cc
+++ b/src/regexp/mips/regexp-macro-assembler-mips.cc

@@ -215,7 +215,7 @@
 
 
 void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ lw(a0, register_location(start_reg));  // Index of start of capture.
   __ lw(a1, register_location(start_reg + 1));  // Index of end of capture.
@@ -310,7 +310,7 @@
     //   a0: Address byte_offset1 - Address captured substring's start.
     //   a1: Address byte_offset2 - Address of current character position.
     //   a2: size_t byte_length - length of capture in bytes(!).
-    //   a3: Isolate* isolate.
+    //   a3: Isolate* isolate or 0 if unicode flag.
 
     // Address of start of capture.
     __ Addu(a0, a0, Operand(end_of_input_address()));
@@ -324,7 +324,14 @@
       __ Subu(a1, a1, Operand(s3));
     }
     // Isolate.
-    __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ mov(a3, zero_reg);
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+    }
 
     {
       AllowExternalCallThatCantCauseGC scope(masm_);
@@ -801,9 +808,12 @@
           __ Branch(&exit_label_, eq, current_input_offset(),
                     Operand(zero_reg));
           // Advance current position after a zero-length match.
+          Label advance;
+          __ bind(&advance);
           __ Addu(current_input_offset(),
                   current_input_offset(),
                   Operand((mode_ == UC16) ? 2 : 1));
+          if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
         }
 
         __ Branch(&load_char_start_regexp);

diff --git a/src/regexp/mips/regexp-macro-assembler-mips.h b/src/regexp/mips/regexp-macro-assembler-mips.h
index 902e220..6dedb1e 100644
--- a/src/regexp/mips/regexp-macro-assembler-mips.h
+++ b/src/regexp/mips/regexp-macro-assembler-mips.h

@@ -37,7 +37,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,

diff --git a/src/regexp/mips64/regexp-macro-assembler-mips64.cc b/src/regexp/mips64/regexp-macro-assembler-mips64.cc
index 5153bd0..bf95a9c 100644
--- a/src/regexp/mips64/regexp-macro-assembler-mips64.cc
+++ b/src/regexp/mips64/regexp-macro-assembler-mips64.cc

@@ -251,7 +251,7 @@
 
 
 void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ ld(a0, register_location(start_reg));  // Index of start of capture.
   __ ld(a1, register_location(start_reg + 1));  // Index of end of capture.
@@ -346,7 +346,7 @@
     //   a0: Address byte_offset1 - Address captured substring's start.
     //   a1: Address byte_offset2 - Address of current character position.
     //   a2: size_t byte_length - length of capture in bytes(!).
-    //   a3: Isolate* isolate.
+    //   a3: Isolate* isolate or 0 if unicode flag.
 
     // Address of start of capture.
     __ Daddu(a0, a0, Operand(end_of_input_address()));
@@ -360,7 +360,14 @@
       __ Dsubu(a1, a1, Operand(s3));
     }
     // Isolate.
-    __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ mov(a3, zero_reg);
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
+    }
 
     {
       AllowExternalCallThatCantCauseGC scope(masm_);
@@ -664,10 +671,7 @@
         s3.bit() | s4.bit() | s5.bit() | s6.bit() | s7.bit() | fp.bit();
     RegList argument_registers = a0.bit() | a1.bit() | a2.bit() | a3.bit();
 
-    if (kMipsAbi == kN64) {
-      // TODO(plind): Should probably alias a4-a7, for clarity.
-      argument_registers |= a4.bit() | a5.bit() | a6.bit() | a7.bit();
-    }
+    argument_registers |= a4.bit() | a5.bit() | a6.bit() | a7.bit();
 
     __ MultiPush(argument_registers | registers_to_retain | ra.bit());
     // Set frame pointer in space for it if this is not a direct call
@@ -841,9 +845,12 @@
           __ Branch(&exit_label_, eq, current_input_offset(),
                     Operand(zero_reg));
           // Advance current position after a zero-length match.
+          Label advance;
+          __ bind(&advance);
           __ Daddu(current_input_offset(),
                   current_input_offset(),
                   Operand((mode_ == UC16) ? 2 : 1));
+          if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
         }
 
         __ Branch(&load_char_start_regexp);

diff --git a/src/regexp/mips64/regexp-macro-assembler-mips64.h b/src/regexp/mips64/regexp-macro-assembler-mips64.h
index 9a8ca17..df2c6c5 100644
--- a/src/regexp/mips64/regexp-macro-assembler-mips64.h
+++ b/src/regexp/mips64/regexp-macro-assembler-mips64.h

@@ -37,7 +37,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,
@@ -96,7 +96,6 @@
   void print_regexp_frame_constants();
 
  private:
-#if defined(MIPS_ABI_N64)
   // Offsets from frame_pointer() of function parameters and stored registers.
   static const int kFramePointer = 0;
 
@@ -105,7 +104,7 @@
   static const int kStoredRegisters = kFramePointer;
   // Return address (stored from link register, read into pc on return).
 
-// TODO(plind): This 9 - is 8 s-regs (s0..s7) plus fp.
+  // TODO(plind): This 9 - is 8 s-regs (s0..s7) plus fp.
 
   static const int kReturnAddress = kStoredRegisters + 9 * kPointerSize;
   static const int kSecondaryReturnAddress = kReturnAddress + kPointerSize;
@@ -131,43 +130,6 @@
   // First register address. Following registers are below it on the stack.
   static const int kRegisterZero = kStringStartMinusOne - kPointerSize;
 
-#elif defined(MIPS_ABI_O32)
-  // Offsets from frame_pointer() of function parameters and stored registers.
-  static const int kFramePointer = 0;
-
-  // Above the frame pointer - Stored registers and stack passed parameters.
-  // Registers s0 to s7, fp, and ra.
-  static const int kStoredRegisters = kFramePointer;
-  // Return address (stored from link register, read into pc on return).
-  static const int kReturnAddress = kStoredRegisters + 9 * kPointerSize;
-  static const int kSecondaryReturnAddress = kReturnAddress + kPointerSize;
-  // Stack frame header.
-  static const int kStackFrameHeader = kReturnAddress + kPointerSize;
-  // Stack parameters placed by caller.
-  static const int kRegisterOutput =
-      kStackFrameHeader + 4 * kPointerSize + kPointerSize;
-  static const int kNumOutputRegisters = kRegisterOutput + kPointerSize;
-  static const int kStackHighEnd = kNumOutputRegisters + kPointerSize;
-  static const int kDirectCall = kStackHighEnd + kPointerSize;
-  static const int kIsolate = kDirectCall + kPointerSize;
-
-  // Below the frame pointer.
-  // Register parameters stored by setup code.
-  static const int kInputEnd = kFramePointer - kPointerSize;
-  static const int kInputStart = kInputEnd - kPointerSize;
-  static const int kStartIndex = kInputStart - kPointerSize;
-  static const int kInputString = kStartIndex - kPointerSize;
-  // When adding local variables remember to push space for them in
-  // the frame in GetCode.
-  static const int kSuccessfulCaptures = kInputString - kPointerSize;
-  static const int kStringStartMinusOne = kSuccessfulCaptures - kPointerSize;
-  // First register address. Following registers are below it on the stack.
-  static const int kRegisterZero = kStringStartMinusOne - kPointerSize;
-
-#else
-# error "undefined MIPS ABI"
-#endif
-
   // Initial size of code buffer.
   static const size_t kRegExpCodeSize = 1024;
 

diff --git a/src/regexp/ppc/regexp-macro-assembler-ppc.cc b/src/regexp/ppc/regexp-macro-assembler-ppc.cc
index f3ddf7b..c05c580 100644
--- a/src/regexp/ppc/regexp-macro-assembler-ppc.cc
+++ b/src/regexp/ppc/regexp-macro-assembler-ppc.cc

@@ -225,9 +225,8 @@
   BranchOrBacktrack(eq, on_equal);
 }
 
-
 void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ LoadP(r3, register_location(start_reg), r0);  // Index of start of capture
   __ LoadP(r4, register_location(start_reg + 1), r0);  // Index of end
@@ -322,7 +321,7 @@
     //   r3: Address byte_offset1 - Address captured substring's start.
     //   r4: Address byte_offset2 - Address of current character position.
     //   r5: size_t byte_length - length of capture in bytes(!)
-    //   r6: Isolate* isolate
+    //   r6: Isolate* isolate or 0 if unicode flag.
 
     // Address of start of capture.
     __ add(r3, r3, end_of_input_address());
@@ -336,7 +335,14 @@
       __ sub(r4, r4, r25);
     }
     // Isolate.
-    __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ li(r6, Operand::Zero());
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
+    }
 
     {
       AllowExternalCallThatCantCauseGC scope(masm_);
@@ -845,8 +851,11 @@
           __ cmpi(current_input_offset(), Operand::Zero());
           __ beq(&exit_label_);
           // Advance current position after a zero-length match.
+          Label advance;
+          __ bind(&advance);
           __ addi(current_input_offset(), current_input_offset(),
                   Operand((mode_ == UC16) ? 2 : 1));
+          if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
         }
 
         __ b(&load_char_start_regexp);

diff --git a/src/regexp/ppc/regexp-macro-assembler-ppc.h b/src/regexp/ppc/regexp-macro-assembler-ppc.h
index 4d1836f..d281387 100644
--- a/src/regexp/ppc/regexp-macro-assembler-ppc.h
+++ b/src/regexp/ppc/regexp-macro-assembler-ppc.h

@@ -38,7 +38,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,

diff --git a/src/regexp/regexp-ast.cc b/src/regexp/regexp-ast.cc
index 31c93b1..b5c2bb6 100644
--- a/src/regexp/regexp-ast.cc
+++ b/src/regexp/regexp-ast.cc

@@ -172,9 +172,9 @@
 
 
 void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
-  os_ << AsUC16(that.from());
+  os_ << AsUC32(that.from());
   if (!that.IsSingleton()) {
-    os_ << "-" << AsUC16(that.to());
+    os_ << "-" << AsUC32(that.to());
   }
 }
 

diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
index f877785..0e718d3 100644
--- a/src/regexp/regexp-ast.h
+++ b/src/regexp/regexp-ast.h

@@ -5,6 +5,7 @@
 #ifndef V8_REGEXP_REGEXP_AST_H_
 #define V8_REGEXP_REGEXP_AST_H_
 
+#include "src/objects.h"
 #include "src/utils.h"
 #include "src/zone.h"
 
@@ -77,33 +78,38 @@
   CharacterRange() : from_(0), to_(0) {}
   // For compatibility with the CHECK_OK macro
   CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
-  CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                              Zone* zone);
   static Vector<const int> GetWordBounds();
-  static inline CharacterRange Singleton(uc16 value) {
+  static inline CharacterRange Singleton(uc32 value) {
     return CharacterRange(value, value);
   }
-  static inline CharacterRange Range(uc16 from, uc16 to) {
-    DCHECK(from <= to);
+  static inline CharacterRange Range(uc32 from, uc32 to) {
+    DCHECK(0 <= from && to <= String::kMaxCodePoint);
+    DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
     return CharacterRange(from, to);
   }
   static inline CharacterRange Everything() {
-    return CharacterRange(0, 0xFFFF);
+    return CharacterRange(0, String::kMaxCodePoint);
   }
-  bool Contains(uc16 i) { return from_ <= i && i <= to_; }
-  uc16 from() const { return from_; }
-  void set_from(uc16 value) { from_ = value; }
-  uc16 to() const { return to_; }
-  void set_to(uc16 value) { to_ = value; }
+  static inline ZoneList<CharacterRange>* List(Zone* zone,
+                                               CharacterRange range) {
+    ZoneList<CharacterRange>* list =
+        new (zone) ZoneList<CharacterRange>(1, zone);
+    list->Add(range, zone);
+    return list;
+  }
+  bool Contains(uc32 i) { return from_ <= i && i <= to_; }
+  uc32 from() const { return from_; }
+  void set_from(uc32 value) { from_ = value; }
+  uc32 to() const { return to_; }
+  void set_to(uc32 value) { to_ = value; }
   bool is_valid() { return from_ <= to_; }
   bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
   bool IsSingleton() { return (from_ == to_); }
-  void AddCaseEquivalents(Isolate* isolate, Zone* zone,
-                          ZoneList<CharacterRange>* ranges, bool is_one_byte);
-  static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
-                    ZoneList<CharacterRange>** included,
-                    ZoneList<CharacterRange>** excluded, Zone* zone);
+  static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
+                                 ZoneList<CharacterRange>* ranges,
+                                 bool is_one_byte);
   // Whether a range list is in canonical form: Ranges ordered by from value,
   // and ranges non-overlapping and non-adjacent.
   static bool IsCanonical(ZoneList<CharacterRange>* ranges);
@@ -119,8 +125,10 @@
   static const int kPayloadMask = (1 << 24) - 1;
 
  private:
-  uc16 from_;
-  uc16 to_;
+  CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
+
+  uc32 from_;
+  uc32 to_;
 };
 
 
@@ -303,8 +311,8 @@
   // W : non-ASCII word character
   // d : ASCII digit
   // D : non-ASCII digit
-  // . : non-unicode non-newline
-  // * : All characters
+  // . : non-newline
+  // * : All characters, for advancing unanchored regexp
   uc16 standard_type() { return set_.standard_set_type(); }
   ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
   bool is_negated() { return is_negated_; }
@@ -451,6 +459,22 @@
   int capture_from() { return capture_from_; }
   Type type() { return type_; }
 
+  class Builder {
+   public:
+    Builder(bool is_positive, RegExpNode* on_success,
+            int stack_pointer_register, int position_register,
+            int capture_register_count = 0, int capture_register_start = 0);
+    RegExpNode* on_match_success() { return on_match_success_; }
+    RegExpNode* ForMatch(RegExpNode* match);
+
+   private:
+    bool is_positive_;
+    RegExpNode* on_match_success_;
+    RegExpNode* on_success_;
+    int stack_pointer_register_;
+    int position_register_;
+  };
+
  private:
   RegExpTree* body_;
   bool is_positive_;

diff --git a/src/regexp/regexp-macro-assembler-irregexp-inl.h b/src/regexp/regexp-macro-assembler-irregexp-inl.h
index 4d0b1bc..a602129 100644
--- a/src/regexp/regexp-macro-assembler-irregexp-inl.h
+++ b/src/regexp/regexp-macro-assembler-irregexp-inl.h

@@ -5,14 +5,14 @@
 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_
 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_
 
+#ifdef V8_INTERPRETED_REGEXP
+
 #include "src/ast/ast.h"
 #include "src/regexp/bytecodes-irregexp.h"
 
 namespace v8 {
 namespace internal {
 
-#ifdef V8_INTERPRETED_REGEXP
-
 void RegExpMacroAssemblerIrregexp::Emit(uint32_t byte,
                                         uint32_t twenty_four_bits) {
   uint32_t word = ((twenty_four_bits << BYTECODE_SHIFT) | byte);
@@ -54,9 +54,9 @@
   pc_ += 4;
 }
 
-#endif  // V8_INTERPRETED_REGEXP
-
 }  // namespace internal
 }  // namespace v8
 
+#endif  // V8_INTERPRETED_REGEXP
+
 #endif  // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_INL_H_

diff --git a/src/regexp/regexp-macro-assembler-irregexp.cc b/src/regexp/regexp-macro-assembler-irregexp.cc
index 751ee44..a0bb5e7 100644
--- a/src/regexp/regexp-macro-assembler-irregexp.cc
+++ b/src/regexp/regexp-macro-assembler-irregexp.cc

@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#ifdef V8_INTERPRETED_REGEXP
+
 #include "src/regexp/regexp-macro-assembler-irregexp.h"
 
 #include "src/ast/ast.h"
@@ -9,12 +11,9 @@
 #include "src/regexp/regexp-macro-assembler.h"
 #include "src/regexp/regexp-macro-assembler-irregexp-inl.h"
 
-
 namespace v8 {
 namespace internal {
 
-#ifdef V8_INTERPRETED_REGEXP
-
 RegExpMacroAssemblerIrregexp::RegExpMacroAssemblerIrregexp(Isolate* isolate,
                                                            Vector<byte> buffer,
                                                            Zone* zone)
@@ -382,11 +381,13 @@
 
 
 void RegExpMacroAssemblerIrregexp::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_not_equal) {
+    int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
   DCHECK(start_reg >= 0);
   DCHECK(start_reg <= kMaxRegister);
-  Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
-                     : BC_CHECK_NOT_BACK_REF_NO_CASE,
+  Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
+                                : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
+                     : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
+                                : BC_CHECK_NOT_BACK_REF_NO_CASE),
        start_reg);
   EmitOrLink(on_not_equal);
 }
@@ -454,7 +455,7 @@
   }
 }
 
-#endif  // V8_INTERPRETED_REGEXP
-
 }  // namespace internal
 }  // namespace v8
+
+#endif  // V8_INTERPRETED_REGEXP

diff --git a/src/regexp/regexp-macro-assembler-irregexp.h b/src/regexp/regexp-macro-assembler-irregexp.h
index f1ace63..dad2e9a 100644
--- a/src/regexp/regexp-macro-assembler-irregexp.h
+++ b/src/regexp/regexp-macro-assembler-irregexp.h

@@ -5,13 +5,13 @@
 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_
 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_
 
+#ifdef V8_INTERPRETED_REGEXP
+
 #include "src/regexp/regexp-macro-assembler.h"
 
 namespace v8 {
 namespace internal {
 
-#ifdef V8_INTERPRETED_REGEXP
-
 // A light-weight assembler for the Irregexp byte code.
 class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
  public:
@@ -85,7 +85,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
   virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
@@ -125,9 +125,9 @@
   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpMacroAssemblerIrregexp);
 };
 
-#endif  // V8_INTERPRETED_REGEXP
-
 }  // namespace internal
 }  // namespace v8
 
+#endif  // V8_INTERPRETED_REGEXP
+
 #endif  // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_IRREGEXP_H_

diff --git a/src/regexp/regexp-macro-assembler-tracer.cc b/src/regexp/regexp-macro-assembler-tracer.cc
index 5301ead..ec86526 100644
--- a/src/regexp/regexp-macro-assembler-tracer.cc
+++ b/src/regexp/regexp-macro-assembler-tracer.cc

@@ -360,11 +360,11 @@
 
 
 void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
-  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
          start_reg, read_backward ? "backward" : "forward",
-         LabelToInt(on_no_match));
-  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
+         unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
+  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
                                               on_no_match);
 }
 

diff --git a/src/regexp/regexp-macro-assembler-tracer.h b/src/regexp/regexp-macro-assembler-tracer.h
index 77377aa..8a9ebe3 100644
--- a/src/regexp/regexp-macro-assembler-tracer.h
+++ b/src/regexp/regexp-macro-assembler-tracer.h

@@ -34,7 +34,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(unsigned c,

diff --git a/src/regexp/regexp-macro-assembler.cc b/src/regexp/regexp-macro-assembler.cc
index caf8b51..9bb5073 100644
--- a/src/regexp/regexp-macro-assembler.cc
+++ b/src/regexp/regexp-macro-assembler.cc

@@ -9,6 +9,10 @@
 #include "src/regexp/regexp-stack.h"
 #include "src/simulator.h"
 
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uchar.h"
+#endif  // V8_I18N_SUPPORT
+
 namespace v8 {
 namespace internal {
 
@@ -23,6 +27,80 @@
 }
 
 
+int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
+                                                     Address byte_offset2,
+                                                     size_t byte_length,
+                                                     Isolate* isolate) {
+  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+      isolate->regexp_macro_assembler_canonicalize();
+  // This function is not allowed to cause a garbage collection.
+  // A GC might move the calling generated code and invalidate the
+  // return address on the stack.
+  DCHECK(byte_length % 2 == 0);
+  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
+  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
+  size_t length = byte_length >> 1;
+
+#ifdef V8_I18N_SUPPORT
+  if (isolate == nullptr) {
+    for (size_t i = 0; i < length; i++) {
+      uc32 c1 = substring1[i];
+      uc32 c2 = substring2[i];
+      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
+        // Non-BMP characters do not have case-equivalents in the BMP.
+        // Both have to be non-BMP for them to be able to match.
+        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
+        if (i + 1 < length) {
+          uc16 c1t = substring1[i + 1];
+          uc16 c2t = substring2[i + 1];
+          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
+              unibrow::Utf16::IsTrailSurrogate(c2t)) {
+            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
+            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
+            i++;
+          }
+        }
+      }
+      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
+      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
+      if (c1 != c2) return 0;
+    }
+    return 1;
+  }
+#endif  // V8_I18N_SUPPORT
+  DCHECK_NOT_NULL(isolate);
+  for (size_t i = 0; i < length; i++) {
+    unibrow::uchar c1 = substring1[i];
+    unibrow::uchar c2 = substring2[i];
+    if (c1 != c2) {
+      unibrow::uchar s1[1] = {c1};
+      canonicalize->get(c1, '\0', s1);
+      if (s1[0] != c2) {
+        unibrow::uchar s2[1] = {c2};
+        canonicalize->get(c2, '\0', s2);
+        if (s1[0] != s2[0]) {
+          return 0;
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+
+void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
+                                                   Label* on_failure) {
+  Label ok;
+  // Check that current character is not a trail surrogate.
+  LoadCurrentCharacter(cp_offset, &ok);
+  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
+  // Check that previous character is not a lead surrogate.
+  LoadCurrentCharacter(cp_offset - 1, &ok);
+  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
+  Bind(&ok);
+}
+
+
 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
 
 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
@@ -245,40 +323,6 @@
 };
 
 
-int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
-    Address byte_offset1,
-    Address byte_offset2,
-    size_t byte_length,
-    Isolate* isolate) {
-  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
-      isolate->regexp_macro_assembler_canonicalize();
-  // This function is not allowed to cause a garbage collection.
-  // A GC might move the calling generated code and invalidate the
-  // return address on the stack.
-  DCHECK(byte_length % 2 == 0);
-  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
-  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
-  size_t length = byte_length >> 1;
-
-  for (size_t i = 0; i < length; i++) {
-    unibrow::uchar c1 = substring1[i];
-    unibrow::uchar c2 = substring2[i];
-    if (c1 != c2) {
-      unibrow::uchar s1[1] = { c1 };
-      canonicalize->get(c1, '\0', s1);
-      if (s1[0] != c2) {
-        unibrow::uchar s2[1] = { c2 };
-        canonicalize->get(c2, '\0', s2);
-        if (s1[0] != s2[0]) {
-          return 0;
-        }
-      }
-    }
-  }
-  return 1;
-}
-
-
 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
                                               Address* stack_base,
                                               Isolate* isolate) {

diff --git a/src/regexp/regexp-macro-assembler.h b/src/regexp/regexp-macro-assembler.h
index 2059933..6f79a16 100644
--- a/src/regexp/regexp-macro-assembler.h
+++ b/src/regexp/regexp-macro-assembler.h

@@ -11,6 +11,13 @@
 namespace v8 {
 namespace internal {
 
+static const uc32 kLeadSurrogateStart = 0xd800;
+static const uc32 kLeadSurrogateEnd = 0xdbff;
+static const uc32 kTrailSurrogateStart = 0xdc00;
+static const uc32 kTrailSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+static const uc32 kNonBmpEnd = 0x10ffff;
+
 struct DisjunctDecisionRow {
   RegExpCharacterClass cc;
   Label* on_match;
@@ -76,7 +83,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match) = 0;
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match) = 0;
   // Check the current character for a match with a literal character.  If we
   // fail to match then goto the on_failure label.  End of input always
@@ -146,25 +153,40 @@
   virtual void ClearRegisters(int reg_from, int reg_to) = 0;
   virtual void WriteStackPointerToRegister(int reg) = 0;
 
+  // Compares two-byte strings case insensitively.
+  // Called from generated RegExp code.
+  static int CaseInsensitiveCompareUC16(Address byte_offset1,
+                                        Address byte_offset2,
+                                        size_t byte_length, Isolate* isolate);
+
+  // Check that we are not in the middle of a surrogate pair.
+  void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
+
   // Controls the generation of large inlined constants in the code.
   void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
   bool slow_safe() { return slow_safe_compiler_; }
 
-  enum GlobalMode { NOT_GLOBAL, GLOBAL, GLOBAL_NO_ZERO_LENGTH_CHECK };
+  enum GlobalMode {
+    NOT_GLOBAL,
+    GLOBAL_NO_ZERO_LENGTH_CHECK,
+    GLOBAL,
+    GLOBAL_UNICODE
+  };
   // Set whether the regular expression has the global flag.  Exiting due to
   // a failure in a global regexp may still mean success overall.
   inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
   inline bool global() { return global_mode_ != NOT_GLOBAL; }
   inline bool global_with_zero_length_check() {
-    return global_mode_ == GLOBAL;
+    return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
   }
+  inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
 
   Isolate* isolate() const { return isolate_; }
   Zone* zone() const { return zone_; }
 
  private:
   bool slow_safe_compiler_;
-  bool global_mode_;
+  GlobalMode global_mode_;
   Isolate* isolate_;
   Zone* zone_;
 };
@@ -199,13 +221,6 @@
                       int previous_index,
                       Isolate* isolate);
 
-  // Compares two-byte strings case insensitively.
-  // Called from generated RegExp code.
-  static int CaseInsensitiveCompareUC16(Address byte_offset1,
-                                        Address byte_offset2,
-                                        size_t byte_length,
-                                        Isolate* isolate);
-
   // Called from RegExp if the backtrack stack limit is hit.
   // Tries to expand the stack. Returns the new stack-pointer if
   // successful, and updates the stack_top address, or returns 0 if unable

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index fa89003..46c593c 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc

@@ -8,27 +8,32 @@
 #include "src/factory.h"
 #include "src/isolate.h"
 #include "src/objects-inl.h"
+#include "src/ostreams.h"
 #include "src/regexp/jsregexp.h"
 #include "src/utils.h"
 
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#endif  // V8_I18N_SUPPORT
+
 namespace v8 {
 namespace internal {
 
 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
-                           bool multiline, bool unicode, Isolate* isolate,
-                           Zone* zone)
+                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
     : isolate_(isolate),
       zone_(zone),
       error_(error),
       captures_(NULL),
       in_(in),
       current_(kEndMarker),
+      ignore_case_(flags & JSRegExp::kIgnoreCase),
+      multiline_(flags & JSRegExp::kMultiline),
+      unicode_(flags & JSRegExp::kUnicode),
       next_pos_(0),
       captures_started_(0),
       capture_count_(0),
       has_more_(true),
-      multiline_(multiline),
-      unicode_(unicode),
       simple_(false),
       contains_anchor_(false),
       is_scanned_for_captures_(false),
@@ -36,10 +41,28 @@
   Advance();
 }
 
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
+  int position = next_pos_;
+  uc32 c0 = in()->Get(position);
+  position++;
+  // Read the whole surrogate pair in case of unicode flag, if possible.
+  if (unicode() && position < in()->length() &&
+      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+    uc16 c1 = in()->Get(position);
+    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+      c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+      position++;
+    }
+  }
+  if (update_position) next_pos_ = position;
+  return c0;
+}
+
 
 uc32 RegExpParser::Next() {
   if (has_next()) {
-    return in()->Get(next_pos_);
+    return ReadNext<false>();
   } else {
     return kEndMarker;
   }
@@ -47,25 +70,14 @@
 
 
 void RegExpParser::Advance() {
-  if (next_pos_ < in()->length()) {
+  if (has_next()) {
     StackLimitCheck check(isolate());
     if (check.HasOverflowed()) {
       ReportError(CStrVector(Isolate::kStackOverflowMessage));
     } else if (zone()->excess_allocation()) {
       ReportError(CStrVector("Regular expression too large"));
     } else {
-      current_ = in()->Get(next_pos_);
-      next_pos_++;
-      // Read the whole surrogate pair in case of unicode flag, if possible.
-      if (unicode_ && next_pos_ < in()->length() &&
-          unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
-        uc16 trail = in()->Get(next_pos_);
-        if (unibrow::Utf16::IsTrailSurrogate(trail)) {
-          current_ = unibrow::Utf16::CombineSurrogatePair(
-              static_cast<uc16>(current_), trail);
-          next_pos_++;
-        }
-      }
+      current_ = ReadNext<true>();
     }
   } else {
     current_ = kEndMarker;
@@ -92,11 +104,28 @@
 
 bool RegExpParser::simple() { return simple_; }
 
-
-bool RegExpParser::IsSyntaxCharacter(uc32 c) {
-  return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
-         c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
-         c == '{' || c == '}' || c == '|';
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      break;
+  }
+  return false;
 }
 
 
@@ -142,7 +171,7 @@
 RegExpTree* RegExpParser::ParseDisjunction() {
   // Used to store current state while parsing subexpressions.
   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
-                                  zone());
+                                  ignore_case(), unicode(), zone());
   RegExpParserState* state = &initial_state;
   // Cache the builder in a local variable for quick access.
   RegExpBuilder* builder = initial_state.builder();
@@ -151,14 +180,14 @@
       case kEndMarker:
         if (state->IsSubexpression()) {
           // Inside a parenthesized group when hitting end of input.
-          ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+          return ReportError(CStrVector("Unterminated group"));
         }
         DCHECK_EQ(INITIAL, state->group_type());
         // Parsing completed successfully.
         return builder->ToRegExp();
       case ')': {
         if (!state->IsSubexpression()) {
-          ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
+          return ReportError(CStrVector("Unmatched ')'"));
         }
         DCHECK_NE(INITIAL, state->group_type());
 
@@ -206,7 +235,7 @@
         return ReportError(CStrVector("Nothing to repeat"));
       case '^': {
         Advance();
-        if (multiline_) {
+        if (multiline()) {
           builder->AddAssertion(
               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
         } else {
@@ -219,8 +248,8 @@
       case '$': {
         Advance();
         RegExpAssertion::AssertionType assertion_type =
-            multiline_ ? RegExpAssertion::END_OF_LINE
-                       : RegExpAssertion::END_OF_INPUT;
+            multiline() ? RegExpAssertion::END_OF_LINE
+                        : RegExpAssertion::END_OF_INPUT;
         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
         continue;
       }
@@ -230,8 +259,9 @@
         ZoneList<CharacterRange>* ranges =
             new (zone()) ZoneList<CharacterRange>(2, zone());
         CharacterRange::AddClassEscape('.', ranges, zone());
-        RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
-        builder->AddAtom(atom);
+        RegExpCharacterClass* cc =
+            new (zone()) RegExpCharacterClass(ranges, false);
+        builder->AddCharacterClass(cc);
         break;
       }
       case '(': {
@@ -265,25 +295,25 @@
               }
             // Fall through.
             default:
-              ReportError(CStrVector("Invalid group") CHECK_FAILED);
-              break;
+              return ReportError(CStrVector("Invalid group"));
           }
           Advance(2);
         } else {
           if (captures_started_ >= kMaxCaptures) {
-            ReportError(CStrVector("Too many captures") CHECK_FAILED);
+            return ReportError(CStrVector("Too many captures"));
           }
           captures_started_++;
         }
         // Store current state and begin new disjunction parsing.
         state = new (zone()) RegExpParserState(
-            state, subexpr_type, lookaround_type, captures_started_, zone());
+            state, subexpr_type, lookaround_type, captures_started_,
+            ignore_case(), unicode(), zone());
         builder = state->builder();
         continue;
       }
       case '[': {
-        RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
-        builder->AddAtom(atom);
+        RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
+        builder->AddCharacterClass(cc->AsCharacterClass());
         break;
       }
       // Atom ::
@@ -318,8 +348,30 @@
             ZoneList<CharacterRange>* ranges =
                 new (zone()) ZoneList<CharacterRange>(2, zone());
             CharacterRange::AddClassEscape(c, ranges, zone());
-            RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
-            builder->AddAtom(atom);
+            RegExpCharacterClass* cc =
+                new (zone()) RegExpCharacterClass(ranges, false);
+            builder->AddCharacterClass(cc);
+            break;
+          }
+          case 'p':
+          case 'P': {
+            uc32 p = Next();
+            Advance(2);
+            if (unicode()) {
+              if (FLAG_harmony_regexp_property) {
+                ZoneList<CharacterRange>* ranges = ParsePropertyClass();
+                if (ranges == nullptr) {
+                  return ReportError(CStrVector("Invalid property name"));
+                }
+                RegExpCharacterClass* cc =
+                    new (zone()) RegExpCharacterClass(ranges, p == 'P');
+                builder->AddCharacterClass(cc);
+              } else {
+                return ReportError(CStrVector("Invalid escape"));
+              }
+            } else {
+              builder->AddCharacter(p);
+            }
             break;
           }
           case '1':
@@ -332,7 +384,8 @@
           case '8':
           case '9': {
             int index = 0;
-            if (ParseBackReferenceIndex(&index)) {
+            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+            if (is_backref) {
               if (state->IsInsideCaptureGroup(index)) {
                 // The back reference is inside the capture group it refers to.
                 // Nothing can possibly have been captured yet, so we use empty
@@ -347,24 +400,25 @@
               }
               break;
             }
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (unicode()) {
+              return ReportError(CStrVector("Invalid escape"));
+            }
             uc32 first_digit = Next();
             if (first_digit == '8' || first_digit == '9') {
-              // If the 'u' flag is present, only syntax characters can be
-              // escaped,
-              // no other identity escapes are allowed. If the 'u' flag is not
-              // present, all identity escapes are allowed.
-              if (!unicode_) {
-                builder->AddCharacter(first_digit);
-                Advance(2);
-              } else {
-                return ReportError(CStrVector("Invalid escape"));
-              }
+              builder->AddCharacter(first_digit);
+              Advance(2);
               break;
             }
           }
           // FALLTHROUGH
           case '0': {
             Advance();
+            if (unicode() && Next() >= '0' && Next() <= '9') {
+              // With /u, decimal escape with leading 0 are not parsed as octal.
+              return ReportError(CStrVector("Invalid decimal escape"));
+            }
             uc32 octal = ParseOctalLiteral();
             builder->AddCharacter(octal);
             break;
@@ -402,6 +456,10 @@
               // This is outside the specification. We match JSC in
               // reading the backslash as a literal character instead
               // of as starting an escape.
+              if (unicode()) {
+                // With /u, invalid escapes are not treated as identity escapes.
+                return ReportError(CStrVector("Invalid unicode escape"));
+              }
               builder->AddCharacter('\\');
             } else {
               Advance(2);
@@ -414,11 +472,10 @@
             uc32 value;
             if (ParseHexEscape(2, &value)) {
               builder->AddCharacter(value);
-            } else if (!unicode_) {
+            } else if (!unicode()) {
               builder->AddCharacter('x');
             } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
               return ReportError(CStrVector("Invalid escape"));
             }
             break;
@@ -427,24 +484,20 @@
             Advance(2);
             uc32 value;
             if (ParseUnicodeEscape(&value)) {
-              builder->AddUnicodeCharacter(value);
-            } else if (!unicode_) {
+              builder->AddEscapedUnicodeCharacter(value);
+            } else if (!unicode()) {
               builder->AddCharacter('u');
             } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
               return ReportError(CStrVector("Invalid unicode escape"));
             }
             break;
           }
           default:
             Advance();
-            // If the 'u' flag is present, only syntax characters can be
-            // escaped, no
-            // other identity escapes are allowed. If the 'u' flag is not
-            // present,
-            // all identity escapes are allowed.
-            if (!unicode_ || IsSyntaxCharacter(current())) {
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
               builder->AddCharacter(current());
               Advance();
             } else {
@@ -456,10 +509,16 @@
       case '{': {
         int dummy;
         if (ParseIntervalQuantifier(&dummy, &dummy)) {
-          ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
+          return ReportError(CStrVector("Nothing to repeat"));
         }
         // fallthrough
       }
+      case '}':
+      case ']':
+        if (unicode()) {
+          return ReportError(CStrVector("Lone quantifier brackets"));
+        }
+      // fallthrough
       default:
         builder->AddUnicodeCharacter(current());
         Advance();
@@ -492,13 +551,15 @@
       case '{':
         if (ParseIntervalQuantifier(&min, &max)) {
           if (max < min) {
-            ReportError(CStrVector("numbers out of order in {} quantifier.")
-                            CHECK_FAILED);
+            return ReportError(
+                CStrVector("numbers out of order in {} quantifier"));
           }
           break;
-        } else {
-          continue;
+        } else if (unicode()) {
+          // With /u, incomplete quantifiers are not allowed.
+          return ReportError(CStrVector("Incomplete quantifier"));
         }
+        continue;
       default:
         continue;
     }
@@ -511,7 +572,9 @@
       quantifier_type = RegExpQuantifier::POSSESSIVE;
       Advance();
     }
-    builder->AddQuantifierToAtom(min, max, quantifier_type);
+    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+      return ReportError(CStrVector("Invalid quantifier"));
+    }
   }
 }
 
@@ -740,12 +803,12 @@
   return true;
 }
 
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   // allowed). In the latter case, the number of hex digits between { } is
   // arbitrary. \ and u have already been read.
-  if (current() == '{' && unicode_) {
+  if (current() == '{' && unicode()) {
     int start = position();
     Advance();
     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -758,9 +821,75 @@
     return false;
   }
   // \u but no {, or \u{...} escapes not allowed.
-  return ParseHexEscape(4, value);
+  bool result = ParseHexEscape(4, value);
+  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+      current() == '\\') {
+    // Attempt to read trail surrogate.
+    int start = position();
+    if (Next() == 'u') {
+      Advance(2);
+      uc32 trail;
+      if (ParseHexEscape(4, &trail) &&
+          unibrow::Utf16::IsTrailSurrogate(trail)) {
+        *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+                                                      static_cast<uc16>(trail));
+        return true;
+      }
+    }
+    Reset(start);
+  }
+  return result;
 }
 
+ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
+#ifdef V8_I18N_SUPPORT
+  char property_name[3];
+  memset(property_name, 0, sizeof(property_name));
+  if (current() == '{') {
+    Advance();
+    if (current() < 'A' || current() > 'Z') return nullptr;
+    property_name[0] = static_cast<char>(current());
+    Advance();
+    if (current() >= 'a' && current() <= 'z') {
+      property_name[1] = static_cast<char>(current());
+      Advance();
+    }
+    if (current() != '}') return nullptr;
+  } else if (current() >= 'A' && current() <= 'Z') {
+    property_name[0] = static_cast<char>(current());
+  } else {
+    return nullptr;
+  }
+  Advance();
+
+  int32_t category =
+      u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
+  if (category == UCHAR_INVALID_CODE) return nullptr;
+
+  USet* set = uset_openEmpty();
+  UErrorCode ec = U_ZERO_ERROR;
+  uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
+  ZoneList<CharacterRange>* ranges = nullptr;
+  if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
+    uset_removeAllStrings(set);
+    int item_count = uset_getItemCount(set);
+    ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
+    int item_result = 0;
+    for (int i = 0; i < item_count; i++) {
+      uc32 start = 0;
+      uc32 end = 0;
+      item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+      ranges->Add(CharacterRange::Range(start, end), zone());
+    }
+    DCHECK_EQ(U_ZERO_ERROR, ec);
+    DCHECK_EQ(0, item_result);
+  }
+  uset_close(set);
+  return ranges;
+#else   // V8_I18N_SUPPORT
+  return nullptr;
+#endif  // V8_I18N_SUPPORT
+}
 
 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
   uc32 x = 0;
@@ -809,20 +938,35 @@
     case 'c': {
       uc32 controlLetter = Next();
       uc32 letter = controlLetter & ~('A' ^ 'a');
-      // For compatibility with JSC, inside a character class
-      // we also accept digits and underscore as control characters.
-      if ((controlLetter >= '0' && controlLetter <= '9') ||
-          controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
+      // For compatibility with JSC, inside a character class. We also accept
+      // digits and underscore as control characters, unless with /u.
+      if (letter >= 'A' && letter <= 'Z') {
         Advance(2);
         // Control letters mapped to ASCII control characters in the range
         // 0x00-0x1f.
         return controlLetter & 0x1f;
       }
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
+      if ((controlLetter >= '0' && controlLetter <= '9') ||
+          controlLetter == '_') {
+        Advance(2);
+        return controlLetter & 0x1f;
+      }
       // We match JSC in reading the backslash as a literal
       // character instead of as starting an escape.
       return '\\';
     }
     case '0':
+      // With /u, \0 is interpreted as NUL if not followed by another digit.
+      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+        Advance();
+        return 0;
+      }
+    // Fall through.
     case '1':
     case '2':
     case '3':
@@ -833,43 +977,43 @@
       // For compatibility, we interpret a decimal escape that isn't
       // a back reference (and therefore either \0 or not valid according
       // to the specification) as a 1..3 digit octal character code.
+      if (unicode()) {
+        // With /u, decimal escape is not interpreted as octal character code.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
       return ParseOctalLiteral();
     case 'x': {
       Advance();
       uc32 value;
-      if (ParseHexEscape(2, &value)) {
-        return value;
+      if (ParseHexEscape(2, &value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid escape"));
+        return 0;
       }
-      if (!unicode_) {
-        // If \x is not followed by a two-digit hexadecimal, treat it
-        // as an identity escape.
-        return 'x';
-      }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid escape"));
-      return 0;
+      // If \x is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'x';
     }
     case 'u': {
       Advance();
       uc32 value;
-      if (ParseUnicodeEscape(&value)) {
-        return value;
+      if (ParseUnicodeEscape(&value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid unicode escape"));
+        return 0;
       }
-      if (!unicode_) {
-        return 'u';
-      }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid unicode escape"));
-      return 0;
+      // If \u is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'u';
     }
     default: {
       uc32 result = current();
-      // If the 'u' flag is present, only syntax characters can be escaped, no
-      // other identity escapes are allowed. If the 'u' flag is not present, all
-      // identity escapes are allowed.
-      if (!unicode_ || IsSyntaxCharacter(result)) {
+      // With /u, no identity escapes except for syntax characters and '-' are
+      // allowed. Otherwise, all identity escapes are allowed.
+      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
         Advance();
         return result;
       }
@@ -899,13 +1043,13 @@
       case kEndMarker:
         return ReportError(CStrVector("\\ at end of pattern"));
       default:
-        uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
-        return CharacterRange::Singleton(c);
+        first = ParseClassCharacterEscape(CHECK_FAILED);
     }
   } else {
     Advance();
-    return CharacterRange::Singleton(first);
   }
+
+  return CharacterRange::Singleton(first);
 }
 
 
@@ -927,6 +1071,7 @@
 
 RegExpTree* RegExpParser::ParseCharacterClass() {
   static const char* kUnterminated = "Unterminated character class";
+  static const char* kRangeInvalid = "Invalid character class";
   static const char* kRangeOutOfOrder = "Range out of order in character class";
 
   DCHECK_EQ(current(), '[');
@@ -956,13 +1101,18 @@
       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
         // Either end is an escaped character class. Treat the '-' verbatim.
+        if (unicode()) {
+          // ES2015 21.2.2.15.1 step 1.
+          return ReportError(CStrVector(kRangeInvalid));
+        }
         AddRangeOrEscape(ranges, char_class, first, zone());
         ranges->Add(CharacterRange::Singleton('-'), zone());
         AddRangeOrEscape(ranges, char_class_2, next, zone());
         continue;
       }
+      // ES2015 21.2.2.15.1 step 6.
       if (first.from() > next.to()) {
-        return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
+        return ReportError(CStrVector(kRangeOutOfOrder));
       }
       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
     } else {
@@ -970,7 +1120,7 @@
     }
   }
   if (!has_more()) {
-    return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
+    return ReportError(CStrVector(kUnterminated));
   }
   Advance();
   if (ranges->length() == 0) {
@@ -985,10 +1135,10 @@
 
 
 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
-                               FlatStringReader* input, bool multiline,
-                               bool unicode, RegExpCompileData* result) {
+                               FlatStringReader* input, JSRegExp::Flags flags,
+                               RegExpCompileData* result) {
   DCHECK(result != NULL);
-  RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
+  RegExpParser parser(input, &result->error, flags, isolate, zone);
   RegExpTree* tree = parser.ParsePattern();
   if (parser.failed()) {
     DCHECK(tree == NULL);
@@ -1010,11 +1160,13 @@
   return !parser.failed();
 }
 
-
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
     : zone_(zone),
       pending_empty_(false),
+      ignore_case_(ignore_case),
+      unicode_(unicode),
       characters_(NULL),
+      pending_surrogate_(kNoPendingSurrogate),
       terms_(),
       alternatives_()
 #ifdef DEBUG
@@ -1025,7 +1177,51 @@
 }
 
 
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
+  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+  FlushPendingSurrogate();
+  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+  pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
+  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    uc16 lead_surrogate = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+    uc32 combined =
+        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+    if (NeedsDesugaringForIgnoreCase(combined)) {
+      AddCharacterClassForDesugaring(combined);
+    } else {
+      ZoneList<uc16> surrogate_pair(2, zone());
+      surrogate_pair.Add(lead_surrogate, zone());
+      surrogate_pair.Add(trail_surrogate, zone());
+      RegExpAtom* atom =
+          new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+      AddAtom(atom);
+    }
+  } else {
+    pending_surrogate_ = trail_surrogate;
+    FlushPendingSurrogate();
+  }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    DCHECK(unicode());
+    uc32 c = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    AddCharacterClassForDesugaring(c);
+  }
+}
+
+
 void RegExpBuilder::FlushCharacters() {
+  FlushPendingSurrogate();
   pending_empty_ = false;
   if (characters_ != NULL) {
     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1053,31 +1249,61 @@
 
 
 void RegExpBuilder::AddCharacter(uc16 c) {
+  FlushPendingSurrogate();
   pending_empty_ = false;
-  if (characters_ == NULL) {
-    characters_ = new (zone()) ZoneList<uc16>(4, zone());
+  if (NeedsDesugaringForIgnoreCase(c)) {
+    AddCharacterClassForDesugaring(c);
+  } else {
+    if (characters_ == NULL) {
+      characters_ = new (zone()) ZoneList<uc16>(4, zone());
+    }
+    characters_->Add(c, zone());
+    LAST(ADD_CHAR);
   }
-  characters_->Add(c, zone());
-  LAST(ADD_CHAR);
 }
 
 
 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
   if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
-    ZoneList<uc16> surrogate_pair(2, zone());
-    surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
-    surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
-    RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
-    AddAtom(atom);
+    DCHECK(unicode());
+    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+    AddLeadSurrogate(c);
+  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+    AddTrailSurrogate(c);
   } else {
     AddCharacter(static_cast<uc16>(c));
   }
 }
 
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+  // A lead or trail surrogate parsed via escape sequence will not
+  // pair up with any preceding lead or following trail surrogate.
+  FlushPendingSurrogate();
+  AddUnicodeCharacter(character);
+  FlushPendingSurrogate();
+}
 
 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
 
 
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
+  if (NeedsDesugaringForUnicode(cc)) {
+    // With /u, character class needs to be desugared, so it
+    // must be a standalone term instead of being part of a RegExpText.
+    AddTerm(cc);
+  } else {
+    AddAtom(cc);
+  }
+}
+
+void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
+  AddTerm(new (zone()) RegExpCharacterClass(
+      CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+}
+
+
 void RegExpBuilder::AddAtom(RegExpTree* term) {
   if (term->IsEmpty()) {
     AddEmpty();
@@ -1094,6 +1320,13 @@
 }
 
 
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+  FlushText();
+  terms_.Add(term, zone());
+  LAST(ADD_ATOM);
+}
+
+
 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   FlushText();
   terms_.Add(assert, zone());
@@ -1121,6 +1354,47 @@
 }
 
 
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+  if (!unicode()) return false;
+  switch (cc->standard_type()) {
+    case 's':        // white space
+    case 'w':        // ASCII word character
+    case 'd':        // ASCII digit
+      return false;  // These characters do not need desugaring.
+    default:
+      break;
+  }
+  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+  CharacterRange::Canonicalize(ranges);
+  for (int i = ranges->length() - 1; i >= 0; i--) {
+    uc32 from = ranges->at(i).from();
+    uc32 to = ranges->at(i).to();
+    // Check for non-BMP characters.
+    if (to >= kNonBmpStart) return true;
+    // Check for lone surrogates.
+    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+  }
+  return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_I18N_SUPPORT
+  if (unicode() && ignore_case()) {
+    USet* set = uset_open(c, c);
+    uset_closeOver(set, USET_CASE_INSENSITIVE);
+    uset_removeAllStrings(set);
+    bool result = uset_size(set) > 1;
+    uset_close(set);
+    return result;
+  }
+  // In the case where ICU is not included, we act as if the unicode flag is
+  // not set, and do not desugar.
+#endif  // V8_I18N_SUPPORT
+  return false;
+}
+
+
 RegExpTree* RegExpBuilder::ToRegExp() {
   FlushTerms();
   int num_alternatives = alternatives_.length();
@@ -1129,12 +1403,12 @@
   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
 }
 
-
-void RegExpBuilder::AddQuantifierToAtom(
+bool RegExpBuilder::AddQuantifierToAtom(
     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+  FlushPendingSurrogate();
   if (pending_empty_) {
     pending_empty_ = false;
-    return;
+    return true;
   }
   RegExpTree* atom;
   if (characters_ != NULL) {
@@ -1157,23 +1431,26 @@
   } else if (terms_.length() > 0) {
     DCHECK(last_added_ == ADD_ATOM);
     atom = terms_.RemoveLast();
+    // With /u, lookarounds are not quantifiable.
+    if (unicode() && atom->IsLookaround()) return false;
     if (atom->max_match() == 0) {
       // Guaranteed to only match an empty string.
       LAST(ADD_TERM);
       if (min == 0) {
-        return;
+        return true;
       }
       terms_.Add(atom, zone());
-      return;
+      return true;
     }
   } else {
     // Only call immediately after adding an atom or character!
     UNREACHABLE();
-    return;
+    return false;
   }
   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
              zone());
   LAST(ADD_TERM);
+  return true;
 }
 
 }  // namespace internal

diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h
index af9b765..acf783c 100644
--- a/src/regexp/regexp-parser.h
+++ b/src/regexp/regexp-parser.h

@@ -99,28 +99,43 @@
 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
 class RegExpBuilder : public ZoneObject {
  public:
-  explicit RegExpBuilder(Zone* zone);
+  RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
   void AddCharacter(uc16 character);
   void AddUnicodeCharacter(uc32 character);
+  void AddEscapedUnicodeCharacter(uc32 character);
   // "Adds" an empty expression. Does nothing except consume a
   // following quantifier
   void AddEmpty();
+  void AddCharacterClass(RegExpCharacterClass* cc);
+  void AddCharacterClassForDesugaring(uc32 c);
   void AddAtom(RegExpTree* tree);
+  void AddTerm(RegExpTree* tree);
   void AddAssertion(RegExpTree* tree);
   void NewAlternative();  // '|'
-  void AddQuantifierToAtom(int min, int max,
+  bool AddQuantifierToAtom(int min, int max,
                            RegExpQuantifier::QuantifierType type);
   RegExpTree* ToRegExp();
 
  private:
+  static const uc16 kNoPendingSurrogate = 0;
+  void AddLeadSurrogate(uc16 lead_surrogate);
+  void AddTrailSurrogate(uc16 trail_surrogate);
+  void FlushPendingSurrogate();
   void FlushCharacters();
   void FlushText();
   void FlushTerms();
+  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
+  bool NeedsDesugaringForIgnoreCase(uc32 c);
   Zone* zone() const { return zone_; }
+  bool ignore_case() const { return ignore_case_; }
+  bool unicode() const { return unicode_; }
 
   Zone* zone_;
   bool pending_empty_;
+  bool ignore_case_;
+  bool unicode_;
   ZoneList<uc16>* characters_;
+  uc16 pending_surrogate_;
   BufferedZoneList<RegExpTree, 2> terms_;
   BufferedZoneList<RegExpTree, 2> text_;
   BufferedZoneList<RegExpTree, 2> alternatives_;
@@ -135,12 +150,11 @@
 
 class RegExpParser BASE_EMBEDDED {
  public:
-  RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
-               bool unicode, Isolate* isolate, Zone* zone);
+  RegExpParser(FlatStringReader* in, Handle<String>* error,
+               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
 
   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
-                          bool multiline, bool unicode,
-                          RegExpCompileData* result);
+                          JSRegExp::Flags flags, RegExpCompileData* result);
 
   RegExpTree* ParsePattern();
   RegExpTree* ParseDisjunction();
@@ -160,6 +174,7 @@
   bool ParseHexEscape(int length, uc32* value);
   bool ParseUnicodeEscape(uc32* value);
   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
+  ZoneList<CharacterRange>* ParsePropertyClass();
 
   uc32 ParseOctalLiteral();
 
@@ -183,8 +198,11 @@
   int captures_started() { return captures_started_; }
   int position() { return next_pos_ - 1; }
   bool failed() { return failed_; }
+  bool ignore_case() const { return ignore_case_; }
+  bool multiline() const { return multiline_; }
+  bool unicode() const { return unicode_; }
 
-  static bool IsSyntaxCharacter(uc32 c);
+  static bool IsSyntaxCharacterOrSlash(uc32 c);
 
   static const int kMaxCaptures = 1 << 16;
   static const uc32 kEndMarker = (1 << 21);
@@ -203,9 +221,10 @@
     RegExpParserState(RegExpParserState* previous_state,
                       SubexpressionType group_type,
                       RegExpLookaround::Type lookaround_type,
-                      int disjunction_capture_index, Zone* zone)
+                      int disjunction_capture_index, bool ignore_case,
+                      bool unicode, Zone* zone)
         : previous_state_(previous_state),
-          builder_(new (zone) RegExpBuilder(zone)),
+          builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
           group_type_(group_type),
           lookaround_type_(lookaround_type),
           disjunction_capture_index_(disjunction_capture_index) {}
@@ -249,6 +268,8 @@
   bool has_more() { return has_more_; }
   bool has_next() { return next_pos_ < in()->length(); }
   uc32 Next();
+  template <bool update_position>
+  uc32 ReadNext();
   FlatStringReader* in() { return in_; }
   void ScanForCaptures();
 
@@ -258,13 +279,14 @@
   ZoneList<RegExpCapture*>* captures_;
   FlatStringReader* in_;
   uc32 current_;
+  bool ignore_case_;
+  bool multiline_;
+  bool unicode_;
   int next_pos_;
   int captures_started_;
   // The capture count is only valid after we have scanned for captures.
   int capture_count_;
   bool has_more_;
-  bool multiline_;
-  bool unicode_;
   bool simple_;
   bool contains_anchor_;
   bool is_scanned_for_captures_;

diff --git a/src/regexp/x64/regexp-macro-assembler-x64.cc b/src/regexp/x64/regexp-macro-assembler-x64.cc
index 286f159..952034f 100644
--- a/src/regexp/x64/regexp-macro-assembler-x64.cc
+++ b/src/regexp/x64/regexp-macro-assembler-x64.cc

@@ -203,7 +203,7 @@
 
 
 void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   ReadPositionFromRegister(rdx, start_reg);  // Offset of start of capture
   ReadPositionFromRegister(rbx, start_reg + 1);  // Offset of end of capture
@@ -308,8 +308,10 @@
     //   Address byte_offset1 - Address captured substring's start.
     //   Address byte_offset2 - Address of current character position.
     //   size_t byte_length - length of capture in bytes(!)
-    //   Isolate* isolate
+//   Isolate* isolate or 0 if unicode flag.
 #ifdef _WIN64
+    DCHECK(rcx.is(arg_reg_1));
+    DCHECK(rdx.is(arg_reg_2));
     // Compute and set byte_offset1 (start of capture).
     __ leap(rcx, Operand(rsi, rdx, times_1, 0));
     // Set byte_offset2.
@@ -317,11 +319,9 @@
     if (read_backward) {
       __ subq(rdx, rbx);
     }
-    // Set byte_length.
-    __ movp(r8, rbx);
-    // Isolate.
-    __ LoadAddress(r9, ExternalReference::isolate_address(isolate()));
 #else  // AMD64 calling convention
+    DCHECK(rdi.is(arg_reg_1));
+    DCHECK(rsi.is(arg_reg_2));
     // Compute byte_offset2 (current position = rsi+rdi).
     __ leap(rax, Operand(rsi, rdi, times_1, 0));
     // Compute and set byte_offset1 (start of capture).
@@ -331,11 +331,19 @@
     if (read_backward) {
       __ subq(rsi, rbx);
     }
+#endif  // _WIN64
+
     // Set byte_length.
-    __ movp(rdx, rbx);
+    __ movp(arg_reg_3, rbx);
     // Isolate.
-    __ LoadAddress(rcx, ExternalReference::isolate_address(isolate()));
-#endif
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ movp(arg_reg_4, Immediate(0));
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
+    }
 
     { // NOLINT: Can't find a way to open this scope without confusing the
       // linter.
@@ -869,11 +877,14 @@
         __ testp(rdi, rdi);
         __ j(zero, &exit_label_, Label::kNear);
         // Advance current position after a zero-length match.
+        Label advance;
+        __ bind(&advance);
         if (mode_ == UC16) {
           __ addq(rdi, Immediate(2));
         } else {
           __ incq(rdi);
         }
+        if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
       }
 
       __ jmp(&load_char_start_regexp);

diff --git a/src/regexp/x64/regexp-macro-assembler-x64.h b/src/regexp/x64/regexp-macro-assembler-x64.h
index 2578047..4c37771 100644
--- a/src/regexp/x64/regexp-macro-assembler-x64.h
+++ b/src/regexp/x64/regexp-macro-assembler-x64.h

@@ -38,7 +38,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,

diff --git a/src/regexp/x87/regexp-macro-assembler-x87.cc b/src/regexp/x87/regexp-macro-assembler-x87.cc
index 01d0b24..6e62092 100644
--- a/src/regexp/x87/regexp-macro-assembler-x87.cc
+++ b/src/regexp/x87/regexp-macro-assembler-x87.cc

@@ -187,9 +187,8 @@
   __ bind(&fallthrough);
 }
 
-
 void RegExpMacroAssemblerX87::CheckNotBackReferenceIgnoreCase(
-    int start_reg, bool read_backward, Label* on_no_match) {
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
   Label fallthrough;
   __ mov(edx, register_location(start_reg));  // Index of start of capture
   __ mov(ebx, register_location(start_reg + 1));  // Index of end of capture
@@ -296,11 +295,18 @@
     //   Address byte_offset1 - Address captured substring's start.
     //   Address byte_offset2 - Address of current character position.
     //   size_t byte_length - length of capture in bytes(!)
-    //   Isolate* isolate
+//   Isolate* isolate or 0 if unicode flag.
 
     // Set isolate.
-    __ mov(Operand(esp, 3 * kPointerSize),
-           Immediate(ExternalReference::isolate_address(isolate())));
+#ifdef V8_I18N_SUPPORT
+    if (unicode) {
+      __ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
+    } else  // NOLINT
+#endif      // V8_I18N_SUPPORT
+    {
+      __ mov(Operand(esp, 3 * kPointerSize),
+             Immediate(ExternalReference::isolate_address(isolate())));
+    }
     // Set byte_length.
     __ mov(Operand(esp, 2 * kPointerSize), ebx);
     // Set byte_offset2.
@@ -822,13 +828,15 @@
         __ test(edi, edi);
         __ j(zero, &exit_label_, Label::kNear);
         // Advance current position after a zero-length match.
+        Label advance;
+        __ bind(&advance);
         if (mode_ == UC16) {
           __ add(edi, Immediate(2));
         } else {
           __ inc(edi);
         }
+        if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
       }
-
       __ jmp(&load_char_start_regexp);
     } else {
       __ mov(eax, Immediate(SUCCESS));

diff --git a/src/regexp/x87/regexp-macro-assembler-x87.h b/src/regexp/x87/regexp-macro-assembler-x87.h
index c955412..2f68961 100644
--- a/src/regexp/x87/regexp-macro-assembler-x87.h
+++ b/src/regexp/x87/regexp-macro-assembler-x87.h

@@ -37,7 +37,7 @@
   virtual void CheckNotBackReference(int start_reg, bool read_backward,
                                      Label* on_no_match);
   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
-                                               bool read_backward,
+                                               bool read_backward, bool unicode,
                                                Label* on_no_match);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,
commit	097c5b25f8b57b92e6c94f25383a883ff242344f	[log] [tgz]
author	Ben Murdoch <benm@google.com>	Wed May 18 11:27:45 2016 +0100
committer	Dirk Vogt <dirk@fairphone.com>	Fri Mar 17 16:05:59 2017 +0100
tree	79bf5b2d1246e60291de69b04c80eef46ff43ee1
parent	3942d2ab9eee35cb86018242521effc970f4e902 [diff]