Merge r3236, r3243, r3249 and r3258 from bleeding_edge to trunk.

This fixes issue 486 (incorrect handling of cyrillic characters).

Review URL: http://codereview.chromium.org/389001

git-svn-id: http://v8.googlecode.com/svn/trunk@3268 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
diff --git a/test/mjsunit/cyrillic.js b/test/mjsunit/cyrillic.js
new file mode 100644
index 0000000..13775b0
--- /dev/null
+++ b/test/mjsunit/cyrillic.js
@@ -0,0 +1,208 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Test Unicode character ranges in regexps.
+
+
+// Cyrillic.
+var cyrillic = {
+  FIRST: "\u0410",   // A
+  first: "\u0430",   // a
+  LAST: "\u042f",    // YA
+  last: "\u044f",    // ya
+  MIDDLE: "\u0427",  // CHE
+  middle: "\u0447",   // che
+  // Actually no characters are between the cases in Cyrillic.
+  BetweenCases: false};
+
+var SIGMA = "\u03a3";
+var sigma = "\u03c3";
+var alternative_sigma = "\u03c2";
+
+// Greek.
+var greek = {
+  FIRST: "\u0391",     // ALPHA
+  first: "\u03b1",     // alpha
+  LAST: "\u03a9",      // OMEGA
+  last: "\u03c9",      // omega
+  MIDDLE: SIGMA,       // SIGMA
+  middle: sigma,       // sigma
+  // Epsilon acute is between ALPHA-OMEGA and alpha-omega, i.e. it
+  // is between OMEGA and alpha.
+  BetweenCases: "\u03ad"};
+
+
+function Range(from, to, flags) {  // Builds a regexp for the class [from-to].
+  return new RegExp("[" + from + "-" + to + "]", flags);  // flags may be omitted.
+}
+
+// Test Cyrillic and Greek separately.
+for (var lang = 0; lang < 2; lang++) {
+  var chars = (lang == 0) ? cyrillic : greek;
+
+  for (var i = 0; i < 2; i++) {
+    var lc = (i == 0);  // Lower case.
+    var first = lc ? chars.first : chars.FIRST;
+    var middle = lc ? chars.middle : chars.MIDDLE;
+    var last = lc ? chars.last : chars.LAST;
+    var first_other_case = lc ? chars.FIRST : chars.first;
+    var middle_other_case = lc ? chars.MIDDLE : chars.middle;
+    var last_other_case = lc ? chars.LAST : chars.last;
+
+    assertTrue(Range(first, last).test(first), 1);
+    assertTrue(Range(first, last).test(middle), 2);
+    assertTrue(Range(first, last).test(last), 3);
+
+    assertFalse(Range(first, last).test(first_other_case), 4);
+    assertFalse(Range(first, last).test(middle_other_case), 5);
+    assertFalse(Range(first, last).test(last_other_case), 6);
+
+    assertTrue(Range(first, last, "i").test(first), 7);
+    assertTrue(Range(first, last, "i").test(middle), 8);
+    assertTrue(Range(first, last, "i").test(last), 9);
+
+    assertTrue(Range(first, last, "i").test(first_other_case), 10);
+    assertTrue(Range(first, last, "i").test(middle_other_case), 11);
+    assertTrue(Range(first, last, "i").test(last_other_case), 12);
+
+    if (chars.BetweenCases) {
+      assertFalse(Range(first, last).test(chars.BetweenCases), 13);
+      assertFalse(Range(first, last, "i").test(chars.BetweenCases), 14);
+    }
+  }
+  if (chars.BetweenCases) {
+    assertTrue(Range(chars.FIRST, chars.last).test(chars.BetweenCases), 15);
+    assertTrue(Range(chars.FIRST, chars.last, "i").test(chars.BetweenCases), 16);
+  }
+}
+
+// Test a range that covers both Greek and Cyrillic characters.
+for (var key in greek) {  // 'var' keeps key from being an implicit global.
+  assertTrue(Range(greek.FIRST, cyrillic.last).test(greek[key]), 17 + key);
+  if (cyrillic[key]) {  // Skips cyrillic.BetweenCases, which is false.
+    assertTrue(Range(greek.FIRST, cyrillic.last).test(cyrillic[key]), 18 + key);
+  }
+}
+
+for (var i = 0; i < 2; i++) {
+  var ignore_case = (i == 0);
+  var flag = ignore_case ? "i" : "";
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.first), 19);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.middle), 20);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.last), 21);
+
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.FIRST), 22);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.MIDDLE), 23);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.LAST), 24);
+
+  // A range that covers the lower case greek letters and the upper case cyrillic
+  // letters.
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.FIRST), 25);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.MIDDLE), 26);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.LAST), 27);
+
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.first), 28);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.middle), 29);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.last), 30);
+}
+
+
+// Sigma is special because there are two lower case versions of the same upper
+// case character.  JS requires that case independence means that you should
+// convert everything to upper case, so the two sigma variants are equal to each
+// other in a case independent comparison.
+for (var i = 0; i < 2; i++) {
+  var simple = (i != 0);
+  var name = simple ? "" : "[]";
+  var regex = simple ? SIGMA : "[" + SIGMA + "]";
+
+  assertFalse(new RegExp(regex).test(sigma), 31 + name);
+  assertFalse(new RegExp(regex).test(alternative_sigma), 32 + name);
+  assertTrue(new RegExp(regex).test(SIGMA), 33 + name);
+
+  assertTrue(new RegExp(regex, "i").test(sigma), 34 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 35 + name);
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 36 + name);
+
+  regex = simple ? sigma : "[" + sigma + "]";
+
+  assertTrue(new RegExp(regex).test(sigma), 41 + name);
+  assertFalse(new RegExp(regex).test(alternative_sigma), 42 + name);
+  assertFalse(new RegExp(regex).test(SIGMA), 43 + name);
+
+  assertTrue(new RegExp(regex, "i").test(sigma), 44 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 45 + name);
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 46 + name);
+
+  regex = simple ? alternative_sigma : "[" + alternative_sigma + "]";
+
+  assertFalse(new RegExp(regex).test(sigma), 51 + name);
+  assertTrue(new RegExp(regex).test(alternative_sigma), 52 + name);
+  assertFalse(new RegExp(regex).test(SIGMA), 53 + name);
+
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(sigma), 54 + name);
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 55 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 56 + name);
+}
+
+
+// Test all non-ASCII characters individually to ensure that our optimizations
+// didn't break anything.
+for (var i = 0x80; i <= 0xfffe; i++) {
+  var c = String.fromCharCode(i);
+  var c2 = String.fromCharCode(i + 1);
+  var re = new RegExp("[" + c + "-" + c2 + "]", "i");
+  assertTrue(re.test(c), 57);
+}
+
+for (var add_non_ascii_character_to_subject = 0;
+     add_non_ascii_character_to_subject < 2;
+     add_non_ascii_character_to_subject++) {
+  var suffix = add_non_ascii_character_to_subject ? "\ufffe" : "";
+  // A range that covers both ASCII and non-ASCII.
+  for (var i = 0; i < 2; i++) {
+    var full = (i != 0);  // full: range runs to U+FFFF; else to Cyrillic YA.
+    var mixed = full ? "[a-\uffff]" : "[a-" + cyrillic.LAST + "]";
+    var f = full ? "f" : "c";  // Tag appended to the assertion messages.
+    for (var j = 0; j < 2; j++) {
+      var ignore_case = (j == 0);
+      var flag = ignore_case ? "i" : "";
+      var re = new RegExp(mixed, flag);
+      // "A" matches only with /i, or when the full range hits the suffix.
+      var expected = ignore_case || (full && suffix != "");
+      assertEquals(expected, re.test("A" + suffix), 58 + flag + f);
+      assertTrue(re.test("a" + suffix), 59 + flag + f);
+      assertTrue(re.test("~" + suffix), 60 + flag + f);
+      assertTrue(re.test(cyrillic.MIDDLE), 61 + flag + f);
+      assertEquals(ignore_case || full, re.test(cyrillic.middle), 62 + flag + f);
+    }
+  }
+}
diff --git a/test/mjsunit/mjsunit.status b/test/mjsunit/mjsunit.status
index 15f62b0..7995a82 100644
--- a/test/mjsunit/mjsunit.status
+++ b/test/mjsunit/mjsunit.status
@@ -39,6 +39,9 @@
 # Issue 488: this test sometimes times out.
 array-constructor: PASS || TIMEOUT
 
+# Issue 499
+cyrillic: PASS, TIMEOUT if ($arch == arm)
+
 [ $arch == arm ]
 
 # Slow tests which times out in debug mode.
diff --git a/test/mjsunit/regress/regress-486.js b/test/mjsunit/regress/regress-486.js
new file mode 100644
index 0000000..c1e29a6
--- /dev/null
+++ b/test/mjsunit/regress/regress-486.js
@@ -0,0 +1,30 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+var st = "\u0422\u0435\u0441\u0442";  // Test in Cyrillic characters.
+var cyrillicMatch = /^[\u0430-\u044fa-z]+$/i.test(st);  // a-ja a-z.
+assertTrue(cyrillicMatch);