Benjamin Peterson | ee8712c | 2008-05-20 21:35:26 +0000 | [diff] [blame] | 1 | from test.support import run_unittest, open_urlresource |
Guido van Rossum | d8faa36 | 2007-04-27 19:54:29 +0000 | [diff] [blame] | 2 | import unittest |
| 3 | |
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 4 | import sys |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 5 | import os |
Martin v. Löwis | bb417dc | 2008-09-10 21:15:32 +0000 | [diff] [blame] | 6 | from unicodedata import normalize, unidata_version |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 7 | |
# Name of the Unicode normalization test data file; the relative name means
# it is looked up in the current working directory.
TESTDATAFILE = "NormalizationTest.txt"
# URL of that file for the Unicode version reported by unicodedata.
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# Remove a local copy whose first line does not mention the current
# unidata_version (presumably a file left over from another Unicode version).
if os.path.exists(TESTDATAFILE):
    # The with-statement closes the file even if reading the line fails.
    with open(TESTDATAFILE, encoding='utf-8') as f:
        l = f.readline()
    if unidata_version not in l:
        os.unlink(TESTDATAFILE)
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 17 | |
class RangeError(Exception):
    """Signals a code point greater than sys.maxunicode (raised by unistr)."""
| 20 | |
def NFC(str):
    """Return *str* passed through unicodedata.normalize with form "NFC"."""
    form = "NFC"
    return normalize(form, str)
| 23 | |
def NFKC(str):
    """Return *str* passed through unicodedata.normalize with form "NFKC"."""
    form = "NFKC"
    return normalize(form, str)
| 26 | |
def NFD(str):
    """Return *str* passed through unicodedata.normalize with form "NFD"."""
    form = "NFD"
    return normalize(form, str)
| 29 | |
def NFKD(str):
    """Return *str* passed through unicodedata.normalize with form "NFKD"."""
    form = "NFKD"
    return normalize(form, str)
| 32 | |
def unistr(data):
    """Build a string from space-separated hexadecimal code points.

    *data* is split on single spaces and every field is parsed with
    int(field, 16).  If any parsed value is greater than sys.maxunicode,
    RangeError is raised; otherwise the characters for the code points are
    joined, in order, into the returned string.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    # All fields are parsed before the range check, so a malformed field
    # fails in int() before any RangeError can be raised.
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return "".join(map(chr, codepoints))
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 39 | |
class NormalizationTest(unittest.TestCase):
    """Tests of unicodedata.normalize driven by the NormalizationTest.txt data."""

    def test_main(self):
        """Check every data line of the test file, then all other characters.

        Each data line holds ';'-separated fields; the element after the
        last ';' is discarded and the remaining five are decoded with
        unistr() into c1..c5.  The NFC/NFD/NFKC/NFKD results for those
        strings are compared as spelled out in the assertions below.  The c1
        value of every line seen while the current section is "@Part1" is
        recorded, and afterwards every code point not recorded that way must
        be left unchanged by all four normalization forms.
        """
        part1_data = set()
        # No "@Part" header seen yet; initialising this keeps the comparisons
        # below from raising NameError if a data line precedes the first header.
        part = None
        # The with-statement closes the data file once all lines are read.
        with open_urlresource(TESTDATAURL, encoding="utf-8") as testdata:
            for line in testdata:
                if '#' in line:
                    line = line.split('#')[0]
                line = line.strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests for now
                    continue
                try:
                    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            # c1 itself is out of range: nothing to record.
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Call normalize on two Hangul syllables; the call must not raise."""
        # Check for bug 834676
        normalize('NFC', '\ud55c\uae00')
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 95 | |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 96 | |
def test_main():
    """Run NormalizationTest after first checking the test data can be opened."""
    # Hit the exception early: open the resource once before running the
    # tests.  The handle is only needed for that check, so close it right
    # away instead of leaving it open.
    open_urlresource(TESTDATAURL).close()
    run_unittest(NormalizationTest)
Martin v. Löwis | d2171d2 | 2003-11-06 20:47:57 +0000 | [diff] [blame] | 101 | |
if __name__ == "__main__":
    # Executed as a script rather than imported: run the tests directly.
    test_main()