blob: 9e85e0a9156815ac70788069d59f880fe24dad08 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test.support import run_unittest, open_urlresource
Guido van Rossumd8faa362007-04-27 19:54:29 +00002import unittest
3
Martin v. Löwis677bde22002-11-23 22:08:15 +00004import sys
Tim Peters1b445d32002-11-24 18:53:11 +00005import os
Martin v. Löwisbb417dc2008-09-10 21:15:32 +00006from unicodedata import normalize, unidata_version
Tim Peters1b445d32002-11-24 18:53:11 +00007
# Base name of the normalization conformance data file.
TESTDATAFILE = "NormalizationTest.txt"
# Download URL for the data file matching the Unicode version of the
# unicodedata module in use.
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# A copy of the data file left in the current directory is only usable if its
# first line mentions the Unicode version of this build; otherwise remove it
# (presumably so that a matching copy gets fetched later -- the fetching
# itself happens outside this block).
if os.path.exists(TESTDATAFILE):
    # The context manager guarantees the handle is closed even if reading fails.
    with open(TESTDATAFILE, encoding='utf-8') as f:
        first_line = f.readline()
    if unidata_version not in first_line:
        os.unlink(TESTDATAFILE)
Martin v. Löwis677bde22002-11-23 22:08:15 +000017
class RangeError(Exception):
    """Raised by unistr() when a code point is greater than sys.maxunicode."""
20
def NFC(str):
    """Return *str* converted to Unicode normalization form NFC."""
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # the function's signature stays unchanged for existing callers.
    return normalize("NFC", str)
23
def NFKC(str):
    """Return *str* converted to Unicode normalization form NFKC."""
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # the function's signature stays unchanged for existing callers.
    return normalize("NFKC", str)
26
def NFD(str):
    """Return *str* converted to Unicode normalization form NFD."""
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # the function's signature stays unchanged for existing callers.
    return normalize("NFD", str)
29
def NFKD(str):
    """Return *str* converted to Unicode normalization form NFKD."""
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # the function's signature stays unchanged for existing callers.
    return normalize("NFKD", str)
32
def unistr(data):
    """Convert space-separated hexadecimal code points into a string.

    data -- text such as "0041 030A"; fields are separated by single spaces.

    Returns the string built from the corresponding characters.
    Raises RangeError if any code point is greater than sys.maxunicode.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    # Validate the whole sequence before building anything, so an
    # out-of-range value is always reported as RangeError.
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return "".join(map(chr, codepoints))
Martin v. Löwis677bde22002-11-23 22:08:15 +000039
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize() against the NormalizationTest.txt data."""

    def test_main(self):
        # Each data line holds ';'-separated columns c1..c5 (each a run of
        # space-separated hex code points, see unistr()) plus a trailing
        # field that is dropped.  Text after '#' is a comment, and lines
        # starting with "@Part" switch the current section.

        # Pre-set so that a data line appearing before any "@Part" header
        # cannot trigger a NameError in the comparisons below.
        part = None
        # Source strings (c1) seen in @Part1; every code point NOT recorded
        # here is required to be unchanged by all four normalization forms.
        part1_data = set()

        # The context manager closes the data file even when a check fails.
        with open_urlresource(TESTDATAURL, encoding="utf-8") as testdata:
            for line in testdata:
                # Strip the trailing comment (if any) and outer whitespace.
                line = line.partition('#')[0].strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests for now
                    continue
                try:
                    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        # Check for bug 834676: the call itself is the test, it only has to
        # complete without raising.
        normalize('NFC', '\ud55c\uae00')
Tim Peters1b445d32002-11-24 18:53:11 +000095
Tim Peters1b445d32002-11-24 18:53:11 +000096
def test_main():
    """Check that the data file can be opened, then run NormalizationTest."""
    # Hit the exception early.  Only the attempt to open matters here, so
    # the returned handle is closed immediately instead of being leaked.
    open_urlresource(TESTDATAURL).close()
    run_unittest(NormalizationTest)
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000101
# Run the tests when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()