Benjamin Peterson | ee8712c | 2008-05-20 21:35:26 +0000 | [diff] [blame] | 1 | from test.support import run_unittest, open_urlresource |
Guido van Rossum | d8faa36 | 2007-04-27 19:54:29 +0000 | [diff] [blame] | 2 | import unittest |
| 3 | |
Antoine Pitrou | 1a305fd | 2009-11-26 12:38:23 +0000 | [diff] [blame] | 4 | from http.client import HTTPException |
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 5 | import sys |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 6 | import os |
Martin v. Löwis | bb417dc | 2008-09-10 21:15:32 +0000 | [diff] [blame] | 7 | from unicodedata import normalize, unidata_version |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 8 | |
# Name of the conformance data file, and the URL it is fetched from.  The URL
# embeds the Unicode database version reported by the unicodedata module.
TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "".join(
    ("http://www.pythontest.net/unicode/", unidata_version, "/", TESTDATAFILE)
)
Martin v. Löwis | bb417dc | 2008-09-10 21:15:32 +0000 | [diff] [blame] | 11 | |
def check_version(testfile):
    """Tell whether the first line of *testfile* mentions ``unidata_version``.

    Exactly one line is read (and consumed) from *testfile*; the result is
    True when the version string occurs anywhere in that line.
    """
    return unidata_version in testfile.readline()
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 15 | |
class RangeError(Exception):
    """Raised by unistr() for a code point above ``sys.maxunicode``."""
| 18 | |
def NFC(str):
    """Return *str* converted to Unicode normalization form NFC."""
    form = "NFC"
    return normalize(form, str)
| 21 | |
def NFKC(str):
    """Return *str* converted to Unicode normalization form NFKC."""
    form = "NFKC"
    return normalize(form, str)
| 24 | |
def NFD(str):
    """Return *str* converted to Unicode normalization form NFD."""
    form = "NFD"
    return normalize(form, str)
| 27 | |
def NFKD(str):
    """Return *str* converted to Unicode normalization form NFKD."""
    form = "NFKD"
    return normalize(form, str)
| 30 | |
def unistr(data):
    """Build a string from space-separated hexadecimal code points.

    *data* is split on single spaces and every piece is parsed as a base-16
    integer.  RangeError is raised if any resulting value is greater than
    ``sys.maxunicode``; otherwise the characters are joined in order.
    """
    # Parse every token first so a malformed token is reported before any
    # range problem, then range-check all values before building the string.
    codepoints = [int(token, 16) for token in data.split(" ")]
    if any(codepoint > sys.maxunicode for codepoint in codepoints):
        raise RangeError
    return "".join(map(chr, codepoints))
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 37 | |
class NormalizationTest(unittest.TestCase):
    """Checks normalize() against the lines of the NormalizationTest data file."""

    def test_main(self):
        """Verify the normalization invariants for every data line.

        The data is opened with open_urlresource(); if that raises OSError
        or HTTPException the test is skipped.  Each data line holds the
        columns c1..c5, which are checked against NFC, NFD, NFKC and NFKD.
        Every code point whose c1 was not listed under "@Part1" must then
        be unchanged by all four forms.
        """
        current_part = None
        part1_chars = set()
        # Open the resource before anything else so that a retrieval
        # failure skips the test right away.
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + TESTDATAURL)
        self.addCleanup(testdata.close)

        for raw_line in testdata:
            # Discard a trailing comment, then surrounding whitespace.
            line = raw_line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                current_part = line.split()[0]
                continue

            in_part1 = current_part == "@Part1"
            fields = line.split(';')
            try:
                c1, c2, c3, c4, c5 = [unistr(field) for field in fields[:-1]]
            except RangeError:
                # Unsupported characters: skip the checks for this line,
                # but in part 1 still try to record c1 on its own.
                if in_part1:
                    try:
                        first = unistr(fields[0])
                    except RangeError:
                        pass
                    else:
                        part1_chars.add(first)
                continue

            # NFC and NFD expectations.
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            # Compatibility forms: every column maps to c4 (NFKC) / c5 (NFKD).
            self.assertTrue(
                c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
                line)
            self.assertTrue(
                c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
                line)

            # Remember which source characters part 1 covered.
            if in_part1:
                part1_chars.add(c1)

        # Any character not recorded from part 1 must be left unchanged
        # by every normalization form.
        for codepoint in range(sys.maxunicode + 1):
            char = chr(codepoint)
            if char not in part1_chars:
                self.assertTrue(
                    char == NFC(char) == NFD(char) == NFKC(char) == NFKD(char),
                    codepoint)

    def test_bug_834676(self):
        """Call normalize() on the input from bug 834676; it must not raise."""
        normalize('NFC', '\ud55c\uae00')
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 98 | |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 99 | |
def test_main():
    """Run the NormalizationTest case through run_unittest()."""
    run_unittest(NormalizationTest)
Martin v. Löwis | d2171d2 | 2003-11-06 20:47:57 +0000 | [diff] [blame] | 102 | |
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()