Zachary Ware | 38c707e | 2015-04-13 15:00:43 -0500 | [diff] [blame] | 1 | from test.support import open_urlresource |
Guido van Rossum | d8faa36 | 2007-04-27 19:54:29 +0000 | [diff] [blame] | 2 | import unittest |
| 3 | |
Antoine Pitrou | 1a305fd | 2009-11-26 12:38:23 +0000 | [diff] [blame] | 4 | from http.client import HTTPException |
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 5 | import sys |
Max Bélanger | 2810dd7 | 2018-11-04 15:58:24 -0800 | [diff] [blame] | 6 | from unicodedata import normalize, is_normalized, unidata_version |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 7 | |
# Name of the conformance data file, and the URL it is fetched from.  The URL
# embeds unicodedata.unidata_version so the data matches this interpreter's
# Unicode database.
TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = f"http://www.pythontest.net/unicode/{unidata_version}/{TESTDATAFILE}"
Martin v. Löwis | bb417dc | 2008-09-10 21:15:32 +0000 | [diff] [blame] | 10 | |
def check_version(testfile):
    """Return True if the first line of *testfile* contains unidata_version.

    Exactly one line is consumed from *testfile* via readline().
    """
    return unidata_version in testfile.readline()
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 14 | |
class RangeError(Exception):
    """Raised by unistr() when a code point is greater than sys.maxunicode."""
| 17 | |
def NFC(str):
    """Return *str* normalized to form NFC."""
    # The parameter name shadows the builtin; it is kept so the call
    # signature stays unchanged.
    form = "NFC"
    return normalize(form, str)
| 20 | |
def NFKC(str):
    """Return *str* normalized to form NFKC."""
    # The parameter name shadows the builtin; it is kept so the call
    # signature stays unchanged.
    form = "NFKC"
    return normalize(form, str)
| 23 | |
def NFD(str):
    """Return *str* normalized to form NFD."""
    # The parameter name shadows the builtin; it is kept so the call
    # signature stays unchanged.
    form = "NFD"
    return normalize(form, str)
| 26 | |
def NFKD(str):
    """Return *str* normalized to form NFKD."""
    # The parameter name shadows the builtin; it is kept so the call
    # signature stays unchanged.
    form = "NFKD"
    return normalize(form, str)
| 29 | |
def unistr(data):
    """Decode a string of space-separated hexadecimal code points.

    Returns the string made of the corresponding characters.  Raises
    RangeError if any code point is greater than sys.maxunicode.
    """
    # Convert every token before the range check so that a malformed token
    # is reported by int() ahead of any RangeError.
    code_points = [int(token, 16) for token in data.split(" ")]
    if any(cp > sys.maxunicode for cp in code_points):
        raise RangeError
    return "".join(map(chr, code_points))
Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 36 | |
class NormalizationTest(unittest.TestCase):
    """Check normalize() and is_normalized() against NormalizationTest.txt."""

    def test_main(self):
        """Fetch the data file from TESTDATAURL and run the checks on it.

        The test is skipped if fetching raises PermissionError and fails
        if it raises any other OSError or an HTTPException.
        """
        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException):
            self.fail(f"Could not retrieve {TESTDATAURL}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Run the conformance checks over the lines of *testdata*.

        Each data line must hold five ';'-terminated columns c1..c5 of
        space-separated hexadecimal code points.  Text after '#' is
        ignored, and a line starting with "@Part" selects the current part.
        The checks require c2/c3/c4/c5 to be the NFC/NFD/NFKC/NFKD forms of
        c1.  Afterwards every code point that did not appear as c1 in
        @Part1 must be left unchanged by all four normalization forms.
        """
        part = None
        # Strings seen in column c1 while inside @Part1.
        part1_data = set()

        for line in testdata:
            # Drop any trailing comment, then surrounding whitespace.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            try:
                # The last split field follows the final ';' and is dropped.
                c1, c2, c3, c4, c5 = [unistr(x)
                                      for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        pass
                    else:
                        part1_data.add(c1)
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Columns that are already in a given form must be reported as
            # normalized.  The data line is passed as the failure message
            # so a failing case can be located, as in the checks above.
            already_normalized = (
                ("NFC", c2),
                ("NFC", c4),
                ("NFD", c3),
                ("NFD", c5),
                ("NFKC", c4),
                ("NFKD", c5),
            )
            for form, text in already_normalized:
                self.assertTrue(is_normalized(form, text), line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Normalizing this two-character string must not raise."""
        # Check for bug 834676
        normalize('NFC', '\ud55c\uae00')
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 114 | |
Tim Peters | 1b445d3 | 2002-11-24 18:53:11 +0000 | [diff] [blame] | 115 | |
# Run the tests when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()