blob: 68a3219560e11d3d6e4cc1897ded45bca37f916a [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Ezio Melotti931b8aa2011-10-21 21:57:36 +030011import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support
Ezio Melotti931b8aa2011-10-21 21:57:36 +030014from http.client import HTTPException
15from test.test_normalization import check_version
Fredrik Lundhee865c62001-01-19 11:00:42 +000016
Walter Dörwald37c47282003-02-26 14:49:41 +000017class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000018
Walter Dörwald37c47282003-02-26 14:49:41 +000019 def checkletter(self, name, code):
20 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000021 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000022 # chokes on \N escapes
Guido van Rossumbf4806b2007-07-21 00:15:34 +000023 res = eval(r'"\N{%s}"' % name)
Walter Dörwald37c47282003-02-26 14:49:41 +000024 self.assertEqual(res, code)
25 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000026
Walter Dörwald37c47282003-02-26 14:49:41 +000027 def test_general(self):
28 # General and case insensitivity test:
29 chars = [
30 "LATIN CAPITAL LETTER T",
31 "LATIN SMALL LETTER H",
32 "LATIN SMALL LETTER E",
33 "SPACE",
34 "LATIN SMALL LETTER R",
35 "LATIN CAPITAL LETTER E",
36 "LATIN SMALL LETTER D",
37 "SPACE",
38 "LATIN SMALL LETTER f",
39 "LATIN CAPITAL LeTtEr o",
40 "LATIN SMaLl LETTER x",
41 "SPACE",
42 "LATIN SMALL LETTER A",
43 "LATIN SMALL LETTER T",
44 "LATIN SMALL LETTER E",
45 "SPACE",
46 "LATIN SMALL LETTER T",
47 "LATIN SMALL LETTER H",
48 "LATIN SMALL LETTER E",
49 "SpAcE",
50 "LATIN SMALL LETTER S",
51 "LATIN SMALL LETTER H",
52 "LATIN small LETTER e",
53 "LATIN small LETTER e",
54 "LATIN SMALL LETTER P",
55 "FULL STOP"
56 ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +000057 string = "The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000058
Walter Dörwald37c47282003-02-26 14:49:41 +000059 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +000060 "".join([self.checkletter(*args) for args in zip(chars, string)]),
Walter Dörwald37c47282003-02-26 14:49:41 +000061 string
62 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000063
Walter Dörwald37c47282003-02-26 14:49:41 +000064 def test_ascii_letters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +000065 for char in "".join(map(chr, range(ord("a"), ord("z")))):
Walter Dörwald37c47282003-02-26 14:49:41 +000066 name = "LATIN SMALL LETTER %s" % char.upper()
67 code = unicodedata.lookup(name)
68 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000069
Walter Dörwald37c47282003-02-26 14:49:41 +000070 def test_hangul_syllables(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000071 self.checkletter("HANGUL SYLLABLE GA", "\uac00")
72 self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
73 self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
74 self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
75 self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
76 self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
77 self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
78 self.checkletter("HANGUL SYLLABLE YI", "\uc758")
79 self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
80 self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
81 self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
82 self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
83 self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000084
Guido van Rossumef87d6e2007-05-02 19:09:54 +000085 self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000086
Walter Dörwald37c47282003-02-26 14:49:41 +000087 def test_cjk_unified_ideographs(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000088 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
89 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
90 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000091 self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
Guido van Rossumef87d6e2007-05-02 19:09:54 +000092 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
93 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000094 self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
95 self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
96 self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
97 self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000098
Walter Dörwald37c47282003-02-26 14:49:41 +000099 def test_bmp_characters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +0000100 for code in range(0x10000):
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000101 char = chr(code)
Walter Dörwald37c47282003-02-26 14:49:41 +0000102 name = unicodedata.name(char, None)
103 if name is not None:
104 self.assertEqual(unicodedata.lookup(name), char)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000105
Walter Dörwald37c47282003-02-26 14:49:41 +0000106 def test_misc_symbols(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000107 self.checkletter("PILCROW SIGN", "\u00b6")
108 self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
109 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
110 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
Walter Dörwald37c47282003-02-26 14:49:41 +0000111
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300112 def test_aliases(self):
113 # Check that the aliases defined in the NameAliases.txt file work.
114 # This should be updated when new aliases are added or the file
115 # should be downloaded and parsed instead. See #12753.
116 aliases = [
117 ('LATIN CAPITAL LETTER GHA', 0x01A2),
118 ('LATIN SMALL LETTER GHA', 0x01A3),
119 ('KANNADA LETTER LLLA', 0x0CDE),
120 ('LAO LETTER FO FON', 0x0E9D),
121 ('LAO LETTER FO FAY', 0x0E9F),
122 ('LAO LETTER RO', 0x0EA3),
123 ('LAO LETTER LO', 0x0EA5),
124 ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
125 ('YI SYLLABLE ITERATION MARK', 0xA015),
126 ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
127 ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
128 ]
129 for alias, codepoint in aliases:
130 self.checkletter(alias, chr(codepoint))
131 name = unicodedata.name(chr(codepoint))
132 self.assertNotEqual(name, alias)
133 self.assertEqual(unicodedata.lookup(alias),
134 unicodedata.lookup(name))
135 with self.assertRaises(KeyError):
136 unicodedata.ucd_3_2_0.lookup(alias)
137
138 def test_aliases_names_in_pua_range(self):
139 # We are storing aliases in the PUA 15, but their names shouldn't leak
140 for cp in range(0xf0000, 0xf0100):
141 with self.assertRaises(ValueError) as cm:
142 unicodedata.name(chr(cp))
143 self.assertEqual(str(cm.exception), 'no such name')
144
145 def test_named_sequences_names_in_pua_range(self):
146 # We are storing named seq in the PUA 15, but their names shouldn't leak
147 for cp in range(0xf0100, 0xf0fff):
148 with self.assertRaises(ValueError) as cm:
149 unicodedata.name(chr(cp))
150 self.assertEqual(str(cm.exception), 'no such name')
151
152 def test_named_sequences_sample(self):
153 # Check a few named sequences. See #12753.
154 sequences = [
155 ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
156 ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
157 ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
158 ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
159 ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
160 ]
161 for seqname, codepoints in sequences:
162 self.assertEqual(unicodedata.lookup(seqname), codepoints)
163 with self.assertRaises(SyntaxError):
164 self.checkletter(seqname, None)
165 with self.assertRaises(KeyError):
166 unicodedata.ucd_3_2_0.lookup(seqname)
167
168 def test_named_sequences_full(self):
169 # Check all the named sequences
170 url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
171 unicodedata.unidata_version)
172 try:
173 testdata = support.open_urlresource(url, encoding="utf-8",
174 check=check_version)
175 except (IOError, HTTPException):
176 self.skipTest("Could not retrieve " + url)
177 self.addCleanup(testdata.close)
178 for line in testdata:
179 line = line.strip()
180 if not line or line.startswith('#'):
181 continue
182 seqname, codepoints = line.split(';')
183 codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
184 self.assertEqual(unicodedata.lookup(seqname), codepoints)
185 with self.assertRaises(SyntaxError):
186 self.checkletter(seqname, None)
187 with self.assertRaises(KeyError):
188 unicodedata.ucd_3_2_0.lookup(seqname)
189
Walter Dörwald37c47282003-02-26 14:49:41 +0000190 def test_errors(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000191 self.assertRaises(TypeError, unicodedata.name)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000192 self.assertRaises(TypeError, unicodedata.name, 'xx')
Walter Dörwald37c47282003-02-26 14:49:41 +0000193 self.assertRaises(TypeError, unicodedata.lookup)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000194 self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
Walter Dörwald37c47282003-02-26 14:49:41 +0000195
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000196 def test_strict_error_handling(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000197 # bogus character name
198 self.assertRaises(
199 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000200 str, b"\\N{blah}", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000201 )
202 # long bogus character name
203 self.assertRaises(
204 UnicodeError,
Guido van Rossum9c627722007-08-27 18:31:48 +0000205 str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000206 )
207 # missing closing brace
208 self.assertRaises(
209 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000210 str, b"\\N{SPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000211 )
212 # missing opening brace
213 self.assertRaises(
214 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000215 str, b"\\NSPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000216 )
217
def test_main():
    """Run all UnicodeNamesTest cases through test.support's unittest runner."""
    test_classes = (UnicodeNamesTest,)
    support.run_unittest(*test_classes)
Walter Dörwald37c47282003-02-26 14:49:41 +0000220
# Script entry point: run the suite only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    test_main()