blob: 2e6374561facdd69739e4f90226c0a081e1bfd0c [file] [log] [blame]
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00001""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
Fredrik Lundh06d12682001-01-24 07:59:11 +00004Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00005
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Ezio Melotti931b8aa2011-10-21 21:57:36 +030011import unicodedata
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +020012import _testcapi
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000013
Benjamin Petersonee8712c2008-05-20 21:35:26 +000014from test import support
Ezio Melotti931b8aa2011-10-21 21:57:36 +030015from http.client import HTTPException
16from test.test_normalization import check_version
Fredrik Lundhee865c62001-01-19 11:00:42 +000017
Walter Dörwald37c47282003-02-26 14:49:41 +000018class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000019
Walter Dörwald37c47282003-02-26 14:49:41 +000020 def checkletter(self, name, code):
21 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000022 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000023 # chokes on \N escapes
Guido van Rossumbf4806b2007-07-21 00:15:34 +000024 res = eval(r'"\N{%s}"' % name)
Walter Dörwald37c47282003-02-26 14:49:41 +000025 self.assertEqual(res, code)
26 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000027
Walter Dörwald37c47282003-02-26 14:49:41 +000028 def test_general(self):
29 # General and case insensitivity test:
30 chars = [
31 "LATIN CAPITAL LETTER T",
32 "LATIN SMALL LETTER H",
33 "LATIN SMALL LETTER E",
34 "SPACE",
35 "LATIN SMALL LETTER R",
36 "LATIN CAPITAL LETTER E",
37 "LATIN SMALL LETTER D",
38 "SPACE",
39 "LATIN SMALL LETTER f",
40 "LATIN CAPITAL LeTtEr o",
41 "LATIN SMaLl LETTER x",
42 "SPACE",
43 "LATIN SMALL LETTER A",
44 "LATIN SMALL LETTER T",
45 "LATIN SMALL LETTER E",
46 "SPACE",
47 "LATIN SMALL LETTER T",
48 "LATIN SMALL LETTER H",
49 "LATIN SMALL LETTER E",
50 "SpAcE",
51 "LATIN SMALL LETTER S",
52 "LATIN SMALL LETTER H",
53 "LATIN small LETTER e",
54 "LATIN small LETTER e",
55 "LATIN SMALL LETTER P",
56 "FULL STOP"
57 ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +000058 string = "The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000059
Walter Dörwald37c47282003-02-26 14:49:41 +000060 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +000061 "".join([self.checkletter(*args) for args in zip(chars, string)]),
Walter Dörwald37c47282003-02-26 14:49:41 +000062 string
63 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000064
Walter Dörwald37c47282003-02-26 14:49:41 +000065 def test_ascii_letters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +000066 for char in "".join(map(chr, range(ord("a"), ord("z")))):
Walter Dörwald37c47282003-02-26 14:49:41 +000067 name = "LATIN SMALL LETTER %s" % char.upper()
68 code = unicodedata.lookup(name)
69 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000070
Walter Dörwald37c47282003-02-26 14:49:41 +000071 def test_hangul_syllables(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000072 self.checkletter("HANGUL SYLLABLE GA", "\uac00")
73 self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
74 self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
75 self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
76 self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
77 self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
78 self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
79 self.checkletter("HANGUL SYLLABLE YI", "\uc758")
80 self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
81 self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
82 self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
83 self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
84 self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000085
Guido van Rossumef87d6e2007-05-02 19:09:54 +000086 self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000087
Walter Dörwald37c47282003-02-26 14:49:41 +000088 def test_cjk_unified_ideographs(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000089 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
90 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
91 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000092 self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
Guido van Rossumef87d6e2007-05-02 19:09:54 +000093 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
94 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000095 self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
96 self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
97 self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
98 self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000099
Walter Dörwald37c47282003-02-26 14:49:41 +0000100 def test_bmp_characters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +0000101 for code in range(0x10000):
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000102 char = chr(code)
Walter Dörwald37c47282003-02-26 14:49:41 +0000103 name = unicodedata.name(char, None)
104 if name is not None:
105 self.assertEqual(unicodedata.lookup(name), char)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000106
Walter Dörwald37c47282003-02-26 14:49:41 +0000107 def test_misc_symbols(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000108 self.checkletter("PILCROW SIGN", "\u00b6")
109 self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
110 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
111 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
Walter Dörwald37c47282003-02-26 14:49:41 +0000112
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300113 def test_aliases(self):
114 # Check that the aliases defined in the NameAliases.txt file work.
115 # This should be updated when new aliases are added or the file
116 # should be downloaded and parsed instead. See #12753.
117 aliases = [
118 ('LATIN CAPITAL LETTER GHA', 0x01A2),
119 ('LATIN SMALL LETTER GHA', 0x01A3),
120 ('KANNADA LETTER LLLA', 0x0CDE),
121 ('LAO LETTER FO FON', 0x0E9D),
122 ('LAO LETTER FO FAY', 0x0E9F),
123 ('LAO LETTER RO', 0x0EA3),
124 ('LAO LETTER LO', 0x0EA5),
125 ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
126 ('YI SYLLABLE ITERATION MARK', 0xA015),
127 ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
128 ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
129 ]
130 for alias, codepoint in aliases:
131 self.checkletter(alias, chr(codepoint))
132 name = unicodedata.name(chr(codepoint))
133 self.assertNotEqual(name, alias)
134 self.assertEqual(unicodedata.lookup(alias),
135 unicodedata.lookup(name))
136 with self.assertRaises(KeyError):
137 unicodedata.ucd_3_2_0.lookup(alias)
138
139 def test_aliases_names_in_pua_range(self):
140 # We are storing aliases in the PUA 15, but their names shouldn't leak
141 for cp in range(0xf0000, 0xf0100):
142 with self.assertRaises(ValueError) as cm:
143 unicodedata.name(chr(cp))
144 self.assertEqual(str(cm.exception), 'no such name')
145
146 def test_named_sequences_names_in_pua_range(self):
147 # We are storing named seq in the PUA 15, but their names shouldn't leak
148 for cp in range(0xf0100, 0xf0fff):
149 with self.assertRaises(ValueError) as cm:
150 unicodedata.name(chr(cp))
151 self.assertEqual(str(cm.exception), 'no such name')
152
153 def test_named_sequences_sample(self):
154 # Check a few named sequences. See #12753.
155 sequences = [
156 ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
157 ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
158 ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
159 ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
160 ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
161 ]
162 for seqname, codepoints in sequences:
163 self.assertEqual(unicodedata.lookup(seqname), codepoints)
164 with self.assertRaises(SyntaxError):
165 self.checkletter(seqname, None)
166 with self.assertRaises(KeyError):
167 unicodedata.ucd_3_2_0.lookup(seqname)
168
169 def test_named_sequences_full(self):
170 # Check all the named sequences
171 url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
172 unicodedata.unidata_version)
173 try:
174 testdata = support.open_urlresource(url, encoding="utf-8",
175 check=check_version)
176 except (IOError, HTTPException):
177 self.skipTest("Could not retrieve " + url)
178 self.addCleanup(testdata.close)
179 for line in testdata:
180 line = line.strip()
181 if not line or line.startswith('#'):
182 continue
183 seqname, codepoints = line.split(';')
184 codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
185 self.assertEqual(unicodedata.lookup(seqname), codepoints)
186 with self.assertRaises(SyntaxError):
187 self.checkletter(seqname, None)
188 with self.assertRaises(KeyError):
189 unicodedata.ucd_3_2_0.lookup(seqname)
190
Walter Dörwald37c47282003-02-26 14:49:41 +0000191 def test_errors(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000192 self.assertRaises(TypeError, unicodedata.name)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000193 self.assertRaises(TypeError, unicodedata.name, 'xx')
Walter Dörwald37c47282003-02-26 14:49:41 +0000194 self.assertRaises(TypeError, unicodedata.lookup)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000195 self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
Walter Dörwald37c47282003-02-26 14:49:41 +0000196
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000197 def test_strict_error_handling(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000198 # bogus character name
199 self.assertRaises(
200 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000201 str, b"\\N{blah}", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000202 )
203 # long bogus character name
204 self.assertRaises(
205 UnicodeError,
Guido van Rossum9c627722007-08-27 18:31:48 +0000206 str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000207 )
208 # missing closing brace
209 self.assertRaises(
210 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000211 str, b"\\N{SPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000212 )
213 # missing opening brace
214 self.assertRaises(
215 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000216 str, b"\\NSPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000217 )
218
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200219 @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
220 "needs UINT_MAX < SIZE_MAX")
Serhiy Storchakae45dac42013-01-21 20:23:58 +0200221 @support.bigmemtest(size=_testcapi.UINT_MAX + 1,
Serhiy Storchaka5070c272013-01-25 10:13:37 +0200222 memuse=2 + 1, dry_run=False)
Serhiy Storchakae45dac42013-01-21 20:23:58 +0200223 def test_issue16335(self, size):
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200224 # very very long bogus character name
Serhiy Storchakae45dac42013-01-21 20:23:58 +0200225 x = b'\\N{SPACE' + b'x' * (_testcapi.UINT_MAX + 1) + b'}'
226 self.assertEqual(len(x), len(b'\\N{SPACE}') +
227 (_testcapi.UINT_MAX + 1))
228 self.assertRaisesRegex(UnicodeError,
229 'unknown Unicode character name',
230 x.decode, 'unicode-escape'
231 )
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200232
233
def test_main():
    """Run every test of UnicodeNamesTest through support.run_unittest()."""
    test_classes = (UnicodeNamesTest,)
    support.run_unittest(*test_classes)


# Running the module as a script executes the whole suite.
if __name__ == "__main__":
    test_main()