""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import ast
import unittest
import unicodedata

from test import support
from http.client import HTTPException
from test.test_normalization import check_version

# C integer limits used to size and gate the big-memory test below.
# When _testcapi cannot be imported, all three names share one fallback
# value, so the ``INT_MAX < PY_SSIZE_T_MAX`` skip condition on that test is
# false and the test is skipped rather than run with made-up limits.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Exercises named-character string escapes, ``unicodedata.name`` and
    ``unicodedata.lookup`` (including aliases and named sequences), and the
    error handling of the ``unicode-escape`` codec for malformed escapes.
    """

    def checkletter(self, name, code):
        """Parse the named-character escape for *name* and compare to *code*.

        Asserts that the resulting one-character string equals *code* and
        returns that string.  A name the parser rejects surfaces as the
        SyntaxError raised while parsing the literal.
        """
        # Helper that puts all \N escapes inside raw strings that are only
        # parsed at run time, to make sure this script runs even if the
        # compiler chokes on \N escapes.
        # ast.literal_eval is used rather than eval because some names come
        # from a downloaded data file (see test_named_sequences_full); it
        # accepts only literals, so a hostile name cannot execute code.
        res = ast.literal_eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() stops at the shorter input, so surplus names would go
        # unchecked; make a length mismatch an explicit failure.
        self.assertEqual(len(chars), len(string))
        self.assertEqual(
            "".join(self.checkletter(name, char)
                    for name, char in zip(chars, string)),
            string
        )

    def test_ascii_letters(self):
        # Round-trip every lowercase ASCII letter through lookup() and
        # name().  The upper bound is ord("z") + 1 so that "z" is included.
        for code_point in range(ord("a"), ord("z") + 1):
            name = "LATIN SMALL LETTER %s" % chr(code_point).upper()
            char = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(char), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last syllable checked above must
        # not have a name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")

    def test_bmp_characters(self):
        # Every named character below U+10000 must round-trip through
        # lookup(); unnamed code points (name() returns None) are skipped.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            # The alias resolves to the character but is never reported as
            # its name.
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            # The 3.2.0 database must not know these aliases.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # NOTE(review): the upper bound excludes U+F0FFF itself; confirm
        # that is intended.
        for cp in range(0xf0100, 0xf0fff):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            # Named sequences are not valid in \N{...} string escapes.
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            # The 3.2.0 database must not know named sequences.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Skip blank lines and comments in the data file.
            if not line or line.startswith('#'):
                continue
            # Each data line is "<sequence name>;<space-separated hex code
            # points>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    # NOTE(review): the skip condition compares INT_MAX with PY_SSIZE_T_MAX
    # while the message talks about UINT_MAX and SIZE_MAX; confirm the two
    # are meant to be equivalent.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()