blob: 503df6a51d5c62b94f2412c75f920bfb613176d1 [file] [log] [blame]
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00001""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
Fredrik Lundh06d12682001-01-24 07:59:11 +00004Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00005
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Ezio Melotti931b8aa2011-10-21 21:57:36 +030011import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support
Ezio Melotti931b8aa2011-10-21 21:57:36 +030014from http.client import HTTPException
Fredrik Lundhee865c62001-01-19 11:00:42 +000015
# The C-level integer limits come from the optional _testcapi extension.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # _testcapi is not available: bind all three names to one placeholder.
    # INT_MAX then equals PY_SSIZE_T_MAX, so a check of the form
    # "INT_MAX < PY_SSIZE_T_MAX" evaluates to false.
    UINT_MAX = 2**64 - 1
    INT_MAX = UINT_MAX
    PY_SSIZE_T_MAX = UINT_MAX
20
Walter Dörwald37c47282003-02-26 14:49:41 +000021class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000022
Walter Dörwald37c47282003-02-26 14:49:41 +000023 def checkletter(self, name, code):
24 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000025 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000026 # chokes on \N escapes
Guido van Rossumbf4806b2007-07-21 00:15:34 +000027 res = eval(r'"\N{%s}"' % name)
Walter Dörwald37c47282003-02-26 14:49:41 +000028 self.assertEqual(res, code)
29 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000030
Walter Dörwald37c47282003-02-26 14:49:41 +000031 def test_general(self):
32 # General and case insensitivity test:
33 chars = [
34 "LATIN CAPITAL LETTER T",
35 "LATIN SMALL LETTER H",
36 "LATIN SMALL LETTER E",
37 "SPACE",
38 "LATIN SMALL LETTER R",
39 "LATIN CAPITAL LETTER E",
40 "LATIN SMALL LETTER D",
41 "SPACE",
42 "LATIN SMALL LETTER f",
43 "LATIN CAPITAL LeTtEr o",
44 "LATIN SMaLl LETTER x",
45 "SPACE",
46 "LATIN SMALL LETTER A",
47 "LATIN SMALL LETTER T",
48 "LATIN SMALL LETTER E",
49 "SPACE",
50 "LATIN SMALL LETTER T",
51 "LATIN SMALL LETTER H",
52 "LATIN SMALL LETTER E",
53 "SpAcE",
54 "LATIN SMALL LETTER S",
55 "LATIN SMALL LETTER H",
56 "LATIN small LETTER e",
57 "LATIN small LETTER e",
58 "LATIN SMALL LETTER P",
59 "FULL STOP"
60 ]
Guido van Rossumef87d6e2007-05-02 19:09:54 +000061 string = "The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000062
Walter Dörwald37c47282003-02-26 14:49:41 +000063 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +000064 "".join([self.checkletter(*args) for args in zip(chars, string)]),
Walter Dörwald37c47282003-02-26 14:49:41 +000065 string
66 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000067
Walter Dörwald37c47282003-02-26 14:49:41 +000068 def test_ascii_letters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +000069 for char in "".join(map(chr, range(ord("a"), ord("z")))):
Walter Dörwald37c47282003-02-26 14:49:41 +000070 name = "LATIN SMALL LETTER %s" % char.upper()
71 code = unicodedata.lookup(name)
72 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000073
Walter Dörwald37c47282003-02-26 14:49:41 +000074 def test_hangul_syllables(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000075 self.checkletter("HANGUL SYLLABLE GA", "\uac00")
76 self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
77 self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
78 self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
79 self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
80 self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
81 self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
82 self.checkletter("HANGUL SYLLABLE YI", "\uc758")
83 self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
84 self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
85 self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
86 self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
87 self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000088
Guido van Rossumef87d6e2007-05-02 19:09:54 +000089 self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000090
Walter Dörwald37c47282003-02-26 14:49:41 +000091 def test_cjk_unified_ideographs(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +000092 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
93 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
94 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000095 self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
Guido van Rossumef87d6e2007-05-02 19:09:54 +000096 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
97 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000098 self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
99 self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
100 self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
101 self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000102
Walter Dörwald37c47282003-02-26 14:49:41 +0000103 def test_bmp_characters(self):
Guido van Rossum805365e2007-05-07 22:24:25 +0000104 for code in range(0x10000):
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000105 char = chr(code)
Walter Dörwald37c47282003-02-26 14:49:41 +0000106 name = unicodedata.name(char, None)
107 if name is not None:
108 self.assertEqual(unicodedata.lookup(name), char)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000109
Walter Dörwald37c47282003-02-26 14:49:41 +0000110 def test_misc_symbols(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000111 self.checkletter("PILCROW SIGN", "\u00b6")
112 self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
113 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
114 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
Walter Dörwald37c47282003-02-26 14:49:41 +0000115
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300116 def test_aliases(self):
117 # Check that the aliases defined in the NameAliases.txt file work.
118 # This should be updated when new aliases are added or the file
119 # should be downloaded and parsed instead. See #12753.
120 aliases = [
121 ('LATIN CAPITAL LETTER GHA', 0x01A2),
122 ('LATIN SMALL LETTER GHA', 0x01A3),
123 ('KANNADA LETTER LLLA', 0x0CDE),
124 ('LAO LETTER FO FON', 0x0E9D),
125 ('LAO LETTER FO FAY', 0x0E9F),
126 ('LAO LETTER RO', 0x0EA3),
127 ('LAO LETTER LO', 0x0EA5),
128 ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
129 ('YI SYLLABLE ITERATION MARK', 0xA015),
130 ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
131 ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
132 ]
133 for alias, codepoint in aliases:
134 self.checkletter(alias, chr(codepoint))
135 name = unicodedata.name(chr(codepoint))
136 self.assertNotEqual(name, alias)
137 self.assertEqual(unicodedata.lookup(alias),
138 unicodedata.lookup(name))
139 with self.assertRaises(KeyError):
140 unicodedata.ucd_3_2_0.lookup(alias)
141
142 def test_aliases_names_in_pua_range(self):
143 # We are storing aliases in the PUA 15, but their names shouldn't leak
144 for cp in range(0xf0000, 0xf0100):
145 with self.assertRaises(ValueError) as cm:
146 unicodedata.name(chr(cp))
147 self.assertEqual(str(cm.exception), 'no such name')
148
149 def test_named_sequences_names_in_pua_range(self):
150 # We are storing named seq in the PUA 15, but their names shouldn't leak
151 for cp in range(0xf0100, 0xf0fff):
152 with self.assertRaises(ValueError) as cm:
153 unicodedata.name(chr(cp))
154 self.assertEqual(str(cm.exception), 'no such name')
155
156 def test_named_sequences_sample(self):
157 # Check a few named sequences. See #12753.
158 sequences = [
159 ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
160 ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
161 ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
162 ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
163 ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
164 ]
165 for seqname, codepoints in sequences:
166 self.assertEqual(unicodedata.lookup(seqname), codepoints)
167 with self.assertRaises(SyntaxError):
168 self.checkletter(seqname, None)
169 with self.assertRaises(KeyError):
170 unicodedata.ucd_3_2_0.lookup(seqname)
171
172 def test_named_sequences_full(self):
173 # Check all the named sequences
Greg Price1ad0c772019-09-10 02:29:26 -0700174 def check_version(testfile):
175 hdr = testfile.readline()
176 return unicodedata.unidata_version in hdr
Georg Brandl5a155082014-11-06 14:37:49 +0100177 url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300178 unicodedata.unidata_version)
179 try:
180 testdata = support.open_urlresource(url, encoding="utf-8",
181 check=check_version)
Andrew Svetlovf7a17b42012-12-25 16:47:37 +0200182 except (OSError, HTTPException):
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300183 self.skipTest("Could not retrieve " + url)
184 self.addCleanup(testdata.close)
185 for line in testdata:
186 line = line.strip()
187 if not line or line.startswith('#'):
188 continue
189 seqname, codepoints = line.split(';')
190 codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
191 self.assertEqual(unicodedata.lookup(seqname), codepoints)
192 with self.assertRaises(SyntaxError):
193 self.checkletter(seqname, None)
194 with self.assertRaises(KeyError):
195 unicodedata.ucd_3_2_0.lookup(seqname)
196
Walter Dörwald37c47282003-02-26 14:49:41 +0000197 def test_errors(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000198 self.assertRaises(TypeError, unicodedata.name)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000199 self.assertRaises(TypeError, unicodedata.name, 'xx')
Walter Dörwald37c47282003-02-26 14:49:41 +0000200 self.assertRaises(TypeError, unicodedata.lookup)
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000201 self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
Walter Dörwald37c47282003-02-26 14:49:41 +0000202
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000203 def test_strict_error_handling(self):
Walter Dörwald37c47282003-02-26 14:49:41 +0000204 # bogus character name
205 self.assertRaises(
206 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000207 str, b"\\N{blah}", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000208 )
209 # long bogus character name
210 self.assertRaises(
211 UnicodeError,
Guido van Rossum9c627722007-08-27 18:31:48 +0000212 str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000213 )
214 # missing closing brace
215 self.assertRaises(
216 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000217 str, b"\\N{SPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000218 )
219 # missing opening brace
220 self.assertRaises(
221 UnicodeError,
Guido van Rossumbf4806b2007-07-21 00:15:34 +0000222 str, b"\\NSPACE", 'unicode-escape', 'strict'
Walter Dörwald37c47282003-02-26 14:49:41 +0000223 )
224
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +0200225 @support.cpython_only
226 @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
227 @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
Serhiy Storchakae45dac42013-01-21 20:23:58 +0200228 def test_issue16335(self, size):
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200229 # very very long bogus character name
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +0200230 x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
231 self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
Serhiy Storchakae45dac42013-01-21 20:23:58 +0200232 self.assertRaisesRegex(UnicodeError,
233 'unknown Unicode character name',
234 x.decode, 'unicode-escape'
235 )
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200236
237
# Run the tests defined in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()