""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import ast
import string
import unicodedata
import unittest
from http.client import HTTPException

from test import support

# C integer limits used to size and gate the huge-name regression test
# (test_issue16335).  They come from the _testcapi helper extension when
# it can be imported.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # Fallback: all three limits are equal, so the
    # ``INT_MAX < PY_SSIZE_T_MAX`` guard on test_issue16335 is False and
    # that test is skipped instead of failing at import time.
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers name <-> character round trips through ``unicodedata.name`` and
    ``unicodedata.lookup``, named-character escapes in string literals, and
    error handling of the ``unicode-escape`` codec for malformed names.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name* and check it.

        Asserts that the evaluated literal equals *code* and returns the
        evaluated string so callers can assemble larger results from it.
        """
        # Helper that puts all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes
        res = ast.literal_eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        """Character names resolve regardless of the case they are written in."""
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        expected = "The rEd fOx ate the sheep."

        # zip() stops silently at the shorter input, so make sure the two
        # sequences cannot drift apart unnoticed.
        self.assertEqual(len(chars), len(expected))
        self.assertEqual(
            "".join(self.checkletter(*args) for args in zip(chars, expected)),
            expected
        )

    def test_ascii_letters(self):
        """Every ASCII lowercase letter round-trips through lookup() and name()."""
        # Iterate over the full alphabet, including "z".
        for char in string.ascii_lowercase:
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(code, char)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        """Hangul syllable names resolve to the expected code points."""
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last syllable checked above
        # must not have a name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        """CJK unified ideograph names resolve to the expected code points."""
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")

    def test_bmp_characters(self):
        """Every named BMP character can be looked up again by its name."""
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        """A handful of symbol names resolve to the expected code points."""
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        """Name aliases work in escapes and lookup(), but not in ucd_3_2_0."""
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            # name() must report the primary name, never the alias.
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        """Code points used internally for aliases must not expose a name."""
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        """Code points used internally for named sequences must not expose a name."""
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # range() excludes its upper bound, so 0xf1000 covers U+F0FFF too.
        for cp in range(0xf0100, 0xf1000):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        """Sample named sequences work in lookup() only."""
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            # Named sequences are rejected inside string-literal escapes ...
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            # ... and are unknown to the ucd_3_2_0 database object.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        """Check every entry of the downloaded NamedSequences.txt file.

        Skipped when the data file cannot be retrieved.
        """
        # Check all the named sequences
        def check_version(testfile):
            # Accept the file only if its first line mentions the Unicode
            # version this interpreter's unicodedata reports.
            hdr = testfile.readline()
            return unicodedata.unidata_version in hdr
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Each data line is "<sequence name>;<space-separated hex code points>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        """name() and lookup() reject missing, over-long and unknown arguments."""
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        """Malformed named escapes raise UnicodeError under 'strict' decoding."""
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    # NOTE(review): the skip condition compares INT_MAX while the message
    # talks about UINT_MAX -- confirm which one is intended.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        """A character name longer than UINT_MAX bytes is reported as unknown."""
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()