blob: e95f911d8eedddd2d7be8bc5d7030375f1bf1f9e [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Ezio Melotti931b8aa2011-10-21 21:57:36 +030011import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support
Ezio Melotti931b8aa2011-10-21 21:57:36 +030014from http.client import HTTPException
Fredrik Lundhee865c62001-01-19 11:00:42 +000015
# The C integer limits are only needed by the big-memory test further down.
# _testcapi is a CPython-only helper module, so fall back to one large shared
# value when it cannot be imported.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # With all three limits equal, the "INT_MAX < PY_SSIZE_T_MAX" guard on the
    # big-memory test evaluates to False and that test is skipped.
    UINT_MAX = 2**64 - 1
    PY_SSIZE_T_MAX = UINT_MAX
    INT_MAX = UINT_MAX
20
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character escapes in string literals as well as
    unicodedata.name() and unicodedata.lookup(), including name aliases,
    named sequences and error handling of the unicode-escape codec.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name* and compare to *code*.

        The escape is assembled at run time and handed to eval() so that this
        script still compiles even if the compiler chokes on such escapes.

        Returns the evaluated string.  Any exception raised by eval() (for
        example SyntaxError for a name the compiler rejects) propagates to
        the caller; several tests rely on that.
        """
        # NOTE(review): eval() is deliberate here -- the tests need the
        # compiler's own handling of the escape.  Most names are literals in
        # this file, but test_named_sequences_full passes names read from a
        # downloaded data file, so that input is trusted by this helper.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() stops at the shorter input, so make sure no name would be
        # silently left unchecked.
        self.assertEqual(len(chars), len(string))

        self.assertEqual(
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        # Round-trip every lowercase ASCII letter through lookup() and name().
        # range() excludes its stop value, hence the "+ 1" to include "z".
        for codepoint in range(ord("a"), ord("z") + 1):
            char = chr(codepoint)
            name = "LATIN SMALL LETTER %s" % char.upper()
            with self.subTest(name=name):
                code = unicodedata.lookup(name)
                self.assertEqual(code, char)
                self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        # Hangul syllable names are checked on a sample spread over the block.
        syllables = [
            ("HANGUL SYLLABLE GA", "\uac00"),
            ("HANGUL SYLLABLE GGWEOSS", "\uafe8"),
            ("HANGUL SYLLABLE DOLS", "\ub3d0"),
            ("HANGUL SYLLABLE RYAN", "\ub7b8"),
            ("HANGUL SYLLABLE MWIK", "\ubba0"),
            ("HANGUL SYLLABLE BBWAEM", "\ubf88"),
            ("HANGUL SYLLABLE SSEOL", "\uc370"),
            ("HANGUL SYLLABLE YI", "\uc758"),
            ("HANGUL SYLLABLE JJYOSS", "\ucb40"),
            ("HANGUL SYLLABLE KYEOLS", "\ucf28"),
            ("HANGUL SYLLABLE PAN", "\ud310"),
            ("HANGUL SYLLABLE HWEOK", "\ud6f8"),
            ("HANGUL SYLLABLE HIH", "\ud7a3"),
        ]
        for name, char in syllables:
            with self.subTest(name=name):
                self.checkletter(name, char)

        # The code point right after the last checked syllable has no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        # CJK ideograph names embed the code point in hexadecimal.
        ideographs = [
            ("CJK UNIFIED IDEOGRAPH-3400", "\u3400"),
            ("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5"),
            ("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00"),
            ("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB"),
            ("CJK UNIFIED IDEOGRAPH-20000", "\U00020000"),
            ("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6"),
            ("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700"),
            ("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734"),
            ("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740"),
            ("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D"),
            ("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A"),
        ]
        for name, char in ideographs:
            with self.subTest(name=name):
                self.checkletter(name, char)

    def test_bmp_characters(self):
        # Every named character below U+10000 must round-trip through
        # lookup().  No subTest here: the loop runs 65536 times.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
        ]
        for alias, codepoint in aliases:
            with self.subTest(alias=alias):
                self.checkletter(alias, chr(codepoint))
                # name() reports the primary name, never the alias ...
                name = unicodedata.name(chr(codepoint))
                self.assertNotEqual(name, alias)
                # ... but both spellings resolve to the same character.
                self.assertEqual(unicodedata.lookup(alias),
                                 unicodedata.lookup(name))
                # The 3.2.0 database must not know the alias.
                with self.assertRaises(KeyError):
                    unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # The stop value is exclusive: 0xf1000 makes U+F0FFF part of the check.
        for cp in range(0xf0100, 0xf1000):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            with self.subTest(seqname=seqname):
                self.assertEqual(unicodedata.lookup(seqname), codepoints)
                # Named sequences are not accepted in string-literal escapes.
                with self.assertRaises(SyntaxError):
                    self.checkletter(seqname, None)
                with self.assertRaises(KeyError):
                    unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
        def check_version(testfile):
            # Accept the file only if its first line mentions the Unicode
            # version of the running unicodedata module.
            hdr = testfile.readline()
            return unicodedata.unidata_version in hdr

        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith('#'):
                continue
            # Each record is "<name>;<space-separated hex code points>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            with self.subTest(seqname=seqname):
                self.assertEqual(unicodedata.lookup(seqname), codepoints)
                with self.assertRaises(SyntaxError):
                    self.checkletter(seqname, None)
                with self.assertRaises(KeyError):
                    unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        # Wrong argument counts/lengths raise TypeError; unknown names KeyError.
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # Malformed named escapes must be rejected by the unicode-escape
        # codec in strict mode.
        malformed = [
            # bogus character name
            b"\\N{blah}",
            # long bogus character name
            bytes("\\N{%s}" % ("x" * 100000), "ascii"),
            # missing closing brace
            b"\\N{SPACE",
            # missing opening brace
            b"\\NSPACE",
        ]
        for data in malformed:
            # Only a prefix goes into the subTest label; one input is ~100 kB.
            with self.subTest(data=data[:32]):
                self.assertRaises(
                    UnicodeError,
                    str, data, 'unicode-escape', 'strict'
                )

    # NOTE(review): the guard compares INT_MAX with PY_SSIZE_T_MAX while the
    # skip message talks about UINT_MAX and SIZE_MAX; both are kept as they
    # were -- confirm which wording is intended.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200237
238
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()