blob: 1d303dc97d73156fc806db3e9f0aa3d5c3f0064b [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +020011import _testcapi
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Walter Dörwald37c47282003-02-26 14:49:41 +000013from test import test_support
Fredrik Lundhee865c62001-01-19 11:00:42 +000014
Walter Dörwald37c47282003-02-26 14:49:41 +000015class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000016
Walter Dörwald37c47282003-02-26 14:49:41 +000017 def checkletter(self, name, code):
18 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000019 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000020 # chokes on \N escapes
21 res = eval(ur'u"\N{%s}"' % name)
22 self.assertEqual(res, code)
23 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000024
Walter Dörwald37c47282003-02-26 14:49:41 +000025 def test_general(self):
26 # General and case insensitivity test:
27 chars = [
28 "LATIN CAPITAL LETTER T",
29 "LATIN SMALL LETTER H",
30 "LATIN SMALL LETTER E",
31 "SPACE",
32 "LATIN SMALL LETTER R",
33 "LATIN CAPITAL LETTER E",
34 "LATIN SMALL LETTER D",
35 "SPACE",
36 "LATIN SMALL LETTER f",
37 "LATIN CAPITAL LeTtEr o",
38 "LATIN SMaLl LETTER x",
39 "SPACE",
40 "LATIN SMALL LETTER A",
41 "LATIN SMALL LETTER T",
42 "LATIN SMALL LETTER E",
43 "SPACE",
44 "LATIN SMALL LETTER T",
45 "LATIN SMALL LETTER H",
46 "LATIN SMALL LETTER E",
47 "SpAcE",
48 "LATIN SMALL LETTER S",
49 "LATIN SMALL LETTER H",
50 "LATIN small LETTER e",
51 "LATIN small LETTER e",
52 "LATIN SMALL LETTER P",
53 "FULL STOP"
54 ]
55 string = u"The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000056
Walter Dörwald37c47282003-02-26 14:49:41 +000057 self.assertEqual(
58 u"".join([self.checkletter(*args) for args in zip(chars, string)]),
59 string
60 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000061
Walter Dörwald37c47282003-02-26 14:49:41 +000062 def test_ascii_letters(self):
63 import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000064
Walter Dörwald37c47282003-02-26 14:49:41 +000065 for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
66 name = "LATIN SMALL LETTER %s" % char.upper()
67 code = unicodedata.lookup(name)
68 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000069
Walter Dörwald37c47282003-02-26 14:49:41 +000070 def test_hangul_syllables(self):
71 self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
72 self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
73 self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
74 self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
75 self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
76 self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
77 self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
78 self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
79 self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
80 self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
81 self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
82 self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
83 self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000084
Walter Dörwald37c47282003-02-26 14:49:41 +000085 import unicodedata
86 self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000087
Walter Dörwald37c47282003-02-26 14:49:41 +000088 def test_cjk_unified_ideographs(self):
89 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
90 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
91 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
92 self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
93 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
94 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000095
Walter Dörwald37c47282003-02-26 14:49:41 +000096 def test_bmp_characters(self):
97 import unicodedata
98 count = 0
99 for code in xrange(0x10000):
100 char = unichr(code)
101 name = unicodedata.name(char, None)
102 if name is not None:
103 self.assertEqual(unicodedata.lookup(name), char)
104 count += 1
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000105
Walter Dörwald37c47282003-02-26 14:49:41 +0000106 def test_misc_symbols(self):
107 self.checkletter("PILCROW SIGN", u"\u00b6")
108 self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
109 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
110 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
111
112 def test_errors(self):
113 import unicodedata
114 self.assertRaises(TypeError, unicodedata.name)
115 self.assertRaises(TypeError, unicodedata.name, u'xx')
116 self.assertRaises(TypeError, unicodedata.lookup)
117 self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
118
119 def test_strict_eror_handling(self):
120 # bogus character name
121 self.assertRaises(
122 UnicodeError,
123 unicode, "\\N{blah}", 'unicode-escape', 'strict'
124 )
125 # long bogus character name
126 self.assertRaises(
127 UnicodeError,
128 unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
129 )
130 # missing closing brace
131 self.assertRaises(
132 UnicodeError,
133 unicode, "\\N{SPACE", 'unicode-escape', 'strict'
134 )
135 # missing opening brace
136 self.assertRaises(
137 UnicodeError,
138 unicode, "\\NSPACE", 'unicode-escape', 'strict'
139 )
140
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200141 @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
142 "needs UINT_MAX < SIZE_MAX")
143 def test_issue16335(self):
144 # very very long bogus character name
145 try:
146 x = b'\\N{SPACE' + b'x' * int(_testcapi.UINT_MAX + 1) + b'}'
147 except MemoryError:
148 raise unittest.SkipTest("not enough memory")
149 self.assertEqual(len(x), len(b'\\N{SPACE}') + (_testcapi.UINT_MAX + 1))
150 self.assertRaisesRegex(UnicodeError,
151 'unknown Unicode character name',
152 x.decode, 'unicode-escape'
153 )
154
155
def test_main():
    """Run all UnicodeNamesTest cases through test_support.run_unittest()."""
    test_classes = (UnicodeNamesTest,)
    test_support.run_unittest(*test_classes)
Walter Dörwald37c47282003-02-26 14:49:41 +0000158
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()