""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import sys
import unicodedata
import unittest

import _testcapi
from test import test_support

Walter Dörwald37c47282003-02-26 14:49:41 +000016class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000017
Walter Dörwald37c47282003-02-26 14:49:41 +000018 def checkletter(self, name, code):
19 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000020 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000021 # chokes on \N escapes
22 res = eval(ur'u"\N{%s}"' % name)
23 self.assertEqual(res, code)
24 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000025
Walter Dörwald37c47282003-02-26 14:49:41 +000026 def test_general(self):
27 # General and case insensitivity test:
28 chars = [
29 "LATIN CAPITAL LETTER T",
30 "LATIN SMALL LETTER H",
31 "LATIN SMALL LETTER E",
32 "SPACE",
33 "LATIN SMALL LETTER R",
34 "LATIN CAPITAL LETTER E",
35 "LATIN SMALL LETTER D",
36 "SPACE",
37 "LATIN SMALL LETTER f",
38 "LATIN CAPITAL LeTtEr o",
39 "LATIN SMaLl LETTER x",
40 "SPACE",
41 "LATIN SMALL LETTER A",
42 "LATIN SMALL LETTER T",
43 "LATIN SMALL LETTER E",
44 "SPACE",
45 "LATIN SMALL LETTER T",
46 "LATIN SMALL LETTER H",
47 "LATIN SMALL LETTER E",
48 "SpAcE",
49 "LATIN SMALL LETTER S",
50 "LATIN SMALL LETTER H",
51 "LATIN small LETTER e",
52 "LATIN small LETTER e",
53 "LATIN SMALL LETTER P",
54 "FULL STOP"
55 ]
56 string = u"The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000057
Walter Dörwald37c47282003-02-26 14:49:41 +000058 self.assertEqual(
59 u"".join([self.checkletter(*args) for args in zip(chars, string)]),
60 string
61 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000062
Walter Dörwald37c47282003-02-26 14:49:41 +000063 def test_ascii_letters(self):
64 import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000065
Walter Dörwald37c47282003-02-26 14:49:41 +000066 for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
67 name = "LATIN SMALL LETTER %s" % char.upper()
68 code = unicodedata.lookup(name)
69 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000070
Walter Dörwald37c47282003-02-26 14:49:41 +000071 def test_hangul_syllables(self):
72 self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
73 self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
74 self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
75 self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
76 self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
77 self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
78 self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
79 self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
80 self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
81 self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
82 self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
83 self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
84 self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000085
Walter Dörwald37c47282003-02-26 14:49:41 +000086 import unicodedata
87 self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000088
Walter Dörwald37c47282003-02-26 14:49:41 +000089 def test_cjk_unified_ideographs(self):
90 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
91 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
92 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
93 self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
94 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
95 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000096
Walter Dörwald37c47282003-02-26 14:49:41 +000097 def test_bmp_characters(self):
98 import unicodedata
99 count = 0
100 for code in xrange(0x10000):
101 char = unichr(code)
102 name = unicodedata.name(char, None)
103 if name is not None:
104 self.assertEqual(unicodedata.lookup(name), char)
105 count += 1
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000106
Walter Dörwald37c47282003-02-26 14:49:41 +0000107 def test_misc_symbols(self):
108 self.checkletter("PILCROW SIGN", u"\u00b6")
109 self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
110 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
111 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
112
113 def test_errors(self):
114 import unicodedata
115 self.assertRaises(TypeError, unicodedata.name)
116 self.assertRaises(TypeError, unicodedata.name, u'xx')
117 self.assertRaises(TypeError, unicodedata.lookup)
118 self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
119
120 def test_strict_eror_handling(self):
121 # bogus character name
122 self.assertRaises(
123 UnicodeError,
124 unicode, "\\N{blah}", 'unicode-escape', 'strict'
125 )
126 # long bogus character name
127 self.assertRaises(
128 UnicodeError,
129 unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
130 )
131 # missing closing brace
132 self.assertRaises(
133 UnicodeError,
134 unicode, "\\N{SPACE", 'unicode-escape', 'strict'
135 )
136 # missing opening brace
137 self.assertRaises(
138 UnicodeError,
139 unicode, "\\NSPACE", 'unicode-escape', 'strict'
140 )
141
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200142 @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
143 "needs UINT_MAX < SIZE_MAX")
Serhiy Storchakae3adb432013-01-21 20:23:01 +0200144 @unittest.skipUnless(_testcapi.UINT_MAX < sys.maxint,
145 "needs UINT_MAX < sys.maxint")
146 @test_support.bigmemtest(minsize=_testcapi.UINT_MAX + 1,
Serhiy Storchaka7ee79a22013-01-25 10:03:12 +0200147 memuse=2 + 4 // len(u'\U00010000'))
Serhiy Storchakae3adb432013-01-21 20:23:01 +0200148 def test_issue16335(self, size):
Serhiy Storchaka7ee79a22013-01-25 10:03:12 +0200149 func = self.test_issue16335
150 if size < func.minsize:
151 raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
152 (func.minsize * func.memuse / float(1024**3),))
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200153 # very very long bogus character name
Serhiy Storchakae3adb432013-01-21 20:23:01 +0200154 x = b'\\N{SPACE' + b'x' * int(_testcapi.UINT_MAX + 1) + b'}'
155 self.assertEqual(len(x), len(b'\\N{SPACE}') +
156 (_testcapi.UINT_MAX + 1))
157 self.assertRaisesRegexp(UnicodeError,
158 'unknown Unicode character name',
159 x.decode, 'unicode-escape'
160 )
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200161
162
def test_main():
    """Run the UnicodeNamesTest cases via test_support.run_unittest()."""
    test_classes = (UnicodeNamesTest,)
    test_support.run_unittest(*test_classes)
# Allow this test module to be executed directly as a script.
if __name__ == "__main__":
    test_main()