blob: 4409deb2d177bc45a9aae8b2380ae6f8212ca59c [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Serhiy Storchakae3adb432013-01-21 20:23:01 +020011import sys
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Walter Dörwald37c47282003-02-26 14:49:41 +000013from test import test_support
Fredrik Lundhee865c62001-01-19 11:00:42 +000014
# Integer limits used to size and gate the big-memory test below.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # _testcapi is not importable here, so use one large placeholder for
    # all three limits.  Because they are then equal, any guard of the
    # form ``INT_MAX < PY_SSIZE_T_MAX`` evaluates to False.
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
19
Walter Dörwald37c47282003-02-26 14:49:41 +000020class UnicodeNamesTest(unittest.TestCase):
Fredrik Lundhee865c62001-01-19 11:00:42 +000021
Walter Dörwald37c47282003-02-26 14:49:41 +000022 def checkletter(self, name, code):
23 # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters669454e2003-03-07 17:30:48 +000024 # to make sure this script runs even if the compiler
Walter Dörwald37c47282003-02-26 14:49:41 +000025 # chokes on \N escapes
26 res = eval(ur'u"\N{%s}"' % name)
27 self.assertEqual(res, code)
28 return res
Fredrik Lundhee865c62001-01-19 11:00:42 +000029
Walter Dörwald37c47282003-02-26 14:49:41 +000030 def test_general(self):
31 # General and case insensitivity test:
32 chars = [
33 "LATIN CAPITAL LETTER T",
34 "LATIN SMALL LETTER H",
35 "LATIN SMALL LETTER E",
36 "SPACE",
37 "LATIN SMALL LETTER R",
38 "LATIN CAPITAL LETTER E",
39 "LATIN SMALL LETTER D",
40 "SPACE",
41 "LATIN SMALL LETTER f",
42 "LATIN CAPITAL LeTtEr o",
43 "LATIN SMaLl LETTER x",
44 "SPACE",
45 "LATIN SMALL LETTER A",
46 "LATIN SMALL LETTER T",
47 "LATIN SMALL LETTER E",
48 "SPACE",
49 "LATIN SMALL LETTER T",
50 "LATIN SMALL LETTER H",
51 "LATIN SMALL LETTER E",
52 "SpAcE",
53 "LATIN SMALL LETTER S",
54 "LATIN SMALL LETTER H",
55 "LATIN small LETTER e",
56 "LATIN small LETTER e",
57 "LATIN SMALL LETTER P",
58 "FULL STOP"
59 ]
60 string = u"The rEd fOx ate the sheep."
Martin v. Löwis8579efc2002-11-23 17:11:42 +000061
Walter Dörwald37c47282003-02-26 14:49:41 +000062 self.assertEqual(
63 u"".join([self.checkletter(*args) for args in zip(chars, string)]),
64 string
65 )
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000066
Walter Dörwald37c47282003-02-26 14:49:41 +000067 def test_ascii_letters(self):
68 import unicodedata
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000069
Walter Dörwald37c47282003-02-26 14:49:41 +000070 for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
71 name = "LATIN SMALL LETTER %s" % char.upper()
72 code = unicodedata.lookup(name)
73 self.assertEqual(unicodedata.name(code), name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000074
Walter Dörwald37c47282003-02-26 14:49:41 +000075 def test_hangul_syllables(self):
76 self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
77 self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
78 self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
79 self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
80 self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
81 self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
82 self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
83 self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
84 self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
85 self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
86 self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
87 self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
88 self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000089
Walter Dörwald37c47282003-02-26 14:49:41 +000090 import unicodedata
91 self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000092
Walter Dörwald37c47282003-02-26 14:49:41 +000093 def test_cjk_unified_ideographs(self):
94 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
95 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
96 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
97 self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
98 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
99 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000100
Walter Dörwald37c47282003-02-26 14:49:41 +0000101 def test_bmp_characters(self):
102 import unicodedata
103 count = 0
104 for code in xrange(0x10000):
105 char = unichr(code)
106 name = unicodedata.name(char, None)
107 if name is not None:
108 self.assertEqual(unicodedata.lookup(name), char)
109 count += 1
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000110
Walter Dörwald37c47282003-02-26 14:49:41 +0000111 def test_misc_symbols(self):
112 self.checkletter("PILCROW SIGN", u"\u00b6")
113 self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
114 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
115 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
116
117 def test_errors(self):
118 import unicodedata
119 self.assertRaises(TypeError, unicodedata.name)
120 self.assertRaises(TypeError, unicodedata.name, u'xx')
121 self.assertRaises(TypeError, unicodedata.lookup)
122 self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
123
124 def test_strict_eror_handling(self):
125 # bogus character name
126 self.assertRaises(
127 UnicodeError,
128 unicode, "\\N{blah}", 'unicode-escape', 'strict'
129 )
130 # long bogus character name
131 self.assertRaises(
132 UnicodeError,
133 unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
134 )
135 # missing closing brace
136 self.assertRaises(
137 UnicodeError,
138 unicode, "\\N{SPACE", 'unicode-escape', 'strict'
139 )
140 # missing opening brace
141 self.assertRaises(
142 UnicodeError,
143 unicode, "\\NSPACE", 'unicode-escape', 'strict'
144 )
145
Serhiy Storchaka76249ea2014-02-07 10:06:05 +0200146 @test_support.cpython_only
147 @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
148 @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint")
149 @test_support.bigmemtest(minsize=UINT_MAX + 1,
Serhiy Storchaka7ee79a22013-01-25 10:03:12 +0200150 memuse=2 + 4 // len(u'\U00010000'))
Serhiy Storchakae3adb432013-01-21 20:23:01 +0200151 def test_issue16335(self, size):
Serhiy Storchaka7ee79a22013-01-25 10:03:12 +0200152 func = self.test_issue16335
153 if size < func.minsize:
154 raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
155 (func.minsize * func.memuse / float(1024**3),))
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200156 # very very long bogus character name
Serhiy Storchaka76249ea2014-02-07 10:06:05 +0200157 x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}'
158 self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
Serhiy Storchakae3adb432013-01-21 20:23:01 +0200159 self.assertRaisesRegexp(UnicodeError,
160 'unknown Unicode character name',
161 x.decode, 'unicode-escape'
162 )
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +0200163
164
def test_main():
    """Run UnicodeNamesTest via test_support.run_unittest."""
    test_support.run_unittest(UnicodeNamesTest)
Walter Dörwald37c47282003-02-26 14:49:41 +0000167
# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()