blob: fbbe78b3166e970762b1d7595e42f85085e94bfa [file] [log] [blame]
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00001""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
Fredrik Lundh06d12682001-01-24 07:59:11 +00004Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +00005
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +020011import _testcapi
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000012
Benjamin Petersonee8712c2008-05-20 21:35:26 +000013from test import support
Fredrik Lundhee865c62001-01-19 11:00:42 +000014
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character string escapes as well as the
    ``unicodedata.name`` and ``unicodedata.lookup`` functions.
    """

    def checkletter(self, name, code):
        """Check that the named-character escape for *name* yields *code*.

        name -- Unicode character name to place inside the escape.
        code -- the one-character string the escape is expected to produce.

        Returns the character produced by the escape so that callers can
        assemble larger strings from individual lookups.
        """
        # The \N escape is built inside an eval'd raw string to make sure
        # this script runs even if the compiler chokes on \N escapes.
        # Only names hard-coded in this file are ever passed to eval here.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        """General lookup test; also checks names are case-insensitive."""
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        expected = "The rEd fOx ate the sheep."

        # zip() silently truncates to the shorter input, so make sure every
        # name is paired with exactly one expected character.
        self.assertEqual(len(chars), len(expected))

        decoded = "".join(
            [self.checkletter(name, char) for name, char in zip(chars, expected)]
        )
        self.assertEqual(decoded, expected)

    def test_ascii_letters(self):
        """Round-trip every lowercase ASCII letter through lookup()/name()."""
        import unicodedata

        # The upper bound is ord("z") + 1 so that "z" itself is included;
        # range() excludes its stop value.
        for codepoint in range(ord("a"), ord("z") + 1):
            name = "LATIN SMALL LETTER %s" % chr(codepoint).upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        """Check a sample of Hangul syllable names against their characters."""
        import unicodedata

        syllables = [
            ("HANGUL SYLLABLE GA", "\uac00"),
            ("HANGUL SYLLABLE GGWEOSS", "\uafe8"),
            ("HANGUL SYLLABLE DOLS", "\ub3d0"),
            ("HANGUL SYLLABLE RYAN", "\ub7b8"),
            ("HANGUL SYLLABLE MWIK", "\ubba0"),
            ("HANGUL SYLLABLE BBWAEM", "\ubf88"),
            ("HANGUL SYLLABLE SSEOL", "\uc370"),
            ("HANGUL SYLLABLE YI", "\uc758"),
            ("HANGUL SYLLABLE JJYOSS", "\ucb40"),
            ("HANGUL SYLLABLE KYEOLS", "\ucf28"),
            ("HANGUL SYLLABLE PAN", "\ud310"),
            ("HANGUL SYLLABLE HWEOK", "\ud6f8"),
            ("HANGUL SYLLABLE HIH", "\ud7a3"),
        ]
        for name, char in syllables:
            self.checkletter(name, char)

        # U+D7A4 is the code point right after the last syllable checked
        # above (U+D7A3); asking for its name must raise ValueError.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        """Check CJK UNIFIED IDEOGRAPH-XXXX names against their characters."""
        ideographs = [
            ("CJK UNIFIED IDEOGRAPH-3400", "\u3400"),
            ("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5"),
            ("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00"),
            ("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB"),
            ("CJK UNIFIED IDEOGRAPH-20000", "\U00020000"),
            ("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6"),
            ("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700"),
            ("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734"),
            ("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740"),
            ("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D"),
        ]
        for name, char in ideographs:
            self.checkletter(name, char)

    def test_bmp_characters(self):
        """Every named character below 0x10000 must round-trip via lookup()."""
        import unicodedata

        for code in range(0x10000):
            char = chr(code)
            # A None default means "this code point has no name"; skip those.
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        """Check a handful of symbol names against their characters."""
        symbols = [
            ("PILCROW SIGN", "\u00b6"),
            ("REPLACEMENT CHARACTER", "\uFFFD"),
            ("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F"),
            ("FULLWIDTH LATIN SMALL LETTER A", "\uFF41"),
        ]
        for name, char in symbols:
            self.checkletter(name, char)

    def test_errors(self):
        """Bad arguments to name()/lookup() raise the expected exceptions."""
        import unicodedata

        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        """Malformed escapes raise UnicodeError under 'strict' decoding."""
        malformed = [
            # bogus character name
            b"\\N{blah}",
            # long bogus character name
            bytes("\\N{%s}" % ("x" * 100000), "ascii"),
            # missing closing brace
            b"\\N{SPACE",
            # missing opening brace
            b"\\NSPACE",
        ]
        for raw in malformed:
            self.assertRaises(
                UnicodeError,
                str, raw, 'unicode-escape', 'strict'
            )

    @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
                         "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=_testcapi.UINT_MAX + 1,
                        memuse=2 + 4 // len('\U00010000'), dry_run=False)
    def test_issue16335(self, size):
        """A bogus name longer than UINT_MAX must be rejected cleanly."""
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (_testcapi.UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') +
                                 (_testcapi.UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +0200158
159
def test_main():
    """Run every test case defined in this module."""
    test_classes = (UnicodeNamesTest,)
    support.run_unittest(*test_classes)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test_main()