blob: 485e124947ca9e24c3827354372d5890cf6bd35e [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000011
Walter Dörwald37c47282003-02-26 14:49:41 +000012from test import test_support
Fredrik Lundhee865c62001-01-19 11:00:42 +000013
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character (backslash-N) string escapes as well as
    ``unicodedata.name`` and ``unicodedata.lookup``.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name* and check it.

        Args:
            name: Unicode character name to place inside the escape.
            code: The string the escape is expected to evaluate to.

        Returns:
            The string produced by evaluating the escape.
        """
        # Helper that puts all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes.
        # NOTE: eval() only ever sees names hard-coded in this test file;
        # never route external input through this helper.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() silently truncates to the shorter input, so make sure the
        # two fixtures stay in step before pairing them up.
        self.assertEqual(len(chars), len(string))
        self.assertEqual(
            "".join(self.checkletter(name, char)
                    for name, char in zip(chars, string)),
            string
        )

    def test_ascii_letters(self):
        import unicodedata

        # "+ 1" so that the range is inclusive of "z".
        for codepoint in range(ord("a"), ord("z") + 1):
            name = "LATIN SMALL LETTER %s" % chr(codepoint).upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last syllable checked above
        # must not have a name.
        import unicodedata
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")

    def test_bmp_characters(self):
        import unicodedata

        # Every named character below U+10000 must round-trip through
        # name() -> lookup().
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_errors(self):
        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_eror_handling(self):
        # The inputs are bytes: str(obj, encoding, errors) only decodes
        # bytes-like objects, so a str input would raise TypeError before
        # the unicode-escape codec is ever exercised.

        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{" + b"x" * 100000 + b"}", 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )
139
def test_main():
    """Hand UnicodeNamesTest to test_support.run_unittest."""
    test_support.run_unittest(UnicodeNamesTest)
Walter Dörwald37c47282003-02-26 14:49:41 +0000142
# Run the tests only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_main()