blob: fd620f01e4754eedb657c7cc6edf6dd72f81eb56 [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Marc-André Lemburg36619082001-01-17 19:11:13 +00009
Walter Dörwald37c47282003-02-26 14:49:41 +000010import unittest
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000011
Benjamin Petersonee8712c2008-05-20 21:35:26 +000012from test import support
Fredrik Lundhee865c62001-01-19 11:00:42 +000013
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers ``\\N{...}`` escapes in string literals, the ``unicodedata.name``
    and ``unicodedata.lookup`` functions, and the error behaviour of the
    ``unicode-escape`` codec for malformed ``\\N`` escapes.
    """

    def checkletter(self, name, code):
        """Evaluate ``"\\N{name}"`` and assert that it equals *code*.

        Args:
            name: Unicode character name to place inside the ``\\N{...}``
                escape.
            code: The string the escape is expected to evaluate to.

        Returns:
            The string produced by evaluating the escape.
        """
        # Helper that puts all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes.
        # NOTE: eval() is only ever fed the literal names hard-coded in
        # this test class, never external input.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        """Check a sentence spelled with ``\\N`` escapes, mixing name case."""
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() silently truncates to the shorter input, so make sure the
        # name list and the expected string really line up one-to-one.
        self.assertEqual(len(chars), len(string))

        self.assertEqual(
            "".join(self.checkletter(*args) for args in zip(chars, string)),
            string
        )

    def test_ascii_letters(self):
        """Round-trip every ASCII lowercase letter through lookup()/name()."""
        import unicodedata

        # The upper bound is ord("z") + 1 so that "z" itself is included;
        # range() excludes its stop value.
        for code_point in range(ord("a"), ord("z") + 1):
            char = chr(code_point)
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(code, char)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        """Check Hangul syllable names and the first code point past them."""
        import unicodedata

        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # U+D7A4 directly follows HANGUL SYLLABLE HIH (U+D7A3); name() is
        # expected to raise ValueError for it.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        """Check ``CJK UNIFIED IDEOGRAPH-XXXX`` names in and above the BMP."""
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")

    def test_bmp_characters(self):
        """Check that lookup() inverts name() for every named BMP character."""
        import unicodedata

        for code in range(0x10000):
            char = chr(code)
            # name() returns the supplied default (None) for code points
            # that have no name; those are skipped.
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        """Check a handful of symbol and compatibility character names."""
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_errors(self):
        """Check the exceptions raised by name() and lookup() on bad input."""
        import unicodedata

        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        """Check that malformed ``\\N`` escapes fail to decode in strict mode."""
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"),
            'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )
143
def test_main():
    """Run the UnicodeNamesTest case via ``support.run_unittest``."""
    test_classes = (UnicodeNamesTest,)
    support.run_unittest(*test_classes)
Walter Dörwald37c47282003-02-26 14:49:41 +0000146
# Run the tests when this file is executed directly as a script
# rather than imported as a module.
if __name__ == "__main__":
    test_main()