blob: e7b8bbdea3597285ad9b6e6f163e848bfacfe49f [file] [log] [blame]
""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
Barry Warsaw04f357c2002-07-23 19:04:11 +00009from test.test_support import verify, verbose
Marc-André Lemburg36619082001-01-17 19:11:13 +000010
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000011print 'Testing General Unicode Character Name, and case insensitivity...',
12
13# General and case insensitivity test:
Fredrik Lundhf6056062001-01-20 11:15:25 +000014try:
15 # put all \N escapes inside exec'd raw strings, to make sure this
16 # script runs even if the compiler chokes on \N escapes
17 exec r"""
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +000018s = u"\N{LATIN CAPITAL LETTER T}" \
19 u"\N{LATIN SMALL LETTER H}" \
20 u"\N{LATIN SMALL LETTER E}" \
21 u"\N{SPACE}" \
22 u"\N{LATIN SMALL LETTER R}" \
23 u"\N{LATIN CAPITAL LETTER E}" \
24 u"\N{LATIN SMALL LETTER D}" \
25 u"\N{SPACE}" \
26 u"\N{LATIN SMALL LETTER f}" \
27 u"\N{LATIN CAPITAL LeTtEr o}" \
28 u"\N{LATIN SMaLl LETTER x}" \
29 u"\N{SPACE}" \
30 u"\N{LATIN SMALL LETTER A}" \
31 u"\N{LATIN SMALL LETTER T}" \
32 u"\N{LATIN SMALL LETTER E}" \
33 u"\N{SPACE}" \
34 u"\N{LATIN SMALL LETTER T}" \
35 u"\N{LATIN SMALL LETTER H}" \
36 u"\N{LATIN SMALL LETTER E}" \
37 u"\N{SpAcE}" \
38 u"\N{LATIN SMALL LETTER S}" \
39 u"\N{LATIN SMALL LETTER H}" \
40 u"\N{LATIN SMALL LETTER E}" \
41 u"\N{LATIN SMALL LETTER E}" \
42 u"\N{LATIN SMALL LETTER P}" \
43 u"\N{FULL STOP}"
Marc-André Lemburg36619082001-01-17 19:11:13 +000044verify(s == u"The rEd fOx ate the sheep.", s)
Fredrik Lundhf6056062001-01-20 11:15:25 +000045"""
46except UnicodeError, v:
47 print v
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000048print "done."
Fredrik Lundhee865c62001-01-19 11:00:42 +000049
Fredrik Lundh06d12682001-01-24 07:59:11 +000050import unicodedata
Fredrik Lundhee865c62001-01-19 11:00:42 +000051
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000052print "Testing name to code mapping....",
Fredrik Lundhee865c62001-01-19 11:00:42 +000053for char in "SPAM":
54 name = "LATIN SMALL LETTER %s" % char
Fredrik Lundh06d12682001-01-24 07:59:11 +000055 code = unicodedata.lookup(name)
56 verify(unicodedata.name(code) == name)
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000057print "done."
Fredrik Lundhee865c62001-01-19 11:00:42 +000058
Martin v. Löwis8579efc2002-11-23 17:11:42 +000059print "Testing hangul syllable names....",
60exec r"""
61verify(u"\N{HANGUL SYLLABLE GA}" == u"\uac00")
62verify(u"\N{HANGUL SYLLABLE GGWEOSS}" == u"\uafe8")
63verify(u"\N{HANGUL SYLLABLE DOLS}" == u"\ub3d0")
64verify(u"\N{HANGUL SYLLABLE RYAN}" == u"\ub7b8")
65verify(u"\N{HANGUL SYLLABLE MWIK}" == u"\ubba0")
66verify(u"\N{HANGUL SYLLABLE BBWAEM}" == u"\ubf88")
67verify(u"\N{HANGUL SYLLABLE SSEOL}" == u"\uc370")
68verify(u"\N{HANGUL SYLLABLE YI}" == u"\uc758")
69verify(u"\N{HANGUL SYLLABLE JJYOSS}" == u"\ucb40")
70verify(u"\N{HANGUL SYLLABLE KYEOLS}" == u"\ucf28")
71verify(u"\N{HANGUL SYLLABLE PAN}" == u"\ud310")
72verify(u"\N{HANGUL SYLLABLE HWEOK}" == u"\ud6f8")
73verify(u"\N{HANGUL SYLLABLE HIH}" == u"\ud7a3")
74"""
75try:
76 unicodedata.name(u"\ud7a4")
77except ValueError:
78 pass
79else:
80 raise AssertionError, "Found name for U+D7A4"
81print "done."
82
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000083print "Testing names of CJK unified ideographs....",
84exec r"""
85verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
86verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
87verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
88verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
89verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
90verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
91"""
92print "done."
93
94print "Testing code to name mapping for all BMP characters....",
Fredrik Lundh2acb54a2001-01-19 11:13:46 +000095count = 0
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +000096for code in range(0x10000):
Fredrik Lundhee865c62001-01-19 11:00:42 +000097 try:
Fredrik Lundh06d12682001-01-24 07:59:11 +000098 char = unichr(code)
99 name = unicodedata.name(char)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000100 except (KeyError, ValueError):
Fredrik Lundhee865c62001-01-19 11:00:42 +0000101 pass
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000102 else:
103 verify(unicodedata.lookup(name) == char)
104 count += 1
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000105print "done."
106
Fredrik Lundh2acb54a2001-01-19 11:13:46 +0000107print "Found", count, "characters in the unicode name database"
108
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000109# misc. symbol testing
110print "Testing misc. symbols for unicode character name expansion....",
Fredrik Lundhf6056062001-01-20 11:15:25 +0000111exec r"""
Marc-André Lemburg36619082001-01-17 19:11:13 +0000112verify(u"\N{PILCROW SIGN}" == u"\u00b6")
113verify(u"\N{REPLACEMENT CHARACTER}" == u"\uFFFD")
114verify(u"\N{HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK}" == u"\uFF9F")
115verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41")
Fredrik Lundhf6056062001-01-20 11:15:25 +0000116"""
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000117print "done."
118
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000119# strict error testing:
120print "Testing unicode character name expansion strict error handling....",
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000121try:
Fred Drake004d5e62000-10-23 17:22:08 +0000122 unicode("\N{blah}", 'unicode-escape', 'strict')
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000123except UnicodeError:
Fred Drake004d5e62000-10-23 17:22:08 +0000124 pass
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000125else:
Fred Drake004d5e62000-10-23 17:22:08 +0000126 raise AssertionError, "failed to raise an exception when given a bogus character name"
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000127
128try:
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000129 unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
130except UnicodeError:
131 pass
132else:
133 raise AssertionError, "failed to raise an exception when given a very " \
134 "long bogus character name"
135
136try:
Fred Drake004d5e62000-10-23 17:22:08 +0000137 unicode("\N{SPACE", 'unicode-escape', 'strict')
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000138except UnicodeError:
Fred Drake004d5e62000-10-23 17:22:08 +0000139 pass
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000140else:
Fred Drake004d5e62000-10-23 17:22:08 +0000141 raise AssertionError, "failed to raise an exception for a missing closing brace."
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000142
143try:
Fred Drake004d5e62000-10-23 17:22:08 +0000144 unicode("\NSPACE", 'unicode-escape', 'strict')
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000145except UnicodeError:
Fred Drake004d5e62000-10-23 17:22:08 +0000146 pass
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000147else:
Fred Drake004d5e62000-10-23 17:22:08 +0000148 raise AssertionError, "failed to raise an exception for a missing opening brace."
Marc-André Lemburg6cdec2e2000-06-30 09:45:20 +0000149print "done."