""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import hashlib
import subprocess
import sys
import test.test_support
import unittest

# Codec applied to the joined per-character results before they are hashed.
encoding = 'utf-8'


### Run tests

class UnicodeMethodsTest(unittest.TestCase):
    """Checksum test for the unicode string methods.

    Every code point below 65536 is run through the predicate and case
    mapping methods of the unicode type.  The results are folded into a
    SHA-1 digest which has to match ``expectedchecksum``.
    """

    # update this, if the database changes
    expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'

    def test_method_checksum(self):
        # (Comment instead of a docstring so unittest's verbose output keeps
        # showing the method name.)
        digest = hashlib.sha1()
        for codepoint in range(65536):
            char = unichr(codepoint)
            # The multi-character variants append a fixed ASCII tail.
            plus_lower = char + u'abc'
            plus_upper = char + u'ABC'
            plus_digits = char + u'123'
            plus_blanks = char + u' \t'
            fields = [
                # Predicates (single char); the bool result indexes u"01"
                u"01"[char.isalnum()],
                u"01"[char.isalpha()],
                u"01"[char.isdecimal()],
                u"01"[char.isdigit()],
                u"01"[char.islower()],
                u"01"[char.isnumeric()],
                u"01"[char.isspace()],
                u"01"[char.istitle()],
                u"01"[char.isupper()],

                # Predicates (multiple chars)
                u"01"[plus_lower.isalnum()],
                u"01"[plus_lower.isalpha()],
                u"01"[plus_digits.isdecimal()],
                u"01"[plus_digits.isdigit()],
                u"01"[plus_lower.islower()],
                u"01"[plus_digits.isnumeric()],
                u"01"[plus_blanks.isspace()],
                u"01"[plus_lower.istitle()],
                u"01"[plus_upper.isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                plus_lower.lower(),
                plus_upper.upper(),
                plus_lower.title(),
                plus_upper.title(),
            ]
            digest.update(u''.join(fields).encode(encoding))
        self.assertEqual(digest.hexdigest(), self.expectedchecksum)

class UnicodeDatabaseTest(unittest.TestCase):
    """Base class that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # Imported here rather than at module level: if unicodedata is not
        # available this raises an ImportError, but the other test cases
        # will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the per-test reference installed by setUp().
        delattr(self, 'db')

class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the module-level functions of unicodedata (``self.db``).

    Test methods carry comments rather than docstrings so that unittest's
    verbose output keeps showing the method names.
    """

    # update this, if the database changes
    expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'

    def test_function_checksum(self):
        # Fold the property values of every code point below 0x10000 into a
        # SHA-1 digest and compare it with the recorded checksum.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties (-1 is the default for chars without a value)
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        # Missing argument, multi-character string, char without a value
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A', None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A', None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'), '')
        self.assertEqual(self.db.decomposition(u'\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Each of these strings must come back unchanged from NFC.
        composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
                    u'Li\u030dt-s\u1e73\u0301',
                    u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    # The second half used to be the byte string
                    # 'u\u0938...' (prefix typed inside the quotes), which
                    # appended the ASCII escape text instead of the
                    # Devanagari characters it was meant to contain.
                    u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + u'\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = u'C\u0338' * 20 + u'C\u0327'
        b = u'C\u0338' * 20 + u'\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Byte string, empty string and multi-character string are rejected
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')

class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression and consistency tests involving the Unicode database.

    Test methods carry comments rather than docstrings so that unittest's
    verbose output keeps showing the method names.
    """

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # The literals containing backslash-N are raw so the backslash is
        # unambiguously passed through verbatim to the child interpreter.
        code = "import sys;" \
               "sys.modules['unicodedata'] = None;" \
               r"""eval("u'\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() reads the pipe while waiting for the child, so a
        # chatty child cannot block on a full pipe buffer, and it closes the
        # pipe afterwards (a bare wait() does neither).
        stderr = popen.communicate()[1]
        self.assertEqual(popen.returncode, 1)
        error = r"SyntaxError: (unicode error) \N escapes not supported " \
                "(can't load unicodedata module)"
        self.assertTrue(error in stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertTrue(count >= 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.digit(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertTrue(count >= 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertTrue(not self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        # (xrange avoids building a list of every code point first)
        self.assertEqual(
            [
                c for c in xrange(sys.maxunicode+1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

def test_main():
    """Hand every test case class of this module to test_support's runner."""
    cases = (UnicodeMiscTest, UnicodeMethodsTest, UnicodeFunctionsTest)
    test.test_support.run_unittest(*cases)

if __name__ == "__main__":
    test_main()