""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8
9import sys
10import unittest
Georg Brandlbffb0bc2006-04-30 08:57:35 +000011import hashlib
Benjamin Petersonc078f922008-11-21 22:27:24 +000012import subprocess
13import test.test_support
Guido van Rossum24bdb042000-03-28 20:29:59 +000014
# Codec used to encode the joined unicode test data before it is hashed.
encoding = 'utf-8'
16
Marc-André Lemburg6a20ee72000-09-26 16:18:58 +000017
18### Run tests
19
class UnicodeMethodsTest(unittest.TestCase):
    """Checksum test for the unicode string predicates and case mappings."""

    # update this, if the database changes
    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'

    def test_method_checksum(self):
        # Feed the method results for every code point in U+0000..U+FFFF into
        # one SHA-1 digest and compare it with the recorded checksum.
        digest = hashlib.sha1()
        for codepoint in range(0x10000):
            char = unichr(codepoint)

            # The character followed by a fixed suffix, used by the
            # multi-character variants below.
            plus_lower = char + u'abc'
            plus_upper = char + u'ABC'
            plus_digits = char + u'123'
            plus_space = char + u' \t'

            flags = (
                # Predicates (single char)
                char.isalnum(),
                char.isalpha(),
                char.isdecimal(),
                char.isdigit(),
                char.islower(),
                char.isnumeric(),
                char.isspace(),
                char.istitle(),
                char.isupper(),

                # Predicates (multiple chars)
                plus_lower.isalnum(),
                plus_lower.isalpha(),
                plus_digits.isdecimal(),
                plus_digits.isdigit(),
                plus_lower.islower(),
                plus_digits.isnumeric(),
                plus_space.isspace(),
                plus_lower.istitle(),
                plus_upper.isupper(),
            )

            mappings = (
                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                plus_lower.lower(),
                plus_upper.upper(),
                plus_lower.title(),
                plus_upper.title(),
            )

            # Each boolean indexes u"01", yielding u"0" or u"1".
            record = u''.join([u"01"[flag] for flag in flags])
            record += u''.join(mappings)
            digest.update(record.encode(encoding))

        self.assertEqual(digest.hexdigest(), self.expectedchecksum)
Guido van Rossum24bdb042000-03-28 20:29:59 +000067
class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import happens per test on purpose: if unicodedata is not
        # available this raises an ImportError here, while the other test
        # cases of the module will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the reference bound by setUp().
        delattr(self, 'db')
Guido van Rossum24bdb042000-03-28 20:29:59 +000078
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the lookup functions of the unicodedata module.

    ``self.db`` is the unicodedata module, bound by the inherited setUp().
    """

    # update this, if the database changes
    expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'

    def test_function_checksum(self):
        # Hash the properties of every code point in U+0000..U+FFFF and
        # compare the digest with the recorded checksum.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A', None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A', None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'), '')
        self.assertEqual(self.db.decomposition(u'\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Every sample is expected to come back unchanged from NFC.
        # The last fragment previously read 'u\u0938...' (the u prefix inside
        # the quotes), which made it a byte string of literal backslash
        # sequences instead of the intended unicode text.
        composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
                    u'Li\u030dt-s\u1e73\u0301',
                    u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + u'\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = u'C\u0338' * 20 + u'C\u0327'
        b = u'C\u0338' * 20 + u'\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        # The U+1176 input and expected strings are identical: NFC is
        # expected to leave that sequence unchanged.
        u1176_str_a = u'\u1100\u1176\u11a8'
        u1176_str_b = u'\u1100\u1176\u11a8'
        u11a7_str_a = u'\u1100\u1175\u11a7'
        u11a7_str_b = u'\uae30\u11a7'
        u11c3_str_a = u'\u1100\u1175\u11c3'
        u11c3_str_b = u'\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')
Walter Dörwald37c47282003-02-26 14:49:41 +0000232
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression and consistency tests involving the unicodedata module.

    ``self.db`` is the unicodedata module, bound by the inherited setUp().
    """

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # The backslashes are doubled so the child receives a literal
        # "\N{...}" sequence without relying on "\N" being an unrecognized
        # escape in this (non-raw) string literal.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("u'\\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains stderr while waiting for the child, so a full
        # pipe cannot block it, and the pipe is closed afterwards.
        _, stderr = popen.communicate()
        self.assertEqual(popen.returncode, 1)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in xrange(sys.maxunicode+1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

    def test_linebreak_7643(self):
        # Code points that splitlines() is expected to treat as line
        # boundaries; every other code point below U+10000 must not split.
        linebreaks = frozenset([0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                                0x1c, 0x1d, 0x1e, 0x2028, 0x2029])
        for i in range(0x10000):
            lines = (unichr(i) + u'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
321
def test_main():
    """Run the three concrete test case classes of this module."""
    cases = (
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest,
    )
    test.test_support.run_unittest(*cases)
Walter Dörwald37c47282003-02-26 14:49:41 +0000328
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()