blob: 99ed82d63b39d2c2724c546099aa9e26f1c7ebd3 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
4import StringIO
5
class Queue(object):
    """
    FIFO byte queue: bytes written at one end come back out, in order,
    at the other end.
    """
    def __init__(self):
        # Everything written but not yet read, oldest bytes first.
        self._buffer = ""

    def write(self, chars):
        """Append *chars* to the pending data."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return up to *size* bytes (all pending data if
        *size* is negative)."""
        if size < 0:
            pending, self._buffer = self._buffer, ""
            return pending
        pending = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return pending
25
class ReadTest(unittest.TestCase):
    """Base class for codec stream reader tests.

    Subclasses must set an ``encoding`` class attribute; the tests then
    exercise incremental decoding and readline() for that codec.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # BUGFIX: this used to be u"\3042", an octal escape that parses
            # as u"\xc4" + "2" (two characters). The intent is the single
            # non-ASCII character U+3042 (HIRAGANA LETTER A).
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = size*u"a" + lineend + u"xxx\n"
                self.assertEqual(
                    getreader(s).readline(keepends=True),
                    size*u"a" + lineend,
                )
                self.assertEqual(
                    getreader(s).readline(keepends=False),
                    size*u"a",
                )

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
110
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spam" written twice with exactly one BOM, in both byte orders.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode a stream in two separate writes
        out = StringIO.StringIO()
        stream = writer(out)
        stream.write(u"spam")
        stream.write(u"spam")
        encoded = out.getvalue()
        # exactly one BOM must have been emitted, in either byte order
        self.assert_(encoded == self.spamle or encoded == self.spambe)
        # the result must read back as the original text
        stream = reader(StringIO.StringIO(encoded))
        self.assertEquals(stream.read(), u"spamspam")

    def test_partial(self):
        # A new character appears only once both of its bytes are in.
        self.check_partial(
            u"\u0000\u00ff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\u0000",
                u"\u0000",
                u"\u0000\u00ff",
                u"\u0000\u00ff",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100\uffff",
            ]
        )
148
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM here: every second byte completes one more character.
        self.check_partial(
            u"\u0000\u00ff\u0100\uffff",
            [
                u"",
                u"\u0000",
                u"\u0000",
                u"\u0000\u00ff",
                u"\u0000\u00ff",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100\uffff",
            ]
        )
166
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM here: every second byte completes one more character.
        self.check_partial(
            u"\u0000\u00ff\u0100\uffff",
            [
                u"",
                u"\u0000",
                u"\u0000",
                u"\u0000\u00ff",
                u"\u0000\u00ff",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100",
                u"\u0000\u00ff\u0100\uffff",
            ]
        )
184
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # Each character becomes available only once its final UTF-8 byte
        # (1, 2 or 3 bytes per character in this input) has been fed in.
        self.check_partial(
            u"\u0000\u00ff\u07ff\u0800\uffff",
            [
                u"\u0000",
                u"\u0000",
                u"\u0000\u00ff",
                u"\u0000\u00ff",
                u"\u0000\u00ff\u07ff",
                u"\u0000\u00ff\u07ff",
                u"\u0000\u00ff\u07ff",
                u"\u0000\u00ff\u07ff\u0800",
                u"\u0000\u00ff\u07ff\u0800",
                u"\u0000\u00ff\u07ff\u0800",
                u"\u0000\u00ff\u07ff\u0800\uffff",
            ]
        )
205
class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        # Decoding an empty string yields an empty result, 0 bytes consumed.
        decoded, consumed = codecs.escape_decode("")
        self.assertEquals(decoded, "")
        self.assertEquals(consumed, 0)
209
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        underlying = StringIO.StringIO()
        recoder = codecs.EncodedFile(underlying, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000218
# From RFC 3492 (sample strings, section 7.1). Each entry is a
# (unicode text, punycode encoding) pair, consumed by PunycodeTest below.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
323
# Sanity check on the table above: every entry must be a 2-tuple.
# Offenders are printed (a debugging aid) rather than raising.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
327
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for text, expected in punycode_testcases:
            # Compare case-insensitively: some of the RFC samples use upper
            # case while our encoder only emits lower case, and lowering
            # just the expected string would not be enough because some of
            # the *input* characters are upper case too.
            self.assertEquals(text.encode("punycode").lower(), expected.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEquals(text, encoded.decode("punycode"))
341
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both UTF-8 encoded byte strings.
# An expected output of None means nameprep() must reject the input;
# (None, None) marks a vector that is skipped entirely.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
494
495
496class NameprepTest(unittest.TestCase):
497 def test_nameprep(self):
498 from encodings.idna import nameprep
499 for pos, (orig, prepped) in enumerate(nameprep_tests):
500 if orig is None:
501 # Skipped
502 continue
503 # The Unicode strings are given in UTF-8
504 orig = unicode(orig, "utf-8")
505 if prepped is None:
506 # Input contains prohibited characters
507 self.assertRaises(UnicodeError, nameprep, orig)
508 else:
509 prepped = unicode(prepped, "utf-8")
510 try:
511 self.assertEquals(nameprep(orig), prepped)
512 except Exception,e:
513 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
514
class CodecTest(unittest.TestCase):
    def test_builtin(self):
        # An all-ASCII name must pass through the builtin "idna" codec
        # unchanged (no ACE/"xn--" prefix is needed).
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
518
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000519class CodecsModuleTest(unittest.TestCase):
520
521 def test_decode(self):
522 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
523 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000524 self.assertRaises(TypeError, codecs.decode)
525 self.assertEquals(codecs.decode('abc'), u'abc')
526 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
527
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000528 def test_encode(self):
529 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
530 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000531 self.assertRaises(TypeError, codecs.encode)
532 self.assertEquals(codecs.encode(u'abc'), 'abc')
533 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
534
535 def test_register(self):
536 self.assertRaises(TypeError, codecs.register)
537
538 def test_lookup(self):
539 self.assertRaises(TypeError, codecs.lookup)
540 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000541
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # A UTF-8 stream holding two Hangul syllables split by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        reader = self.reader(self.stream)
        self.assertEquals(reader.readlines(), [u'\ud55c\n', u'\uae00'])
551
# Encodings expected to round-trip unicode; exercised by BasicUnicodeTest.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# mbcs is only available on Windows builds
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

# The bz2 and zlib codecs depend on optional extension modules; only
# test them when the modules are actually available.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
692
class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # one-shot encode/decode round trip
            (encoded, length) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(length, len(s), "%r != %r (encoding=%r)" % (length, len(s), encoding))
            (decoded, length) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, s, "%r != %r (encoding=%r)" % (decoded, s, encoding))

            if encoding in broken_unicode_with_streams:
                continue
            # stream writer: feed one character at a time, collect the bytes
            q = Queue()
            writer = codecs.getwriter(encoding)(q)
            encodedresult = ""
            for c in s:
                writer.write(c)
                encodedresult += q.read()
            # stream reader: feed one byte at a time, collect the characters
            q = Queue()
            reader = codecs.getreader(encoding)(q)
            decodedresult = u""
            for c in encodedresult:
                q.write(c)
                decodedresult += reader.read()
            self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
718
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        data = "abc123"
        for encoding in all_string_encodings:
            # encode, then decode, and verify a lossless round trip
            (encoded, length) = codecs.getencoder(encoding)(data)
            self.assertEqual(length, len(data))
            (decoded, length) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, data, "%r != %r (encoding=%r)" % (decoded, data, encoding))
727
def test_main():
    # Run every TestCase defined in this module; keep this list in sync
    # when adding a new test class.
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        BasicUnicodeTest,
        BasicStrTest
    )
Fred Drake2e2be372001-09-20 21:33:42 +0000744
745
# Allow running this test file directly.
if __name__ == "__main__":
    test_main()