from test import test_support
import unittest
import codecs
import StringIO

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

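# Illustrative usage of the Queue helper above (a sketch, not part of the
# tests themselves): data written in one call can be read back in
# arbitrarily sized chunks.
#
#     q = Queue()
#     q.write("spam")
#     q.read(2)   # -> 'sp'
#     q.read()    # -> 'am'
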
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = size*u"a" + lineend + u"xxx\n"
                self.assertEqual(
                    getreader(s).readline(keepends=True),
                    size*u"a" + lineend,
                )
                self.assertEqual(
                    getreader(s).readline(keepends=False),
                    size*u"a",
                )

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

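    # u"spamspam" encoded as UTF-16-LE and UTF-16-BE respectively, each
    # starting with a single BOM (codecs.BOM_UTF16_LE / codecs.BOM_UTF16_BE)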
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

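# Illustrative usage (a sketch, not part of the RFC 3492 table above): the
# codec is reached through the ordinary string interface, e.g.
#     u"b\xfccher".encode("punycode")   # -> "bcher-kva"
#     "bcher-kva".decode("punycode")    # -> u"b\xfccher"
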
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


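# Illustrative usage (a sketch, not part of the test vectors above): the
# nameprep() function from encodings.idna applies the RFC 3491 mapping, e.g.
#     from encodings.idna import nameprep
#     nameprep(u"CAFE")   # -> u"cafe" (case folding, vector 3.2)
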
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
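        # the UTF-8 encoding of u'\ud55c\n\uae00' (two Hangul syllables
        # separated by a newline)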
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#   "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
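# (roughly: these codecs cannot encode/decode data fed to them in arbitrary
# chunks, so the chunked stream round-trip in BasicUnicodeTest skips them)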

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        BasicUnicodeTest,
        BasicStrTest
    )


if __name__ == "__main__":
    test_main()