from test import test_support
import unittest
import codecs
import StringIO

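# Helper for the stream tests below: an in-memory byte FIFO that a
# StreamWriter can write into and a StreamReader can read from, so codecs
# can be fed incrementally without a real file.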
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

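# Base class for the encoding-specific tests below; concrete subclasses
# (UTF16Test, UTF8Test, ...) only need to set the `encoding` attribute.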
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = size*u"a" + lineend + u"xxx\n"
                self.assertEqual(
                    getreader(s).readline(keepends=True),
                    size*u"a" + lineend,
                )
                self.assertEqual(
                    getreader(s).readline(keepends=False),
                    size*u"a",
                )

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

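    # Regression tests for bug #1098990: readline() must return every line
    # of the input intact, including the trailing "\r\n", for both short
    # and long lines.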
    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

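# Sanity check: every test case above must be a (unicode, punycode) pair.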
for i in punycode_testcases:
    if len(i) != 2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
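# Each entry is an (input, expected output) pair of UTF-8 encoded byte
# strings; an expected output of None means nameprep() must reject the
# input, and (None, None) marks a test case that is skipped here.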
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

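# StreamReader.readlines() on multi-byte UTF-8 input (Korean text on two lines).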
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

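# Codecs that are expected to round-trip unicode text; BasicUnicodeTest
# below exercises all of them.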
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

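# The bz2 and zlib codecs depend on optional modules, so test them only if
# those modules can be imported.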
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

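# Round-trip a sample string through every codec in all_unicode_encodings,
# first with the stateless encoder/decoder pair and then (for codecs not in
# broken_unicode_with_streams) character by character through a stream
# writer/reader pair.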
class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        BasicUnicodeTest,
        BasicStrTest
    )


if __name__ == "__main__":
    test_main()