blob: 7cd7141b0670a6c342fb10a6b77f10f0ea97d2a9 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO buffer: data is appended with write() and consumed
    from the front with read().
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append the new data behind whatever is still pending.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most
        # ``size`` items are removed from the front and returned.
        if size < 0:
            data, self._buffer = self._buffer, ""
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
25
class ReadTest(unittest.TestCase):
    # Reader tests shared by several codec test classes.  Every method
    # reads self.encoding, which is not defined here; the subclasses in
    # this file provide it as a class attribute.

    def check_partial(self, input, partialresults):
        """
        Feed the encoded form of ``input`` to the codec one byte at a time
        and check the accumulated output after every byte.

        ``partialresults`` lists, per encoded byte, the unicode string that
        must have been decoded so far.  The check is done with a
        StreamReader, with an incremental decoder, with the same decoder
        after reset(), and finally the whole input is run through
        codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly.  Splitting
        # u"\n \r\n \r \u2028" on whitespace yields an empty list (every
        # character in it is whitespace), which silently turned the two
        # loops below into no-ops.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps the first line non-empty: an empty line read
            # with keepends=False would end readalllines() early.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins the lines with "|", so the expected values
        # have to be joined the same way.
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the "xxx" line that follows every test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so the reader only sees what
        # has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n"-terminated lines must come back unchanged, followed
        # by an empty string once the input is used up.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # The two byte strings test_only_one_bom accepts for u"spamspam":
    # one BOM followed by the text, in either byte order.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec_info = codecs.lookup(self.encoding)
        make_reader, make_writer = codec_info[2], codec_info[3]
        # write the text in two pieces through a single stream writer
        sink = StringIO.StringIO()
        out = make_writer(sink)
        for piece in (u"spam", u"spam"):
            out.write(piece)
        encoded = sink.getvalue()
        # there must be exactly one BOM, in either byte order
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # and the data must read back as the original text
        source = StringIO.StringIO(encoded)
        self.assertEqual(make_reader(source).read(), u"spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of "\xff" may be accepted by the reader.
        for count in (4, 8):
            stream = StringIO.StringIO(count*"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        expected = (
            # 4 BOM bytes (byteorder known after the fourth), then the
            # first three bytes of the first character
            7 * [u""] +
            # from here on one more character every four bytes
            4 * [text[:1]] +
            4 * [text[:2]] +
            4 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        # A lone byte is replaced or dropped, depending on the handler;
        # either way one byte is reported as consumed.
        for handler, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode('\x01', handler, True))

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          *decode_args)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = u'\U00010000' * 1024
        samples = (
            ('\xff\xfe\x00\x00', '\x00\x00\x01\x00'),   # little endian
            ('\x00\x00\xfe\xff', '\x00\x01\x00\x00'),   # big endian
        )
        for bom, unit in samples:
            encoded = bom + unit * 1024
            self.assertEqual(expected, codecs.utf_32_decode(encoded)[0])
327
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # No BOM here: the output grows by one character with every
        # fourth byte.
        expected = (
            3 * [u""] +
            4 * [text[:1]] +
            4 * [text[:2]] +
            4 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          *decode_args)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = '\x00\x00\x01\x00' * count
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * count, decoded)
367
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # No BOM here: the output grows by one character with every
        # fourth byte.
        expected = (
            3 * [u""] +
            4 * [text[:1]] +
            4 * [text[:2]] +
            4 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          *decode_args)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = '\x00\x01\x00\x00' * count
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * count, decoded)
407
408
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # The two byte strings test_only_one_bom accepts for u"spamspam":
    # one BOM followed by the text, in either byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        codec_info = codecs.lookup(self.encoding)
        make_reader, make_writer = codec_info[2], codec_info[3]
        # write the text in two pieces through a single stream writer
        sink = StringIO.StringIO()
        out = make_writer(sink)
        for piece in (u"spam", u"spam"):
            out.write(piece)
        encoded = sink.getvalue()
        # there must be exactly one BOM, in either byte order
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # and the data must read back as the original text
        source = StringIO.StringIO(encoded)
        self.assertEqual(make_reader(source).read(), u"spamspam")

    def test_badbom(self):
        # Neither "\xff\xff" nor "\xff\xff\xff\xff" may be accepted by
        # the reader.
        for data in ("\xff\xff", "\xff\xff\xff\xff"):
            stream = StringIO.StringIO(data)
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        expected = (
            # 2 BOM bytes (byteorder known after the second), then the
            # first byte of the first character
            3 * [u""] +
            # from here on one more character every two bytes
            2 * [text[:1]] +
            2 * [text[:2]] +
            2 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        # A lone byte is replaced or dropped, depending on the handler;
        # either way one byte is reported as consumed.
        for handler, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode('\x01', handler, True))

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          *decode_args)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = u'Hello\r\nworld\r\n'
        path = test_support.TESTFN

        encoded = text.encode(self.encoding)
        try:
            with open(path, 'wb') as fp:
                fp.write(encoded)
            with codecs.open(path, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), text)
        finally:
            test_support.unlink(path)
479
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # No BOM here: the output grows by one character with every
        # second byte.
        expected = (
            [u""] +
            2 * [text[:1]] +
            2 * [text[:2]] +
            2 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          *decode_args)
500
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # No BOM here: the output grows by one character with every
        # second byte.
        expected = (
            [u""] +
            2 * [text[:1]] +
            2 * [text[:2]] +
            2 * [text[:3]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        decode_args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          *decode_args)
521
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        text = u"\x00\xff\u07ff\u0800\uffff"
        # One entry per encoded byte: the repeat counts below add up to
        # the 11 bytes the expected list has to cover.
        expected = (
            2 * [text[:1]] +
            2 * [text[:2]] +
            3 * [text[:3]] +
            3 * [text[:4]] +
            [text]
        )
        self.check_partial(text, expected)
542
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        text = u"a+-b"
        # One entry per encoded byte; the second byte produces no new
        # output.
        expected = [
            text[:1],
            text[:1],
            text[:2],
            text[:3],
            text,
        ]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000557
558class UTF16ExTest(unittest.TestCase):
559
560 def test_errors(self):
561 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
562
563 def test_bad_args(self):
564 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
565
566class ReadBufferTest(unittest.TestCase):
567
568 def test_array(self):
569 import array
570 self.assertEqual(
571 codecs.readbuffer_encode(array.array("c", "spam")),
572 ("spam", 4)
573 )
574
575 def test_empty(self):
576 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
577
578 def test_bad_args(self):
579 self.assertRaises(TypeError, codecs.readbuffer_encode)
580 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
581
582class CharBufferTest(unittest.TestCase):
583
584 def test_string(self):
585 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
586
587 def test_empty(self):
588 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
589
590 def test_bad_args(self):
591 self.assertRaises(TypeError, codecs.charbuffer_encode)
592 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
593
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = (
            # nothing is emitted while the first BOM is read and skipped,
            # nor for the first two bytes of the second BOM
            5 * [u""] +
            # second BOM has been read and emitted
            [text[:1]] +
            # "\x00" emitted, then first byte of encoded u"\xff" read
            2 * [text[:2]] +
            # u"\xff" emitted, then first byte of encoded u"\u07ff" read
            2 * [text[:3]] +
            # u"\u07ff" emitted, then two bytes of u"\u0800" read
            3 * [text[:4]] +
            # u"\u0800" emitted, then two bytes of u"\uffff" read
            3 * [text[:5]] +
            [text]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        text = u"spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader in chunks of
        # various size hints (and once without a hint) and check that the
        # collected output equals unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            if sizehint is None:
                read_args = ()
            else:
                read_args = (sizehint,)
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # input starting with the UTF-8 BOM
        self._check_stream(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # the same input without a BOM
        self._check_stream(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )
673
Walter Dörwald8709a422002-09-03 13:53:40 +0000674class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000675 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000676 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000677
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Write one character through an EncodedFile wrapper and close it.
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000686
# Punycode sample strings from RFC 3492, as (unicode text, punycode) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check of the table: report every entry that is not a pair
# (for instance because a comma between two entries is missing).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
794
class PunycodeTest(unittest.TestCase):
    # Runs every (unicode, punycode) pair of punycode_testcases through
    # the "punycode" codec, in both directions.

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            encoded = text.encode("punycode")
            self.assertEqual(encoded.lower(), expected.lower())

    def test_decode(self):
        for expected, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(expected, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000808
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000809class UnicodeInternalTest(unittest.TestCase):
810 def test_bug1251300(self):
811 # Decoding with unicode_internal used to not correctly handle "code
812 # points" above 0x10ffff on UCS-4 builds.
813 if sys.maxunicode > 0xffff:
814 ok = [
815 ("\x00\x10\xff\xff", u"\U0010ffff"),
816 ("\x00\x00\x01\x01", u"\U00000101"),
817 ("", u""),
818 ]
819 not_ok = [
820 "\x7f\xff\xff\xff",
821 "\x80\x00\x00\x00",
822 "\x81\x00\x00\x00",
823 "\x00",
824 "\x00\x00\x00\x00\x00",
825 ]
826 for internal, uni in ok:
827 if sys.byteorder == "little":
828 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000829 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000830 for internal in not_ok:
831 if sys.byteorder == "little":
832 internal = "".join(reversed(internal))
833 self.assertRaises(UnicodeDecodeError, internal.decode,
834 "unicode_internal")
835
836 def test_decode_error_attributes(self):
837 if sys.maxunicode > 0xffff:
838 try:
839 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
840 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000841 self.assertEqual("unicode_internal", ex.encoding)
842 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
843 self.assertEqual(4, ex.start)
844 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000845 else:
846 self.fail()
847
848 def test_decode_callback(self):
849 if sys.maxunicode > 0xffff:
850 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
851 decoder = codecs.getdecoder("unicode_internal")
852 ab = u"ab".encode("unicode_internal")
853 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
854 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000855 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000856
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000857 def test_encode_length(self):
858 # Issue 3739
859 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000860 self.assertEqual(encoder(u"a")[1], 1)
861 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000862
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000863 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000864 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000865
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (orig, prepped) pair of UTF-8 encoded byte strings.
# A prepped value of None marks input that nameprep must reject, and a
# (None, None) pair is a placeholder for a skipped vector.  The position in
# the list (plus one) is used by NameprepTest as the "3.x" number in its
# failure message, so entries must not be removed or reordered.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE', 'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f', 'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0', 'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba', '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa', '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7', '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0', '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90', '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0', '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96', '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96', '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ', ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0', ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80', ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b', ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80', ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f', '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf', ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81', '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes below decode to U+FE76, not the U+FB38 named
    # in the title.
    ('foo\xef\xb9\xb6bar', 'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8', '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1018
1019
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            source = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            wanted = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(source), wanted)
            except Exception as err:
                # Re-raise with the vector number so the failing entry can
                # be found in the table.
                raise test_support.TestFailed("Test 3.%d: %s" % (index+1, str(err)))
1038
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        # A further read after the first one must yield an empty string.
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # iterdecode(): ASCII and non-ASCII names, each with and without a
        # trailing dot (the same four cases as test_builtin_decode).
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # Feeding pieces by hand: a label is only returned once the dot
        # after it has been seen, or when final=True is passed.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # iterencode(): ASCII and non-ASCII names, each with and without a
        # trailing dot (the same four cases as test_builtin_encode).
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # Feeding pieces by hand: a label is only returned once the dot
        # after it has been seen, or when final=True is passed.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001115
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level codecs functions."""

    def test_decode(self):
        latin1_bytes = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(latin1_bytes, 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # no encoding argument given
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        latin1_text = u'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(latin1_text, 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # no encoding argument given
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # NOTE(review): u'\xffff' is U+00FF followed by "ff", not U+FFFF;
        # presumably u'\uffff' was meant -- confirm before changing.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        for bad_args in ((), (42,)):
            self.assertRaises(TypeError, codecs.register, *bad_args)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        for unknown in ("__spam__", " "):
            self.assertRaises(LookupError, codecs.lookup, unknown)

    def test_getencoder(self):
        getter = codecs.getencoder
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")

    def test_getdecoder(self):
        getter = codecs.getdecoder
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")

    def test_getreader(self):
        getter = codecs.getreader
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")

    def test_getwriter(self):
        getter = codecs.getwriter
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001157
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001167
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile recodes data on both read and write."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        sink = StringIO.StringIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoder.write('\xc3\xbc')
        self.assertEqual(sink.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001179
class Str2StrTest(unittest.TestCase):
    """StreamReader over a str-to-str codec must hand back str objects."""

    def _base64_reader(self):
        # StreamReader over the base64 encoding of the single byte "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        sout = self._base64_reader().read()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

    def test_readline(self):
        sout = self._base64_reader().readline()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001195
# Encodings exercised by BasicUnicodeTest.  Optional codecs are appended
# further down when their support is available.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested where the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Copied *before* bz2_codec/zlib_codec are appended below, so those two are
# excluded from the stream tests only, not from the incremental ones.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are tested only when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1346
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        # Round-trip a short string through each interface of each codec.
        s = u"abc123" # all codecs should be able to encode these
        report = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The name reported by the registry must match the list entry
            # once "_" and "-" are treated alike.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encoder and decoder
            encoded, consumed = codecs.getencoder(encoding)(s)
            self.assertEqual(consumed, len(s), report % (consumed, len(s), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, s, report % (decoded, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, one item at a time
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, report % (decodedresult, s, encoding))

            if encoding in broken_incremental_coders:
                # nothing below applies to this codec
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = ""
                for c in s:
                    encodedresult += encoder.encode(c)
                encodedresult += encoder.encode(u"", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += decoder.decode(c)
                decodedresult += decoder.decode("", True)
                self.assertEqual(decodedresult, s, report % (decodedresult, s, encoding))

                # check C API
                encodedresult = ""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode(u"", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(c)
                decodedresult += cdecoder.decode("", True)
                self.assertEqual(decodedresult, s, report % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, report % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, report % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, report % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            stream = StringIO.StringIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-string argument check
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1469
class BasicStrTest(unittest.TestCase):
    """Round-trip checks for the codecs in all_string_encodings."""

    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(s)
            self.assertEqual(consumed, len(s))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, s, "%r != %r (encoding=%r)" % (decoded, s, encoding))
1478
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode with a unicode string as the mapping."""

    def test_decode_with_string_map(self):
        data = "\x00\x01\x02"
        # (errors, mapping, expected result); byte 2 is either missing from
        # the mapping or mapped to u"\ufffe" in the non-strict cases.
        cases = (
            ("strict", u"abc", (u"abc", 3)),
            ("replace", u"ab", (u"ab\ufffd", 3)),
            ("replace", u"ab\ufffe", (u"ab\ufffd", 3)),
            ("ignore", u"ab", (u"ab", 3)),
            ("ignore", u"ab\ufffe", (u"ab", 3)),
        )
        for errors, mapping, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(data, errors, mapping),
                expected
            )

        # With an empty mapping every byte is dropped but still counted.
        allbytes = "".join(map(chr, xrange(256)))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
1511
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers must work in a with statement."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            recoded = ef.read()
            self.assertEqual(recoded, "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(raw, utf8.streamreader,
                                       utf8.streamwriter, 'strict') as srw:
            decoded = srw.read()
            self.assertEqual(decoded, u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001524
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001525
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() files across seek() calls."""

    def test_seek0(self):
        data = u"1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every scenario below re-creates TESTFN; remove it once the test
        # is over, whether it passed or failed.
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00001580
Victor Stinner262be5e2010-05-22 02:11:07 +00001581
def test_main():
    """Run every test case class of this module through test_support."""
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)


if __name__ == "__main__":
    test_main()