blob: f7f27cc7d9f95c75d79b112559f1be5f9fc2094b [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    Minimal in-memory FIFO: data is appended with write() and consumed
    from the front with read().
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append the new data behind whatever is still pending.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise hand out the
        # first ``size`` items (fewer if not enough are pending) and keep
        # the remainder for later reads.
        pending = self._buffer
        if size < 0:
            self._buffer = ""
            return pending
        self._buffer = pending[size:]
        return pending[:size]
25
class ReadTest(unittest.TestCase):
    """
    Reader tests shared by the codec test classes in this file.

    Every test reads ``self.encoding``; this class does not define it, so
    it has to be provided by the subclass.
    """
    def check_partial(self, input, partialresults):
        # Get a StreamReader for the encoding and feed the encoded version
        # of input to the reader one byte at a time. After each byte, read
        # everything available from the StreamReader and check that the
        # accumulated result equals the matching entry of partialresults.
        # (zip() stops at the shorter sequence, so partialresults should
        # have one entry per encoded byte.)
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # Do the check again with an incremental decoder: first with a
        # freshly created decoder, then a second time after reset() to
        # check that the reset method works properly.
        d = codecs.getincrementaldecoder(self.encoding)()
        for after_reset in (False, True):
            if after_reset:
                d.reset()
            result = u""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(c)
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode("", True), u"")
            self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty result and
            # return them joined with "|" so that the line boundaries
            # stay visible in the comparison.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: each of them is
        # whitespace, so u"\n \r\n \r \u2028".split() returns an empty
        # list and loops over it would silently test nothing.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline()).
        # Every line gets at least 200 characters: an empty first line
        # would make readalllines(..., keepends=False) stop immediately.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # s alternates between the "a" line and "xxx\n"
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterate over a reader for a multi-line text and check that each
        # line comes back unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so the reader only ever sees
        # what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # each line must come back intact, followed by an empty read
        for expected in (s1, s2, s3, u""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # each line must come back intact, followed by an empty read
        for expected in (s1, s2, s3, s4, s5, u""):
            self.assertEqual(reader.readline(), expected)
246
class UTF32Test(ReadTest):
    """
    Tests for the "utf-32" codec (byte order taken from / written as BOM).
    """
    encoding = "utf-32"

    # u"spamspam" with exactly one leading BOM, in either byte order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        reader, writer = info[2], info[3]
        # encode some stream, using two separate writes
        s = StringIO.StringIO()
        f = writer(s)
        for chunk in (u"spam", u"spam"):
            f.write(chunk)
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d in (self.spamle, self.spambe))
        # try to read it back
        f = reader(StringIO.StringIO(d))
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # 0xff bytes only: once as a single code unit, once as two
        for count in (4, 8):
            s = StringIO.StringIO(count*"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a single stray byte with the non-strict error handlers
        replaced = codecs.utf_32_decode('\x01', 'replace', True)
        self.assertEqual((u'\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode('\x01', 'ignore', True)
        self.assertEqual((u'', 1), ignored)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = u'\U00010000' * 1024
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(expected,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(expected,
                         codecs.utf_32_decode(encoded_be)[0])
327
class UTF32LETest(ReadTest):
    """
    Tests for the "utf-32-le" codec.
    """
    encoding = "utf-32-le"

    def test_partial(self):
        # one expected value per encoded byte; a character only shows up
        # once its fourth byte has been fed
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
367
class UTF32BETest(ReadTest):
    """
    Tests for the "utf-32-be" codec.
    """
    encoding = "utf-32-be"

    def test_partial(self):
        # one expected value per encoded byte; a character only shows up
        # once its fourth byte has been fed
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
407
408
class UTF16Test(ReadTest):
    """
    Tests for the "utf-16" codec (byte order taken from / written as BOM).
    """
    encoding = "utf-16"

    # u"spamspam" with exactly one leading BOM, in either byte order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        reader, writer = info[2], info[3]
        # encode some stream, using two separate writes
        s = StringIO.StringIO()
        f = writer(s)
        for chunk in (u"spam", u"spam"):
            f.write(chunk)
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d in (self.spamle, self.spambe))
        # try to read it back
        f = reader(StringIO.StringIO(d))
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # 0xff bytes only: once as a single code unit, once as two
        for data in ("\xff\xff", "\xff\xff\xff\xff"):
            s = StringIO.StringIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a single stray byte with the non-strict error handlers
        replaced = codecs.utf_16_decode('\x01', 'replace', True)
        self.assertEqual((u'\ufffd', 1), replaced)
        ignored = codecs.utf_16_decode('\x01', 'ignore', True)
        self.assertEqual((u'', 1), ignored)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # remove the scratch file again when the test is over
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000477
class UTF16LETest(ReadTest):
    """
    Tests for the "utf-16-le" codec.
    """
    encoding = "utf-16-le"

    def test_partial(self):
        # one expected value per encoded byte; a character only shows up
        # once its second byte has been fed
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          "\xff", "strict", True)
498
class UTF16BETest(ReadTest):
    """
    Tests for the "utf-16-be" codec.
    """
    encoding = "utf-16-be"

    def test_partial(self):
        # one expected value per encoded byte; a character only shows up
        # once its second byte has been fed
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          "\xff", "strict", True)
519
class UTF8Test(ReadTest):
    """
    Tests for the "utf-8" codec.
    """
    encoding = "utf-8"

    def test_partial(self):
        # one expected value per encoded byte; the input characters take
        # 1, 2, 2, 3 and 3 bytes respectively
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
540
class UTF7Test(ReadTest):
    """
    Tests for the "utf-7" codec.
    """
    encoding = "utf-7"

    def test_partial(self):
        # one expected value per encoded byte of u"a+-b"
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000555
556class UTF16ExTest(unittest.TestCase):
557
558 def test_errors(self):
559 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
560
561 def test_bad_args(self):
562 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
563
564class ReadBufferTest(unittest.TestCase):
565
566 def test_array(self):
567 import array
568 self.assertEqual(
569 codecs.readbuffer_encode(array.array("c", "spam")),
570 ("spam", 4)
571 )
572
573 def test_empty(self):
574 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
575
576 def test_bad_args(self):
577 self.assertRaises(TypeError, codecs.readbuffer_encode)
578 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
579
580class CharBufferTest(unittest.TestCase):
581
582 def test_string(self):
583 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
584
585 def test_empty(self):
586 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
587
588 def test_bad_args(self):
589 self.assertRaises(TypeError, codecs.charbuffer_encode)
590 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
591
class UTF8SigTest(ReadTest):
    """
    Tests for the "utf-8-sig" codec.
    """
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        s = u"spam"
        d = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # size hints (None means a plain read()) and check that the
        # collected output equals unistring every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input that starts with the UTF-8 BOM
        self._check_stream_read(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # the same input without a BOM
        self._check_stream_read(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )
671
Walter Dörwald8709a422002-09-03 13:53:40 +0000672class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000673 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000674 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000675
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000676class RecodingTest(unittest.TestCase):
677 def test_recoding(self):
678 f = StringIO.StringIO()
679 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
680 f2.write(u"a")
681 f2.close()
682 # Python used to crash on this at exit because of a refcount
683 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000684
# From RFC 3492
# Each entry is a pair (unicode string, punycode encoded string).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check of the table above: show every entry that is not a pair
# (for example because a comma between two string literals is missing).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
792
class PunycodeTest(unittest.TestCase):
    """
    Runs the punycode codec over every pair in punycode_testcases.
    """
    def test_encode(self):
        for (uni, puny) in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            encoded = uni.encode("punycode")
            self.assertEqual(encoded.lower(), puny.lower())

    def test_decode(self):
        for (uni, puny) in punycode_testcases:
            decoded = puny.decode("punycode")
            self.assertEqual(uni, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000806
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000807class UnicodeInternalTest(unittest.TestCase):
808 def test_bug1251300(self):
809 # Decoding with unicode_internal used to not correctly handle "code
810 # points" above 0x10ffff on UCS-4 builds.
811 if sys.maxunicode > 0xffff:
812 ok = [
813 ("\x00\x10\xff\xff", u"\U0010ffff"),
814 ("\x00\x00\x01\x01", u"\U00000101"),
815 ("", u""),
816 ]
817 not_ok = [
818 "\x7f\xff\xff\xff",
819 "\x80\x00\x00\x00",
820 "\x81\x00\x00\x00",
821 "\x00",
822 "\x00\x00\x00\x00\x00",
823 ]
824 for internal, uni in ok:
825 if sys.byteorder == "little":
826 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000827 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000828 for internal in not_ok:
829 if sys.byteorder == "little":
830 internal = "".join(reversed(internal))
831 self.assertRaises(UnicodeDecodeError, internal.decode,
832 "unicode_internal")
833
834 def test_decode_error_attributes(self):
835 if sys.maxunicode > 0xffff:
836 try:
837 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
838 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000839 self.assertEqual("unicode_internal", ex.encoding)
840 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
841 self.assertEqual(4, ex.start)
842 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000843 else:
844 self.fail()
845
846 def test_decode_callback(self):
847 if sys.maxunicode > 0xffff:
848 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
849 decoder = codecs.getdecoder("unicode_internal")
850 ab = u"ab".encode("unicode_internal")
851 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
852 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000853 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000854
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000855 def test_encode_length(self):
856 # Issue 3739
857 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000858 self.assertEqual(encoder(u"a")[1], 1)
859 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000860
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000861 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000862 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000863
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# consumed by NameprepTest.test_nameprep:
#   * expected is None  -> nameprep() must raise UnicodeError for the input
#   * (None, None)      -> the vector is skipped; the placeholder keeps the
#                          list position equal to the "3.N" number minus one
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1016
1017
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        # Run every vector in nameprep_tests through encodings.idna.nameprep.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            # Both outcomes are checked inside the same try block so that
            # any failure is reported together with its vector number.
            try:
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = unicode(prepped, "utf-8")
                    self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1036
class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been consumed, a further read() must
        # return an empty unicode string.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # iterdecode() over a str feeds the decoder one character at a time.
        # Cover ASCII and punycode names, each with and without a trailing
        # dot (same four cases as test_builtin_decode).
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # Feed the decoder by hand: nothing is produced for a label until
        # its terminating dot (or final=True) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Mirror of test_incremental_decode: ASCII and non-ASCII names, each
        # with and without a trailing dot.
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # Feed the encoder by hand: a label is only emitted once its
        # terminating dot (or final=True) has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001113
class CodecsModuleTest(unittest.TestCase):
    # Smoke tests for the module-level helpers of codecs: valid calls,
    # missing arguments (TypeError) and unknown codec names (LookupError).

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # Encoding argument omitted.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # Encoding argument omitted.
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # A single non-ASCII code point; u'\xffff' would be u'\xff'
        # followed by the two letters "ff".
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\uffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # A search function has to be callable.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001155
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # UTF-8 bytes for U+D55C, a newline, and U+AE00.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split on the newline and keep it on the first line.
        expected_lines = [u'\ud55c\n', u'\uae00']
        decoding_stream = self.reader(self.stream)
        self.assertEqual(decoding_stream.readlines(), expected_lines)
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001165
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the wrapper hands out
        # the same text as UTF-16-LE.
        utf8_source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 goes into the wrapper, Latin-1 lands in the file.
        latin1_sink = StringIO.StringIO()
        recoding_writer = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin1')
        recoding_writer.write('\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001177
class Str2StrTest(unittest.TestCase):

    def _base64_reader(self):
        # Stream reader over the base64 encoding of the single byte "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        # read() on a str-to-str codec must yield a str.
        decoded = self._base64_reader().read()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)

    def test_readline(self):
        # readline() on a str-to-str codec must yield a str as well.
        decoded = self._base64_reader().readline()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001193
# Codec names iterated by BasicUnicodeTest; test_basics expects each of
# them to round-trip u"abc123".
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# mbcs is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1301
# The following encodings work only with str, not unicode
# (iterated by BasicStrTest.test_basics)
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"
1312
# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# A copy, not an alias: entries appended to broken_unicode_with_streams
# after this line (bz2_codec, zlib_codec) do not end up in this list.
broken_incremental_coders = broken_unicode_with_streams[:]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001321
# The following encodings only support "strict" mode
# (BasicUnicodeTest.test_basics skips its errors="ignore" incremental
# check for them)
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]
1328
# bz2 and zlib may be missing from a build; their codecs are only added to
# the tested encodings (and to the stream blacklist) when the module can be
# imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1344
class BasicUnicodeTest(unittest.TestCase):
    # Generic checks run against every codec named in all_unicode_encodings.

    def test_basics(self):
        # Round-trip a short ASCII text through each codec using the
        # stateless functions, the stream reader/writer, the incremental
        # coders (Python and C API) and iterencode()/iterdecode().
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # Adjust the registered name for the known cases where it
            # differs from the spelling used in all_unicode_encodings.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless encode: the reported length must equal the input length.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # (written and read back one item at a time through a Queue)
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still buffers
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # After seek(0, 0) a stream reader must return the first line again.
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all except
        # idna and punycode are also checked to reject an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (the assertion below is trivially true by design)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1467
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        # Round-trip a byte string through every str-only codec.
        text = "abc123"
        for codec_name in all_string_encodings:
            encoded, consumed = codecs.getencoder(codec_name)(text)
            self.assertEqual(consumed, len(text))
            decoded = codecs.getdecoder(codec_name)(encoded)[0]
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, codec_name))
1476
class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        # Each case decodes the same three bytes with a unicode string as
        # the mapping: (error handler, mapping, expected text).
        cases = (
            ("strict", u"abc", u"abc"),
            ("replace", u"ab", u"ab\ufffd"),
            ("replace", u"ab\ufffe", u"ab\ufffd"),
            ("ignore", u"ab", u"ab"),
            ("ignore", u"ab\ufffe", u"ab"),
        )
        for errors, mapping, expected_text in cases:
            decoded = codecs.charmap_decode("\x00\x01\x02", errors, mapping)
            self.assertEqual(decoded, (expected_text, 3))

        # With an empty mapping and "ignore", every possible byte is
        # dropped but still counted as consumed.
        allbytes = "".join(chr(i) for i in xrange(256))
        decoded = codecs.charmap_decode(allbytes, "ignore", u"")
        self.assertEqual(decoded, (u"", len(allbytes)))
1509
class WithStmtTest(unittest.TestCase):
    # The codecs file wrappers must be usable as context managers.

    def test_encodedfile(self):
        utf8_file = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(utf8_file, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, "\xfc")

    def test_streamreaderwriter(self):
        utf8_file = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        reader_class = utf8.streamreader
        writer_class = utf8.streamwriter
        with codecs.StreamReaderWriter(utf8_file, reader_class,
                                       writer_class, 'strict') as wrapped:
            content = wrapped.read()
            self.assertEqual(content, u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001522
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001523
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Every scenario writes through codecs.open() in 'w+' mode and reads
        # the text back; a duplicated or missing BOM would alter the result.
        text = u"1234567890"
        doubled = text * 2
        first_char = text[0]
        path = test_support.TESTFN
        codec_names = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(test_support.unlink, path)
        for codec_name in codec_names:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=codec_name) as fobj:
                fobj.write(text)
                fobj.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), doubled)
                fobj.seek(0)
                self.assertEqual(fobj.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=codec_name) as fobj:
                fobj.write(first_char)
                self.assertNotEqual(fobj.tell(), 0)
                fobj.seek(0)
                fobj.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=codec_name) as fobj:
                writer = fobj.writer
                writer.write(first_char)
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=codec_name) as fobj:
                fobj.write(text)
                fobj.seek(fobj.tell())
                fobj.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=codec_name) as fobj:
                writer = fobj.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), doubled)
Victor Stinner7df55da2010-05-22 13:37:56 +00001579
Victor Stinner262be5e2010-05-22 02:11:07 +00001580
def test_main():
    # Entry point used by the regression test runner: hand every test case
    # class of this module to test_support.run_unittest(), in this order.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001611
1612
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()