# Source blob: d434f837544754a92a59f7cdfe2fd4513cd26711
import codecs
import locale
import StringIO
import sys
import unittest

import _testcapi
from test import test_support

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    A minimal in-memory FIFO: write() appends to an internal string
    buffer and read() consumes from the front of that buffer.
    """
    def __init__(self):
        # Data that has been written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append chars to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to size items from the front of the buffer.

        A negative size (the default) drains the whole buffer.  An empty
        string is returned when nothing is buffered.
        """
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    """Shared StreamReader / incremental decoder tests.

    Every method reads ``self.encoding``, which this class does not define;
    concrete subclasses set it to the name of the codec under test.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line until readline() returns an empty
            # string and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out explicitly: all of them count as
        # whitespace for unicode.split(), so splitting a space-separated
        # string of line ends yields an empty list and the loops below
        # would silently never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; readalllines() stops at
            # the first empty result, which an empty line produces when
            # keepends is false.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were written.
        # NOTE(review): leading whitespace inside these literals looks
        # collapsed to a single space; restore the original text if the
        # exact byte offsets matter for this regression test.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is still being written to: a trailing
        # "\r" is returned as a complete line and a "\n" that arrives later
        # shows up as a separate (empty) line.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF32Test(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-32"."""
    encoding = "utf-32"

    # u"spamspam" preceded by a single BOM, in little-endian and big-endian
    # byte order respectively.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated final unit goes through the error handler.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-32-le"."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once its fourth byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-32-be"."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once its fourth byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-16"."""
    encoding = "utf-16"

    # u"spamspam" preceded by a single BOM, in little-endian and big-endian
    # byte order respectively.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated final unit goes through the error handler.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once its second byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    """ReadTest suite plus codec-specific checks for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once its second byte has been fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    """ReadTest suite for "utf-8"."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte: a character only appears
        # once the last byte of its encoding has been fed in.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    """ReadTest suite for "utf-7"."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per encoded byte of u"a+-b".
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

557class UTF16ExTest(unittest.TestCase):
558
559 def test_errors(self):
560 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
561
562 def test_bad_args(self):
563 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
564
565class ReadBufferTest(unittest.TestCase):
566
567 def test_array(self):
568 import array
569 self.assertEqual(
570 codecs.readbuffer_encode(array.array("c", "spam")),
571 ("spam", 4)
572 )
573
574 def test_empty(self):
575 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
576
577 def test_bad_args(self):
578 self.assertRaises(TypeError, codecs.readbuffer_encode)
579 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
580
581class CharBufferTest(unittest.TestCase):
582
583 def test_string(self):
584 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
585
586 def test_empty(self):
587 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
588
589 def test_bad_args(self):
590 self.assertRaises(TypeError, codecs.charbuffer_encode)
591 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
592
class UTF8SigTest(ReadTest):
    """ReadTest suite plus BOM handling checks for "utf-8-sig"."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_chunked_read(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader once without a
        # size argument and once for each of a range of read() sizes, and
        # check that the concatenated chunks equal unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_chunked_read(bytestring, unistring)

    def test_stream_bare(self):
        # Same data without a leading BOM.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_chunked_read(bytestring, unistring)

Walter Dörwald8709a422002-09-03 13:53:40 +0000673class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000674 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000675 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000676
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000677class RecodingTest(unittest.TestCase):
678 def test_recoding(self):
679 f = StringIO.StringIO()
680 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
681 f2.write(u"a")
682 f2.close()
683 # Python used to crash on this at exit because of a refcount
684 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000685
# From RFC 3492
# Each entry is a (unicode string, expected punycode encoding) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: report any entry of punycode_testcases
# that is not a 2-tuple.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))

class PunycodeTest(unittest.TestCase):
    """Round-trip the punycode_testcases pairs through the "punycode" codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

Walter Dörwalda47d1c02005-08-30 10:23:14 +0000808class UnicodeInternalTest(unittest.TestCase):
809 def test_bug1251300(self):
810 # Decoding with unicode_internal used to not correctly handle "code
811 # points" above 0x10ffff on UCS-4 builds.
812 if sys.maxunicode > 0xffff:
813 ok = [
814 ("\x00\x10\xff\xff", u"\U0010ffff"),
815 ("\x00\x00\x01\x01", u"\U00000101"),
816 ("", u""),
817 ]
818 not_ok = [
819 "\x7f\xff\xff\xff",
820 "\x80\x00\x00\x00",
821 "\x81\x00\x00\x00",
822 "\x00",
823 "\x00\x00\x00\x00\x00",
824 ]
825 for internal, uni in ok:
826 if sys.byteorder == "little":
827 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000828 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000829 for internal in not_ok:
830 if sys.byteorder == "little":
831 internal = "".join(reversed(internal))
832 self.assertRaises(UnicodeDecodeError, internal.decode,
833 "unicode_internal")
834
835 def test_decode_error_attributes(self):
836 if sys.maxunicode > 0xffff:
837 try:
838 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
839 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000840 self.assertEqual("unicode_internal", ex.encoding)
841 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
842 self.assertEqual(4, ex.start)
843 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000844 else:
845 self.fail()
846
847 def test_decode_callback(self):
848 if sys.maxunicode > 0xffff:
849 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
850 decoder = codecs.getdecoder("unicode_internal")
851 ab = u"ab".encode("unicode_internal")
852 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
853 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000854 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000855
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000856 def test_encode_length(self):
857 # Issue 3739
858 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000859 self.assertEqual(encoder(u"a")[1], 1)
860 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000861
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000862 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000863 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000864
# Nameprep test vectors, taken from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
#   * expected is None  -> nameprep must reject the input.
#   * (None, None)      -> placeholder for a skipped vector; it keeps the
#                          list position in step with the "3.N" numbering.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1017
1018
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        # Run every vector in nameprep_tests through encodings.idna.nameprep.
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Placeholder for a skipped vector.
                continue
            # The vectors are stored as UTF-8 byte strings.
            raw = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters and must be rejected.
                self.assertRaises(UnicodeError, nameprep, raw)
                continue
            expected = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(raw), expected)
            except Exception as exc:
                # Re-raise with the "3.N" vector number so the failing
                # entry can be found in the table.
                raise test_support.TestFailed("Test 3.%d: %s" % (index+1, str(exc)))
1037
Walter Dörwald78a0be62006-04-14 18:25:39 +00001038class IDNACodecTest(unittest.TestCase):
1039 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001040 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1041 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1042 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1043 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001044
1045 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001046 self.assertEqual(u"python.org".encode("idna"), "python.org")
1047 self.assertEqual("python.org.".encode("idna"), "python.org.")
1048 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1049 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001050
Martin v. Löwis8b595142005-08-25 11:03:38 +00001051 def test_stream(self):
1052 import StringIO
1053 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1054 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001055 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001056
Walter Dörwald78a0be62006-04-14 18:25:39 +00001057 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001058 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001059 "".join(codecs.iterdecode("python.org", "idna")),
1060 u"python.org"
1061 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001062 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001063 "".join(codecs.iterdecode("python.org.", "idna")),
1064 u"python.org."
1065 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001066 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001067 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1068 u"pyth\xf6n.org."
1069 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001070 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001071 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1072 u"pyth\xf6n.org."
1073 )
1074
1075 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001076 self.assertEqual(decoder.decode("xn--xam", ), u"")
1077 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1078 self.assertEqual(decoder.decode(u"rg"), u"")
1079 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001080
1081 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001082 self.assertEqual(decoder.decode("xn--xam", ), u"")
1083 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1084 self.assertEqual(decoder.decode("rg."), u"org.")
1085 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001086
1087 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001088 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001089 "".join(codecs.iterencode(u"python.org", "idna")),
1090 "python.org"
1091 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001092 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001093 "".join(codecs.iterencode(u"python.org.", "idna")),
1094 "python.org."
1095 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001096 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001097 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1098 "xn--pythn-mua.org."
1099 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001100 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001101 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1102 "xn--pythn-mua.org."
1103 )
1104
1105 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001106 self.assertEqual(encoder.encode(u"\xe4x"), "")
1107 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1108 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001109
1110 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001111 self.assertEqual(encoder.encode(u"\xe4x"), "")
1112 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1113 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001114
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001115class CodecsModuleTest(unittest.TestCase):
1116
1117 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001118 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001119 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001120 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001121 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001122 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1123
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001124 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001125 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001126 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001127 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001128 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001129 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001130 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1131
1132 def test_register(self):
1133 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001134 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001135
1136 def test_lookup(self):
1137 self.assertRaises(TypeError, codecs.lookup)
1138 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001139 self.assertRaises(LookupError, codecs.lookup, " ")
1140
1141 def test_getencoder(self):
1142 self.assertRaises(TypeError, codecs.getencoder)
1143 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1144
1145 def test_getdecoder(self):
1146 self.assertRaises(TypeError, codecs.getdecoder)
1147 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1148
1149 def test_getreader(self):
1150 self.assertRaises(TypeError, codecs.getreader)
1151 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1152
1153 def test_getwriter(self):
1154 self.assertRaises(TypeError, codecs.getwriter)
1155 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001156
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001157 def test_lookup_issue1813(self):
1158 # Issue #1813: under Turkish locales, lookup of some codecs failed
1159 # because 'I' is lowercased as a dotless "i"
1160 oldlocale = locale.getlocale(locale.LC_CTYPE)
1161 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1162 try:
1163 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1164 except locale.Error:
1165 # Unsupported locale on this system
1166 self.skipTest('test needs Turkish locale')
1167 c = codecs.lookup('ASCII')
1168 self.assertEqual(c.name, 'ascii')
1169
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001170class StreamReaderTest(unittest.TestCase):
1171
1172 def setUp(self):
1173 self.reader = codecs.getreader('utf-8')
1174 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1175
1176 def test_readlines(self):
1177 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001178 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001179
Georg Brandl8f99f812006-10-29 08:39:22 +00001180class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001181
Georg Brandl8f99f812006-10-29 08:39:22 +00001182 def test_basic(self):
1183 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001184 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001185 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001186
1187 f = StringIO.StringIO()
1188 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1189 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001190 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001191
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001192class Str2StrTest(unittest.TestCase):
1193
1194 def test_read(self):
1195 sin = "\x80".encode("base64_codec")
1196 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1197 sout = reader.read()
1198 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001199 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001200
1201 def test_readline(self):
1202 sin = "\x80".encode("base64_codec")
1203 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1204 sout = reader.readline()
1205 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001206 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001207
# Encodings exercised by BasicUnicodeTest, in the order they are tested.
# Optional codecs (mbcs, bz2_codec, zlib_codec) are appended further down
# when the running interpreter provides them.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    # code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950",
    # East Asian multibyte codecs
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO 8859 family
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213", "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# mbcs is only tested where the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken *before* the optional compression codecs are appended
# below, so those codecs still go through the incremental-coder checks.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are tested only when their backing module imports.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1358
class BasicUnicodeTest(unittest.TestCase):
    # Generic round-trip and argument checks run against every entry of
    # all_unicode_encodings.

    def test_basics(self):
        text = u"abc123"  # all codecs should be able to encode these
        mismatch = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The registered name must match the requested one once the
            # "_codec" suffix is restored; latin_1 is special-cased.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            (encoded, consumed) = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text), mismatch % (consumed, len(text), encoding))
            (decoded, consumed) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, mismatch % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, one item at a time
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                encodedresult = ""
                for char in text:
                    writer.write(char)
                    encodedresult += pipe.read()
                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                decodedresult = u""
                for byte in encodedresult:
                    pipe.write(byte)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, text, mismatch % (decodedresult, text, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for char in text:
                        encodedresult += encoder.encode(char)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for byte in encodedresult:
                        decodedresult += decoder.decode(byte)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, text, mismatch % (decodedresult, text, encoding))

                    # check C API
                    encodedresult = ""
                    for char in text:
                        encodedresult += cencoder.encode(char)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for byte in encodedresult:
                        decodedresult += cdecoder.decode(byte)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, text, mismatch % (decodedresult, text, encoding))

                    # check iterencode()/iterdecode()
                    roundtrip = u"".join(codecs.iterdecode(codecs.iterencode(text, encoding), encoding))
                    self.assertEqual(roundtrip, text, mismatch % (roundtrip, text, encoding))

                    # check iterencode()/iterdecode() with empty string
                    roundtrip = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(roundtrip, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(char) for char in text)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(byte) for byte in encodedresult)
                        self.assertEqual(decodedresult, text, mismatch % (decodedresult, text, encoding))

                        encodedresult = "".join(cencoder.encode(char) for char in text)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(byte) for byte in encodedresult)
                        self.assertEqual(decodedresult, text, mismatch % (decodedresult, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # idna is excluded separately.  FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            source = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(source)
            for attempt in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(text[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-string argument check.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1481
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        # Stateless round trip through every str-only codec.
        text = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)
            (encoded, consumed) = encode(text)
            self.assertEqual(consumed, len(text))
            (decoded, consumed) = decode(encoded)
            self.assertEqual(decoded, text, "%r != %r (encoding=%r)" % (decoded, text, encoding))
1490
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001491class CharmapTest(unittest.TestCase):
1492 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001493 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001494 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1495 (u"abc", 3)
1496 )
1497
Ezio Melotti2623a372010-11-21 13:34:58 +00001498 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001499 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1500 (u"ab\ufffd", 3)
1501 )
1502
Ezio Melotti2623a372010-11-21 13:34:58 +00001503 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001504 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1505 (u"ab\ufffd", 3)
1506 )
1507
Ezio Melotti2623a372010-11-21 13:34:58 +00001508 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001509 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1510 (u"ab", 3)
1511 )
1512
Ezio Melotti2623a372010-11-21 13:34:58 +00001513 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001514 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1515 (u"ab", 3)
1516 )
1517
1518 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001519 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001520 codecs.charmap_decode(allbytes, "ignore", u""),
1521 (u"", len(allbytes))
1522 )
1523
Georg Brandl8f99f812006-10-29 08:39:22 +00001524class WithStmtTest(unittest.TestCase):
1525 def test_encodedfile(self):
1526 f = StringIO.StringIO("\xc3\xbc")
1527 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001528 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001529
1530 def test_streamreaderwriter(self):
1531 f = StringIO.StringIO("\xc3\xbc")
1532 info = codecs.lookup("utf-8")
1533 with codecs.StreamReaderWriter(f, info.streamreader,
1534 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001535 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001536
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001537
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write/seek/read sequences on files opened with codecs.open() in
        # 'w+' mode; reading back must yield exactly the text written.
        data = u"1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00001593
Victor Stinner262be5e2010-05-22 02:11:07 +00001594
def test_main():
    # Run every test case class of this module, in declaration order.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_classes)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test_main()