blob: e6c39b7391a98dc8c89b74ccd0d161ee822d91a4 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    In-memory FIFO: data written at one end is read back from the other.
    """
    def __init__(self):
        # Everything written so far that has not been read yet.
        self._buffer = ""

    def write(self, chars):
        """Append chars to the end of the pending data."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return pending data.

        A negative size drains the whole buffer; otherwise at most
        size items are taken from the front.
        """
        pending = self._buffer
        if size >= 0:
            self._buffer = pending[size:]
            return pending[:size]
        self._buffer = ""
        return pending
26
class ReadTest(unittest.TestCase):
    """Decoding checks shared by the per-codec test cases.

    Every method reads ``self.encoding``, which this class does not
    define; subclasses supply it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Decode ``input`` one encoded byte at a time.

        ``partialresults`` holds, for each encoded byte, the text that
        must have been decoded once that byte has been fed.  The check is
        run with a StreamReader, with an incremental decoder, with the
        same incremental decoder after reset(), and finally the whole
        input is round-tripped through codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        """readline() must split on every supported line end."""
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: splitting a string such
        # as u"\n \r\n \r \u2028" on whitespace yields an empty list because
        # every line end is itself whitespace, and the loops below would
        # then never execute.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline()).
        # Each line is non-empty (200, 400, ... characters) so that
        # readalllines() does not stop early when keepends is false.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the interleaved line so that the next
                    # iteration starts on an "a" line again
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        """Iterating a reader must yield exactly the \\r\\n-terminated lines."""
        # NOTE(review): the indentation inside these literals looks as if it
        # was collapsed to a single space; verify against the upstream data.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that receives its data piecemeal."""
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        """Three \\r\\n-terminated lines are returned one per readline()."""
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        """Five short \\r\\n-terminated lines are returned one per readline()."""
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
247
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM and error handling for "utf-32"."""

    encoding = "utf-32"

    # u"spamspam" behind a single BOM, little-endian and big-endian.
    spamle = '\xff\xfe\x00\x00' + 2 * (
        's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = '\x00\x00\xfe\xff' + 2 * (
        '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit exactly one BOM."""
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = StringIO.StringIO()
        out = info.streamwriter(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        source = info.streamreader(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM is rejected."""
        for nbytes in (4, 8):
            stream = StringIO.StringIO(nbytes*"\xff")
            decoder = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, decoder.read)

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the first four reads consume the BOM, and a character only
        # appears once its fourth byte has arrived.
        runs = [(7, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_handlers(self):
        """'replace' and 'ignore' both consume a truncated final unit."""
        for errors, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode('\x01', errors, True))

    def test_errors(self):
        """A truncated final unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        """Decode many non-BMP characters in both byte orders."""
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        wanted = u'\U00010000' * 1024
        for bom, unit in (('\xff\xfe\x00\x00', '\x00\x00\x01\x00'),
                          ('\x00\x00\xfe\xff', '\x00\x01\x00\x00')):
            self.assertEqual(wanted,
                             codecs.utf_32_decode(bom + unit * 1024)[0])
332
class UTF32LETest(ReadTest):
    """ReadTest checks plus encode/error handling for "utf-32-le"."""

    encoding = "utf-32-le"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # a character only appears once its fourth byte has arrived.
        runs = [(3, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_simple(self):
        """A non-BMP character encodes least significant byte first."""
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated final unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        """Decode many non-BMP characters in one call."""
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(data)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
376
class UTF32BETest(ReadTest):
    """ReadTest checks plus encode/error handling for "utf-32-be"."""

    encoding = "utf-32-be"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # a character only appears once its fourth byte has arrived.
        runs = [(3, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_simple(self):
        """A non-BMP character encodes most significant byte first."""
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated final unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        """Decode many non-BMP characters in one call."""
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(data)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
420
421
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM, error and file handling for "utf-16"."""

    encoding = "utf-16"

    # u"spamspam" behind a single BOM, little-endian and big-endian.
    spamle = '\xff\xfe' + 2 * 's\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff' + 2 * '\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit exactly one BOM."""
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = StringIO.StringIO()
        out = info.streamwriter(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        source = info.streamreader(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM is rejected."""
        for nbytes in (2, 4):
            stream = StringIO.StringIO(nbytes*"\xff")
            decoder = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, decoder.read)

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the first two reads consume the BOM, and the last character
        # takes four bytes instead of two.
        runs = [(3, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_handlers(self):
        """'replace' and 'ignore' both consume a truncated final unit."""
        for errors, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode('\x01', errors, True))

    def test_errors(self):
        """A truncated final unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError,
                          codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        """codecs.open() in 'U' mode must not translate line ends."""
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = u'Hello\r\nworld\r\n'
        payload = text.encode(self.encoding)

        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(test_support.TESTFN, 'U',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000494
class UTF16LETest(ReadTest):
    """ReadTest checks plus malformed-input handling for "utf-16-le"."""

    encoding = "utf-16-le"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the last character takes four bytes instead of two.
        runs = [(1, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_errors(self):
        """Malformed input raises in strict mode and decodes to U+FFFD
        with the 'replace' handler."""
        # (malformed bytes, result of decoding them with 'replace')
        cases = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for data, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              data, 'strict', True)
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000531
class UTF16BETest(ReadTest):
    """ReadTest checks plus malformed-input handling for "utf-16-be"."""

    encoding = "utf-16-be"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u0100", u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the last character takes four bytes instead of two.
        runs = [(1, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_errors(self):
        """Malformed input raises in strict mode and decodes to U+FFFD
        with the 'replace' handler."""
        # (malformed bytes, result of decoding them with 'replace')
        cases = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for data, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              data, 'strict', True)
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000568
class UTF8Test(ReadTest):
    """ReadTest checks for "utf-8"."""

    encoding = "utf-8"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\x00", u"\xff", u"\u07ff", u"\u0800", u"\uffff",
                 u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the characters take 1, 2, 2, 3, 3 and 4 bytes respectively and
        # each one only appears once its last byte has arrived.
        runs = [(2, 1), (2, 2), (3, 3), (3, 4), (4, 5), (1, 6)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)
593
class UTF7Test(ReadTest):
    """ReadTest checks for "utf-7"."""

    encoding = "utf-7"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        text = u"a+-b"
        # Length of the decoded prefix expected after each encoded byte.
        prefix_lengths = (1, 1, 2, 3, 4)
        self.check_partial(text, [text[:n] for n in prefix_lengths])
Walter Dörwalde22d3392005-11-17 08:52:34 +0000608
609class UTF16ExTest(unittest.TestCase):
610
611 def test_errors(self):
612 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
613
614 def test_bad_args(self):
615 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
616
617class ReadBufferTest(unittest.TestCase):
618
619 def test_array(self):
620 import array
621 self.assertEqual(
622 codecs.readbuffer_encode(array.array("c", "spam")),
623 ("spam", 4)
624 )
625
626 def test_empty(self):
627 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
628
629 def test_bad_args(self):
630 self.assertRaises(TypeError, codecs.readbuffer_encode)
631 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
632
633class CharBufferTest(unittest.TestCase):
634
635 def test_string(self):
636 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
637
638 def test_empty(self):
639 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
640
641 def test_bad_args(self):
642 self.assertRaises(TypeError, codecs.charbuffer_encode)
643 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
644
class UTF8SigTest(ReadTest):
    """ReadTest checks plus signature (BOM) handling for "utf-8-sig"."""

    encoding = "utf-8-sig"

    def test_partial(self):
        """Feed the encoded text byte by byte (see check_partial)."""
        chars = [u"\ufeff", u"\x00", u"\xff", u"\u07ff", u"\u0800",
                 u"\uffff", u"\U00010000"]
        # (number of consecutive reads, characters decoded by then);
        # the first three bytes are the signature written by the encoder,
        # which is skipped, the next three are the u"\ufeff" of the text
        # itself, which is emitted.
        runs = [(5, 0), (1, 1), (2, 2), (2, 3), (3, 4), (3, 5), (4, 6),
                (1, 7)]
        expected = []
        for count, nchars in runs:
            expected.extend(count * [u"".join(chars[:nchars])])
        self.check_partial(u"".join(chars), expected)

    def test_bug1601501(self):
        """Decoding input that consists only of the signature must work."""
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The incremental decoder strips the signature from the output."""
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_chunked_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means "read everything at once") and check that
        # the collected chunks always add up to unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Chunked reads of input that starts with the signature."""
        self._check_chunked_read(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        """Chunked reads of input that has no signature."""
        self._check_chunked_read(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ")
728
Walter Dörwald8709a422002-09-03 13:53:40 +0000729class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000730 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000731 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000732
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000733class RecodingTest(unittest.TestCase):
734 def test_recoding(self):
735 f = StringIO.StringIO()
736 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
737 f2.write(u"a")
738 f2.close()
739 # Python used to crash on this at exit because of a refcount
740 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000741
# Sample strings from RFC 3492, as (unicode text, punycode form) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--"),
]
845
846for i in punycode_testcases:
847 if len(i)!=2:
848 print repr(i)
849
class PunycodeTest(unittest.TestCase):
    """Run the (unicode, punycode) pairs of punycode_testcases both ways."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Compare case-insensitively.  Some of the expected extended
            # encodings use upper case while our encoder produces only
            # lower case; lowering just the expected value is not enough
            # either, because some of the input characters are upper case.
            encoded = text.encode("punycode")
            self.assertEqual(encoded.lower(), expected.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(text, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000863
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000864class UnicodeInternalTest(unittest.TestCase):
865 def test_bug1251300(self):
866 # Decoding with unicode_internal used to not correctly handle "code
867 # points" above 0x10ffff on UCS-4 builds.
868 if sys.maxunicode > 0xffff:
869 ok = [
870 ("\x00\x10\xff\xff", u"\U0010ffff"),
871 ("\x00\x00\x01\x01", u"\U00000101"),
872 ("", u""),
873 ]
874 not_ok = [
875 "\x7f\xff\xff\xff",
876 "\x80\x00\x00\x00",
877 "\x81\x00\x00\x00",
878 "\x00",
879 "\x00\x00\x00\x00\x00",
880 ]
881 for internal, uni in ok:
882 if sys.byteorder == "little":
883 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000884 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000885 for internal in not_ok:
886 if sys.byteorder == "little":
887 internal = "".join(reversed(internal))
888 self.assertRaises(UnicodeDecodeError, internal.decode,
889 "unicode_internal")
890
891 def test_decode_error_attributes(self):
892 if sys.maxunicode > 0xffff:
893 try:
894 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
895 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000896 self.assertEqual("unicode_internal", ex.encoding)
897 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
898 self.assertEqual(4, ex.start)
899 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000900 else:
901 self.fail()
902
903 def test_decode_callback(self):
904 if sys.maxunicode > 0xffff:
905 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
906 decoder = codecs.getdecoder("unicode_internal")
907 ab = u"ab".encode("unicode_internal")
908 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
909 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000910 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000911
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000912 def test_encode_length(self):
913 # Issue 3739
914 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000915 self.assertEqual(encoder(u"a")[1], 1)
916 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000917
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000918 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000919 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000920
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 byte strings, listed in
# the order of the numbered cases 3.1 .. 3.45.  An expected value of None
# means the input contains prohibited characters; a (None, None) entry is a
# placeholder for a skipped case, so that list positions stay aligned with
# the case numbers.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE', 'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f', 'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0', 'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba', '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa', '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7', '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0', '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90', '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0', '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96', '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96', '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ', ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0', ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80', ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b', ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80', ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f', '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf', ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81', '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar', 'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8', '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1073
1074
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            # Both kinds of check run inside the same try block so that any
            # failure, including a prohibited input that was wrongly
            # accepted, is reported with the number of the failing vector.
            try:
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = unicode(prepped, "utf-8")
                    self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1093
Walter Dörwald78a0be62006-04-14 18:25:39 +00001094class IDNACodecTest(unittest.TestCase):
1095 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001096 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1097 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1098 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1099 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001100
1101 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001102 self.assertEqual(u"python.org".encode("idna"), "python.org")
1103 self.assertEqual("python.org.".encode("idna"), "python.org.")
1104 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1105 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001106
Martin v. Löwis8b595142005-08-25 11:03:38 +00001107 def test_stream(self):
1108 import StringIO
1109 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1110 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001111 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001112
Walter Dörwald78a0be62006-04-14 18:25:39 +00001113 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001114 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001115 "".join(codecs.iterdecode("python.org", "idna")),
1116 u"python.org"
1117 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001118 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001119 "".join(codecs.iterdecode("python.org.", "idna")),
1120 u"python.org."
1121 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001122 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001123 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1124 u"pyth\xf6n.org."
1125 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001126 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001127 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1128 u"pyth\xf6n.org."
1129 )
1130
1131 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001132 self.assertEqual(decoder.decode("xn--xam", ), u"")
1133 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1134 self.assertEqual(decoder.decode(u"rg"), u"")
1135 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001136
1137 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001138 self.assertEqual(decoder.decode("xn--xam", ), u"")
1139 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1140 self.assertEqual(decoder.decode("rg."), u"org.")
1141 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001142
1143 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001144 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001145 "".join(codecs.iterencode(u"python.org", "idna")),
1146 "python.org"
1147 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001148 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001149 "".join(codecs.iterencode(u"python.org.", "idna")),
1150 "python.org."
1151 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001152 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001153 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1154 "xn--pythn-mua.org."
1155 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001156 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001157 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1158 "xn--pythn-mua.org."
1159 )
1160
1161 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001162 self.assertEqual(encoder.encode(u"\xe4x"), "")
1163 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1164 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001165
1166 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001167 self.assertEqual(encoder.encode(u"\xe4x"), "")
1168 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1169 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001170
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001171class CodecsModuleTest(unittest.TestCase):
1172
1173 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001174 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001175 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001176 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001177 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001178 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1179
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001180 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001181 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001182 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001183 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001184 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001185 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001186 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1187
1188 def test_register(self):
1189 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001190 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001191
1192 def test_lookup(self):
1193 self.assertRaises(TypeError, codecs.lookup)
1194 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001195 self.assertRaises(LookupError, codecs.lookup, " ")
1196
1197 def test_getencoder(self):
1198 self.assertRaises(TypeError, codecs.getencoder)
1199 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1200
1201 def test_getdecoder(self):
1202 self.assertRaises(TypeError, codecs.getdecoder)
1203 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1204
1205 def test_getreader(self):
1206 self.assertRaises(TypeError, codecs.getreader)
1207 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1208
1209 def test_getwriter(self):
1210 self.assertRaises(TypeError, codecs.getwriter)
1211 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001212
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001213 def test_lookup_issue1813(self):
1214 # Issue #1813: under Turkish locales, lookup of some codecs failed
1215 # because 'I' is lowercased as a dotless "i"
1216 oldlocale = locale.getlocale(locale.LC_CTYPE)
1217 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1218 try:
1219 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1220 except locale.Error:
1221 # Unsupported locale on this system
1222 self.skipTest('test needs Turkish locale')
1223 c = codecs.lookup('ASCII')
1224 self.assertEqual(c.name, 'ascii')
1225
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001226class StreamReaderTest(unittest.TestCase):
1227
1228 def setUp(self):
1229 self.reader = codecs.getreader('utf-8')
1230 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1231
1232 def test_readlines(self):
1233 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001234 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001235
Georg Brandl8f99f812006-10-29 08:39:22 +00001236class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001237
Georg Brandl8f99f812006-10-29 08:39:22 +00001238 def test_basic(self):
1239 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001240 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001241 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001242
1243 f = StringIO.StringIO()
1244 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1245 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001246 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001247
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001248class Str2StrTest(unittest.TestCase):
1249
1250 def test_read(self):
1251 sin = "\x80".encode("base64_codec")
1252 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1253 sout = reader.read()
1254 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001255 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001256
1257 def test_readline(self):
1258 sin = "\x80".encode("base64_codec")
1259 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1260 sout = reader.readline()
1261 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001262 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001263
# Codec names exercised by the generic tests below, kept in sorted order.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# mbcs is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1371
# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]
1378
1379# The following encoding is not tested, because it's not supposed
1380# to work:
1381# "undefined"
1382
# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Independent copy: later changes to one list do not affect the other.
broken_incremental_coders = list(broken_unicode_with_streams)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001391
# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]
1398
# bz2_codec is only added to the tested encodings when the bz2 module can
# be imported; the import itself serves purely as an availability probe.
# The codec is also marked as broken with streams.  These appends run after
# broken_incremental_coders was copied from broken_unicode_with_streams, so
# that copy does not receive the new entries.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

# Same treatment for zlib_codec, keyed on the zlib module.
try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1414
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks that are run over every name in all_unicode_encodings."""

    def _check(self, actual, expected, encoding):
        # assertEqual with the failure message format shared by these tests.
        self.assertEqual(actual, expected,
                         "%r != %r (encoding=%r)" % (actual, expected, encoding))

    def test_basics(self):
        text = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name has to match the table entry, modulo
            # "-" versus "_" and the two naming special cases handled here.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless round trip
            (encoded, consumed) = codecs.getencoder(encoding)(text)
            self._check(consumed, len(text), encoding)
            (decoded, consumed) = codecs.getdecoder(encoding)(encoded)
            self._check(decoded, text, encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, feeding one item at a time
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                streamed = ""
                for char in text:
                    writer.write(char)
                    streamed += pipe.read()
                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                restored = u""
                for byte in streamed:
                    pipe.write(byte)
                    restored += reader.read()
                self._check(restored, text, encoding)

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encoded = "".join(encoder.encode(char) for char in text)
                encoded += encoder.encode(u"", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = u"".join(decoder.decode(byte) for byte in encoded)
                decoded += decoder.decode("", True)
                self._check(decoded, text, encoding)

                # check C API
                encoded = "".join(cencoder.encode(char) for char in text)
                encoded += cencoder.encode(u"", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decoded = u"".join(cdecoder.decode(byte) for byte in encoded)
                decoded += cdecoder.decode("", True)
                self._check(decoded, text, encoding)

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                result = u"".join(codecs.iterdecode(chunks, encoding))
                self._check(result, text, encoding)

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode(u"", encoding)
                result = u"".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encoded = "".join(encoder.encode(char) for char in text)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decoded = u"".join(decoder.decode(byte) for byte in encoded)
                    self._check(decoded, text, encoding)

                    encoded = "".join(cencoder.encode(char) for char in text)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decoded = u"".join(cdecoder.decode(byte) for byte in encoded)
                    self._check(decoded, text, encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # idna is left out too: FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(text[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-string argument check
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison is deliberately trivial: what matters is that the
        # type object of the encoding table can be compared at all.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1537
class BasicStrTest(unittest.TestCase):
    """Stateless round-trip check for the codecs in all_string_encodings."""

    def test_basics(self):
        text = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            (encoded, consumed) = encode(text)
            # the encoder must report that all of the input was consumed
            self.assertEqual(consumed, len(text))
            decode = codecs.getdecoder(encoding)
            (decoded, consumed) = decode(encoded)
            self.assertEqual(decoded, text,
                             "%r != %r (encoding=%r)" % (decoded, text, encoding))
1546
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001547class CharmapTest(unittest.TestCase):
1548 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001549 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001550 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1551 (u"abc", 3)
1552 )
1553
Serhiy Storchaka95997452013-01-15 14:42:59 +02001554 self.assertRaises(UnicodeDecodeError,
1555 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1556 )
1557
1558 self.assertRaises(UnicodeDecodeError,
1559 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1560 )
1561
Ezio Melotti2623a372010-11-21 13:34:58 +00001562 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001563 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1564 (u"ab\ufffd", 3)
1565 )
1566
Ezio Melotti2623a372010-11-21 13:34:58 +00001567 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001568 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1569 (u"ab\ufffd", 3)
1570 )
1571
Ezio Melotti2623a372010-11-21 13:34:58 +00001572 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001573 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1574 (u"ab", 3)
1575 )
1576
Ezio Melotti2623a372010-11-21 13:34:58 +00001577 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001578 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1579 (u"ab", 3)
1580 )
1581
1582 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001583 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001584 codecs.charmap_decode(allbytes, "ignore", u""),
1585 (u"", len(allbytes))
1586 )
1587
def test_decode_with_int2str_map(self):
    """Decode three bytes through dict charmaps with unicode values.

    Covers complete mappings (including multi-character, non-BMP and
    empty replacement strings) and mappings in which the last byte is
    undefined, for the "strict", "replace" and "ignore" error handlers.
    """
    data = "\x00\x01\x02"

    # Fully defined mappings: each byte is replaced by its mapped string.
    complete_cases = (
        ({0: u'a', 1: u'b', 2: u'c'}, u"abc"),
        ({0: u'Aa', 1: u'Bb', 2: u'Cc'}, u"AaBbCc"),
        ({0: u'\U0010FFFF', 1: u'b', 2: u'c'}, u"\U0010FFFFbc"),
        ({0: u'a', 1: u'b', 2: u''}, u"ab"),
    )
    for charmap, decoded in complete_cases:
        self.assertEqual(
            codecs.charmap_decode(data, "strict", charmap),
            (decoded, 3)
        )

    # Byte 0x02 is undefined in each of these maps: the key is missing,
    # it maps to None, or it maps to u'\ufffe' (Issue #14850).
    undefined_maps = (
        {0: u'a', 1: u'b'},
        {0: u'a', 1: u'b', 2: None},
        {0: u'a', 1: u'b', 2: u'\ufffe'},
    )

    for charmap in undefined_maps:
        self.assertRaises(
            UnicodeDecodeError,
            codecs.charmap_decode, data, "strict", charmap
        )

    # "replace" substitutes U+FFFD for the undefined byte, "ignore"
    # drops it; both still report all three bytes as consumed.
    for errors, decoded in (("replace", u"ab\ufffd"), ("ignore", u"ab")):
        for charmap in undefined_maps:
            self.assertEqual(
                codecs.charmap_decode(data, errors, charmap),
                (decoded, 3)
            )

    # With an empty map every possible byte value is undefined.
    every_byte = "".join(map(chr, xrange(256)))
    self.assertEqual(
        codecs.charmap_decode(every_byte, "ignore", {}),
        (u"", len(every_byte))
    )
1672
def test_decode_with_int2int_map(self):
    """Decode three bytes through dict charmaps with integer values.

    Covers valid code points up to 0x10FFFF (Issue #15379), the
    TypeError expected for 0x110000, and the treatment of the last byte
    when it has no entry or maps to 0xFFFE, for the "strict", "replace"
    and "ignore" error handlers.
    """
    data = "\x00\x01\x02"
    decode = codecs.charmap_decode
    a, b, c = map(ord, u'abc')

    self.assertEqual(
        decode(data, "strict", {0: a, 1: b, 2: c}),
        (u"abc", 3)
    )

    # Issue #15379
    self.assertEqual(
        decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
        (u"\U0010FFFFbc", 3)
    )

    # One past the largest code point is rejected with a TypeError.
    self.assertRaises(
        TypeError,
        decode, data, "strict", {0: 0x110000, 1: b, 2: c}
    )

    # Byte 0x02 is undefined here: either the key is missing or it maps
    # to 0xFFFE.
    undefined_maps = (
        {0: a, 1: b},
        {0: a, 1: b, 2: 0xFFFE},
    )

    for charmap in undefined_maps:
        self.assertRaises(
            UnicodeDecodeError,
            decode, data, "strict", charmap
        )

    # "replace" substitutes U+FFFD for the undefined byte, "ignore"
    # drops it; both still report all three bytes as consumed.
    for errors, decoded in (("replace", u"ab\ufffd"), ("ignore", u"ab")):
        for charmap in undefined_maps:
            self.assertEqual(
                decode(data, errors, charmap),
                (decoded, 3)
            )
1729
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001730
class WithStmtTest(unittest.TestCase):
    """Check that codecs stream wrappers can be used in a with statement."""

    def test_encodedfile(self):
        """Read through codecs.EncodedFile used as a context manager."""
        # The underlying stream holds the UTF-8 encoding of u"\xfc".
        raw = StringIO.StringIO("\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            # Reading yields the same character re-encoded as Latin-1.
            self.assertEqual(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        """Read through codecs.StreamReaderWriter used as a context manager."""
        raw = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001743
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001744
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open()."""

    def test_seek0(self):
        """Write, seek and re-read a file for each UTF-16/UTF-32 variant.

        Every scenario reads the file back from position 0 and expects
        exactly the text that was written.
        """
        data = u"1234567890"
        twice = data * 2
        self.addCleanup(test_support.unlink, test_support.TESTFN)

        for encoding in ("utf-16", "utf-16-le", "utf-16-be",
                         "utf-32", "utf-32-le", "utf-32-be"):

            def reopen():
                # Truncate and reopen the scratch file in the current
                # encoding.
                return codecs.open(test_support.TESTFN, 'w+',
                                   encoding=encoding)

            # Check if the BOM is written only once
            with reopen() as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), twice)
                f.seek(0)
                self.assertEqual(f.read(), twice)

            # Check that the BOM is written after a seek(0)
            with reopen() as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a
            # position different than the start
            with reopen() as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), twice)

            # (StreamWriter) Check that the BOM is not written after a
            # seek() at a position different than the start
            with reopen() as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), twice)
Victor Stinner7df55da2010-05-22 13:37:56 +00001800
Victor Stinner262be5e2010-05-22 02:11:07 +00001801
def test_main():
    """Run the listed test case classes through test_support.run_unittest."""
    test_classes = (
        # Unicode transformation formats
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test,
        UTF16ExTest,
        ReadBufferTest, CharBufferTest,
        EscapeDecodeTest, RecodingTest,
        PunycodeTest, UnicodeInternalTest,
        NameprepTest, IDNACodecTest,
        CodecsModuleTest, StreamReaderTest, EncodedFileTest,
        Str2StrTest, BasicUnicodeTest, BasicStrTest,
        CharmapTest, WithStmtTest, BomTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001832
1833
# Run the suite only when this file is executed as a script.
if "__main__" == __name__:
    test_main()