blob: d0f96af326aac1544a3f67da4581df7fd0c99ef4 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Return a checker bound to the test case ``self`` and the codec function ``coder``.

    The returned ``check(input, expect)`` asserts, via ``self.assertEqual``,
    that ``coder(input)`` equals the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
11
class Queue(object):
    """
    FIFO buffer: write() appends data at the tail, read() consumes it
    from the head
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append the new data behind whatever is still pending.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the complete buffer.
        if size < 0:
            chunk, self._buffer = self._buffer, ""
            return chunk
        # Otherwise hand out the first ``size`` items and keep the rest.
        chunk, self._buffer = self._buffer[:size], self._buffer[size:]
        return chunk
31
class ReadTest(unittest.TestCase):
    """Reader tests shared by the per-codec test classes.

    Every method reads the codec name from ``self.encoding``, which this
    class does not define itself; subclasses are expected to set it.
    """

    def check_partial(self, input, partialresults):
        # Helper (not a test): encode ``input`` with self.encoding, feed the
        # result one byte at a time and compare the text decoded so far with
        # the matching entry of ``partialresults``.

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() with every supported line end,
        # with and without keepends, and with an explicit size argument.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: each of them is
        # whitespace, so u"\n \r\n \r \u2028".split() yields an empty list
        # and loops driven by it would silently never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty (readalllines() stops
            # at the first empty result); u"\u3042" is a single non-ASCII
            # character, whereas u"\3042" would be u"\xc4" followed by u"2".
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected value must as well.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well so the next
                    # iteration starts at the next "a" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mix readline(), read() and readlines() calls on one reader.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterate a reader over a multi-line "\r\n" document and compare
        # every line with the source list.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        count = 0
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])
            count = i + 1
        # Without this check a reader that stops early (or yields nothing)
        # would let the test pass.
        self.assertEqual(count, len(s))

    def test_readlinequeue(self):
        # Interleave writes and readline() calls on a shared Queue.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n" lines must come back one by one, then an empty string.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Five "\r\n" lines must come back one by one, then an empty string.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
286
class UTF32Test(ReadTest):
    """ReadTest run for "utf-32", plus BOM, error-handler and issue #8941 checks."""
    encoding = "utf-32"

    # u"spamspam" behind a single BOM, little-endian and big-endian
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        reader, writer = codecs.lookup(self.encoding)[2:]
        # encode some stream
        raw = StringIO.StringIO()
        out = writer(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        decoded = reader(StringIO.StringIO(encoded)).read()
        self.assertEqual(decoded, u"spamspam")

    def test_badbom(self):
        # Streams made of 4 and of 8 "\xff" bytes must both be rejected.
        for count in (4, 8):
            stream = StringIO.StringIO(count*"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""] * 4                         # the four BOM bytes
            + [u""] * 3
            + [u"\x00"] * 4
            + [u"\x00\xff"] * 4
            + [u"\x00\xff\u0100"] * 4
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode('\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        wanted = u'\U00010000' * 1024
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        for encoded in (encoded_le, encoded_be):
            self.assertEqual(wanted, codecs.utf_32_decode(encoded)[0])
371
class UTF32LETest(ReadTest):
    """ReadTest run for "utf-32-le", plus encode, error and issue #8941 checks."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""] * 3
            + [u"\x00"] * 4
            + [u"\x00\xff"] * 4
            + [u"\x00\xff\u0100"] * 4
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
415
class UTF32BETest(ReadTest):
    """ReadTest run for "utf-32-be", plus encode, error and issue #8941 checks."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""] * 3
            + [u"\x00"] * 4
            + [u"\x00\xff"] * 4
            + [u"\x00\xff\u0100"] * 4
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
459
460
class UTF16Test(ReadTest):
    """ReadTest run for "utf-16", plus BOM, error-handler and file round-trip checks."""
    encoding = "utf-16"

    # u"spamspam" behind a single BOM, little-endian and big-endian
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        reader, writer = codecs.lookup(self.encoding)[2:]
        # encode some stream
        raw = StringIO.StringIO()
        out = writer(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        decoded = reader(StringIO.StringIO(encoded)).read()
        self.assertEqual(decoded, u"spamspam")

    def test_badbom(self):
        # Neither "\xff\xff" nor "\xff\xff\xff\xff" starts with a valid BOM.
        for payload in ("\xff\xff", "\xff\xff\xff\xff"):
            stream = StringIO.StringIO(payload)
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""] * 2                         # the two BOM bytes
            + [u""]
            + [u"\x00"] * 2
            + [u"\x00\xff"] * 2
            + [u"\x00\xff\u0100"] * 2
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((replacement, 1),
                             codecs.utf_16_decode('\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode("\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = u'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000533
class UTF16LETest(ReadTest):
    """ReadTest run for "utf-16-le", plus malformed-input checks."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""]
            + [u"\x00"] * 2
            + [u"\x00\xff"] * 2
            + [u"\x00\xff\u0100"] * 2
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # Each entry: (malformed input, result of decoding with 'replace').
        cases = (
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        )
        for data, replaced in cases:
            # strict decoding must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              data, 'strict', True)
            # ... while 'replace' substitutes the malformed part
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000570
class UTF16BETest(ReadTest):
    """ReadTest run for "utf-16-be", plus malformed-input checks."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff\U00010000"
        expected = (
            [u""]
            + [u"\x00"] * 2
            + [u"\x00\xff"] * 2
            + [u"\x00\xff\u0100"] * 2
            + [u"\x00\xff\u0100\uffff"] * 4
            + [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # Each entry: (malformed input, result of decoding with 'replace').
        cases = (
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        )
        for data, replaced in cases:
            # strict decoding must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              data, 'strict', True)
            # ... while 'replace' substitutes the malformed part
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000607
class UTF8Test(ReadTest):
    """ReadTest run for "utf-8", plus a byte-by-byte partial decoding check."""
    encoding = "utf-8"

    def test_partial(self):
        text = u"\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = (
            [u"\x00"] * 2
            + [u"\x00\xff"] * 2
            + [u"\x00\xff\u07ff"] * 3
            + [u"\x00\xff\u07ff\u0800"] * 3
            + [u"\x00\xff\u07ff\u0800\uffff"] * 4
            + [u"\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)
632
class UTF7Test(ReadTest):
    """ReadTest run for "utf-7", plus malformed-input and non-BMP checks."""
    encoding = "utf-7"

    def test_partial(self):
        text = u"a+-b"
        expected = [u"a", u"a", u"a+", u"a+-", u"a+-b"]
        self.check_partial(text, expected)

    def test_errors(self):
        # Each entry: (malformed input, result of decoding with 'replace').
        cases = (
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        )
        for data, replaced in cases:
            # strict decoding must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              data, 'strict', True)
            # ... while 'replace' substitutes the malformed part
            self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair encode alike.
        for text in (u'\U000104A0', u'\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
676
Walter Dörwalde22d3392005-11-17 08:52:34 +0000677class UTF16ExTest(unittest.TestCase):
678
679 def test_errors(self):
680 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
681
682 def test_bad_args(self):
683 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
684
685class ReadBufferTest(unittest.TestCase):
686
687 def test_array(self):
688 import array
689 self.assertEqual(
690 codecs.readbuffer_encode(array.array("c", "spam")),
691 ("spam", 4)
692 )
693
694 def test_empty(self):
695 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
696
697 def test_bad_args(self):
698 self.assertRaises(TypeError, codecs.readbuffer_encode)
699 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
700
701class CharBufferTest(unittest.TestCase):
702
703 def test_string(self):
704 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
705
706 def test_empty(self):
707 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
708
709 def test_bad_args(self):
710 self.assertRaises(TypeError, codecs.charbuffer_encode)
711 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
712
class UTF8SigTest(ReadTest):
    """ReadTest run for "utf-8-sig", plus BOM handling checks."""
    encoding = "utf-8-sig"

    def test_partial(self):
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = (
            [u""] * 3                         # first BOM: read and skipped
            + [u""] * 2
            + [u"\ufeff"]                     # second BOM: read and emitted
            + [u"\ufeff\x00"] * 2
            + [u"\ufeff\x00\xff"] * 2
            + [u"\ufeff\x00\xff\u07ff"] * 3
            + [u"\ufeff\x00\xff\u07ff\u0800"] * 3
            + [u"\ufeff\x00\xff\u07ff\u0800\uffff"] * 4
            + [u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        encoded = text.encode("utf-8-sig")
        self.assertEqual(decoder.decode(encoded), text)

    def _check_stream(self, bytestring, unistring):
        # Read ``bytestring`` through a utf-8-sig StreamReader using several
        # read sizes (None means read() without argument) and compare the
        # concatenated output with ``unistring``.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM.
        self._check_stream(codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
                           u"ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # The same input without a leading BOM.
        self._check_stream("ABC\xC2\xA1\xE2\x88\x80XYZ",
                           u"ABC\u00A1\u2200XYZ")
796
class EscapeDecodeTest(unittest.TestCase):
    """Exercise codecs.escape_decode on plain bytes, escapes and bad input."""

    def test_empty(self):
        # Decoding nothing yields nothing and consumes nothing.
        decoded = codecs.escape_decode("")
        self.assertEqual(decoded, ("", 0))

    def test_raw(self):
        # Every byte except the backslash must pass through untouched.
        decode = codecs.escape_decode
        for ch in map(chr, range(256)):
            if ch == '\\':
                continue
            raw = ch + '0'
            self.assertEqual(decode(raw), (raw, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded output), checked in this order.
        cases = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash followed by a character that does not start an
        # escape sequence is expected to be left exactly as written.
        escape_starters = '\n"\'\\abtnvfr01234567x'
        for ch in map(chr, range(256)):
            if ch in escape_starters:
                continue
            check('\\' + ch, '\\' + ch)

    def test_errors(self):
        decode = codecs.escape_decode
        # Each row: a truncated \x escape on its own, the same inside
        # brackets, the two combined, and the length of the combined input.
        truncated = (
            (br"\x", br"[\x]", br"[\x]\x", 6),
            (br"\x0", br"[\x0]", br"[\x0]\x0", 8),
        )
        for bare, bracketed, combined, consumed in truncated:
            self.assertRaises(ValueError, decode, bare)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(combined, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(combined, "replace"), (b"[?]?", consumed))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200848
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so merely getting through the calls
        # below is the whole test.
        sink = StringIO.StringIO()
        recoder = codecs.EncodedFile(sink, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000857
# From RFC 3492
# Each entry is a (unicode text, expected punycode) pair; the letter in
# the comment above an entry is the label of that sample.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple, which
# is what a misplaced comma between the string fragments would produce.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
965
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            encoded = text.encode("punycode")
            # Compare case-insensitively on both sides: some of the
            # reference encodings use upper case while our encoder only
            # emits lower case, and lowering just the reference is not
            # enough because some input characters are upper case too.
            self.assertEqual(encoded.lower(), expected.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(text, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000979
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the unicode_internal codec.

    The decoding tests only make sense when code points above 0xFFFF fit
    in a single code unit, so they are reported as skipped (rather than
    silently passing) when sys.maxunicode is not larger than 0xffff.
    """

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            ("\x00\x10\xff\xff", u"\U0010ffff"),
            ("\x00\x00\x01\x01", u"\U00000101"),
            ("", u""),
        ]
        not_ok = [
            "\x7f\xff\xff\xff",
            "\x80\x00\x00\x00",
            "\x81\x00\x00\x00",
            "\x00",
            "\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are written big-endian; flip them on
        # little-endian machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_decode_error_attributes(self):
        data = "\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises fails the test by itself if nothing is raised.
        with self.assertRaises(UnicodeDecodeError) as caught:
            data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = u"ab".encode("unicode_internal")
        # Put four undecodable bytes between the encodings of u"a" and
        # u"b"; the registered handler is expected to drop them.
        ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                          "UnicodeInternalTest")
        self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001036
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both given as UTF-8 byte strings.
# An expected output of None marks an input that nameprep must reject.
# (None, None) is a placeholder for a skipped vector; it keeps the list
# position of every later entry equal to its section number minus one.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1189
1190
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep against the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered 3.1, 3.2, ... in list order; put that
            # number in every failure message so the vector can be found.
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                try:
                    result = nameprep(orig)
                except UnicodeError:
                    continue
                self.fail("%s: UnicodeError not raised, got %r"
                          % (label, result))
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    result = nameprep(orig)
                except Exception as e:
                    # Report an unexpected exception as an ordinary test
                    # failure that names the vector.
                    self.fail("%s: %r" % (label, e))
                self.assertEqual(result, prepped,
                                 "%s: %r != %r" % (label, result, prepped))
1209
class IDNACodecTest(unittest.TestCase):
    """Tests for the idna codec: stateless, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty unicode string.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # Same four inputs as test_builtin_decode: ASCII and non-ASCII
        # names, each without and with a trailing dot.
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been
        # seen; the last label comes out on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Same four inputs as test_builtin_encode.
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # As with decoding, the last label is held back until either a
        # dot or the final call arrives.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001286
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the codecs module API."""

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # NOTE: u'\xffff' is U+00FF followed by the two characters "ff";
        # U+00FF on its own is already outside the ASCII range.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        # Query the current setting with setlocale() rather than
        # getlocale(): the string it returns can be handed straight back
        # to setlocale() by the cleanup, whereas the parsed tuple from
        # getlocale() is not guaranteed to round-trip.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1341
class StreamReaderTest(unittest.TestCase):
    """Check StreamReader.readlines() on a two-line UTF-8 byte stream."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded = self.reader(self.stream)
        lines = decoded.readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001351
class EncodedFileTest(unittest.TestCase):
    """Check the recoding done by codecs.EncodedFile on read and on write."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller is handed
        # the same text as UTF-16-LE.
        utf8_file = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(utf8_file, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file ends up
        # holding latin1.
        latin1_file = StringIO.StringIO()
        recoded = codecs.EncodedFile(latin1_file, 'utf-8', 'latin1')
        recoded.write('\xc3\xbc')
        self.assertEqual(latin1_file.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001363
class Str2StrTest(unittest.TestCase):
    """Stream readers of str-to-str codecs must hand back str objects."""

    def _base64_reader(self):
        # Build a base64_codec StreamReader over the encoding of "\x80".
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        data = self._base64_reader().read()
        self.assertEqual(data, "\x80")
        self.assertIsInstance(data, str)

    def test_readline(self):
        data = self._base64_reader().readline()
        self.assertEqual(data, "\x80")
        self.assertIsInstance(data, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001379
# Encodings exercised by BasicUnicodeTest; each one must be able to
# round-trip the plain ASCII sample strings used there.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is added only when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# This is a snapshot copy taken here: the bz2_codec / zlib_codec entries
# appended to broken_unicode_with_streams further down are NOT part of it.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2_codec and zlib_codec are registered for testing only when the
# corresponding compression module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1530
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks applied to every entry of all_unicode_encodings."""

    def test_basics(self):
        # all codecs should be able to encode these
        text = u"abc123"
        for encoding in all_unicode_encodings:
            # The name reported by the codec registry has to agree with
            # the list entry once "_" and "-" are treated as the same.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(
                consumed, len(text),
                "%r != %r (encoding=%r)" % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, one character / byte at a time
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                chunks = []
                for char in text:
                    writer.write(char)
                    chunks.append(pipe.read())
                streamed = "".join(chunks)
                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                chunks = []
                for byte in streamed:
                    pipe.write(byte)
                    chunks.append(reader.read())
                restored = u"".join(chunks)
                self.assertEqual(
                    restored, text,
                    "%r != %r (encoding=%r)" % (restored, text, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    py_encoder = codecs.getincrementalencoder(encoding)()
                    c_encoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    chunks = [py_encoder.encode(char) for char in text]
                    chunks.append(py_encoder.encode(u"", True))
                    stepped = "".join(chunks)
                    py_decoder = codecs.getincrementaldecoder(encoding)()
                    chunks = [py_decoder.decode(byte) for byte in stepped]
                    chunks.append(py_decoder.decode("", True))
                    restored = u"".join(chunks)
                    self.assertEqual(
                        restored, text,
                        "%r != %r (encoding=%r)" % (restored, text, encoding))

                    # check C API
                    chunks = [c_encoder.encode(char) for char in text]
                    chunks.append(c_encoder.encode(u"", True))
                    stepped = "".join(chunks)
                    c_decoder = _testcapi.codec_incrementaldecoder(encoding)
                    chunks = [c_decoder.decode(byte) for byte in stepped]
                    chunks.append(c_decoder.decode("", True))
                    restored = u"".join(chunks)
                    self.assertEqual(
                        restored, text,
                        "%r != %r (encoding=%r)" % (restored, text, encoding))

                    # check iterencode()/iterdecode()
                    encoded_iter = codecs.iterencode(text, encoding)
                    restored = u"".join(codecs.iterdecode(encoded_iter, encoding))
                    self.assertEqual(
                        restored, text,
                        "%r != %r (encoding=%r)" % (restored, text, encoding))

                    # check iterencode()/iterdecode() with empty string
                    encoded_iter = codecs.iterencode(u"", encoding)
                    restored = u"".join(codecs.iterdecode(encoded_iter, encoding))
                    self.assertEqual(restored, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        py_encoder = codecs.getincrementalencoder(encoding)("ignore")
                        c_encoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        stepped = "".join(py_encoder.encode(char) for char in text)
                        py_decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        restored = u"".join(py_decoder.decode(byte) for byte in stepped)
                        self.assertEqual(
                            restored, text,
                            "%r != %r (encoding=%r)" % (restored, text, encoding))

                        stepped = "".join(c_encoder.encode(char) for char in text)
                        c_decoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        restored = u"".join(c_decoder.decode(byte) for byte in stepped)
                        self.assertEqual(
                            restored, text,
                            "%r != %r (encoding=%r)" % (restored, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # idna is left out here: FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            source = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(source)
            for attempt in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                first_line = reader.readline()
                self.assertEqual(text[:len(first_line)], first_line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            # idna and punycode are exempt from the non-string check
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encode)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)
1653
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs listed in all_string_encodings."""

    def test_basics(self):
        # Encoding must report the whole input as consumed, and decoding the
        # encoded form must give the original text back.
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text,
                             "%r != %r (encoding=%r)" % (decoded, text, encoding))
1662
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with the three kinds of mapping.

    Every test decodes the same three bytes (0, 1 and 2) through a mapping
    that is either a unicode string indexed by byte value, a dict from byte
    value to unicode string, or a dict from byte value to code point.  The
    tests expect an entry that is missing, None or U+FFFE to count as
    undefined: an error with "strict", U+FFFD with "replace" and no output
    with "ignore".
    """

    # Written as a byte-string literal so that the input type is explicit and
    # identical in every call.
    DATA = b"\x00\x01\x02"

    def check_decode(self, errors, mapping, expected, data=DATA):
        """Assert that *data* decodes to *expected* and is fully consumed.

        errors   -- error handler name passed to charmap_decode()
        mapping  -- decoding map (unicode string or dict)
        expected -- unicode result the decoder must return
        data     -- byte string to decode; defaults to DATA
        """
        self.assertEqual(
            codecs.charmap_decode(data, errors, mapping),
            (expected, len(data))
        )

    def check_strict_error(self, mapping, exception=UnicodeDecodeError):
        """Assert that strict decoding of DATA through *mapping* raises."""
        self.assertRaises(exception,
            codecs.charmap_decode, self.DATA, "strict", mapping
        )

    def check_undefined_last_byte(self, mapping):
        """Check all three error handlers when byte 2 has no usable entry."""
        self.check_strict_error(mapping)
        self.check_decode("replace", mapping, u"ab\ufffd")
        self.check_decode("ignore", mapping, u"ab")

    def test_decode_with_string_map(self):
        self.check_decode("strict", u"abc", u"abc")

        # Byte 2 is undefined: the map is too short, or it holds U+FFFE.
        for charmap in (u"ab", u"ab\ufffe"):
            self.check_undefined_last_byte(charmap)

        # With an empty map every byte is undefined and gets dropped.
        allbytes = bytes(bytearray(range(256)))
        self.check_decode("ignore", u"", u"", data=allbytes)

    def test_decode_with_int2str_map(self):
        self.check_decode("strict", {0: u'a', 1: u'b', 2: u'c'}, u"abc")
        # One byte may map to several characters ...
        self.check_decode("strict", {0: u'Aa', 1: u'Bb', 2: u'Cc'},
                          u"AaBbCc")
        # ... to a character outside the BMP ...
        self.check_decode("strict", {0: u'\U0010FFFF', 1: u'b', 2: u'c'},
                          u"\U0010FFFFbc")
        # ... or to the empty string, which contributes nothing.
        self.check_decode("strict", {0: u'a', 1: u'b', 2: u''}, u"ab")

        # Byte 2 is undefined: key missing, mapped to None, or mapped to
        # U+FFFE (Issue #14850).
        undefined_maps = (
            {0: u'a', 1: u'b'},
            {0: u'a', 1: u'b', 2: None},
            {0: u'a', 1: u'b', 2: u'\ufffe'},
        )
        for charmap in undefined_maps:
            self.check_undefined_last_byte(charmap)

        # With an empty dict every byte is undefined and gets dropped.
        allbytes = bytes(bytearray(range(256)))
        self.check_decode("ignore", {}, u"", data=allbytes)

    def test_decode_with_int2int_map(self):
        a, b, c = [ord(char) for char in u"abc"]

        self.check_decode("strict", {0: a, 1: b, 2: c}, u"abc")

        # Issue #15379: the highest code point is a valid target ...
        self.check_decode("strict", {0: 0x10FFFF, 1: b, 2: c},
                          u"\U0010FFFFbc")
        # ... while a value above it is rejected with TypeError.
        self.check_strict_error({0: 0x110000, 1: b, 2: c}, TypeError)

        # Byte 2 is undefined: key missing, or mapped to 0xFFFE.
        for charmap in ({0: a, 1: b}, {0: a, 1: b, 2: 0xFFFE}):
            self.check_undefined_last_byte(charmap)
1845
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001846
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work in a with statement."""

    # UTF-8 encoded form of U+00FC, the content of the wrapped stream in
    # both tests.
    UTF8_BYTES = "\xc3\xbc"

    def test_encodedfile(self):
        # The stream holds UTF-8 and is read through a Latin-1 data view.
        raw = StringIO.StringIO(self.UTF8_BYTES)
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            self.assertEqual(recoded.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO(self.UTF8_BYTES)
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001859
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001860
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash are emitted unchanged.
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point of the
        # same value.
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        # Characters with a dedicated two-character escape.
        named_escapes = (
            (u'\t', r'\t'),
            (u'\n', r'\n'),
            (u'\r', r'\r'),
            (u'\\', r'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # The other code points below 32, and 127..255, use \xNN.
        for code in range(32):
            if chr(code) not in '\t\n\r':
                check(unichr(code), '\\x%02x' % code)
        for code in range(127, 256):
            check(unichr(code), '\\x%02x' % code)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            ("[\\\n]", u"[]"),
            (r'[\"]', u'["]'),
            (r"[\']", u"[']"),
            (r"[\\]", u"[\\]"),
            (r"[\a]", u"[\x07]"),
            (r"[\b]", u"[\x08]"),
            (r"[\t]", u"[\x09]"),
            (r"[\n]", u"[\x0a]"),
            (r"[\v]", u"[\x0b]"),
            (r"[\f]", u"[\x0c]"),
            (r"[\r]", u"[\x0d]"),
            (r"[\7]", u"[\x07]"),
            (r"[\8]", u"[\\8]"),
            (r"[\78]", u"[\x078]"),
            (r"[\41]", u"[!]"),
            (r"[\418]", u"[!8]"),
            (r"[\101]", u"[A]"),
            (r"[\1010]", u"[A0]"),
            (r"[\x41]", u"[A]"),
            (r"[\x410]", u"[A0]"),
            (r"\u20ac", u"\u20ac"),
            (r"\U0001d120", u"\U0001d120"),
        )
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash in front of a byte that does not start an escape is
        # kept, together with that byte.
        escape_starters = '\n"\'\\abtnvfr01234567xuUN'
        for code in range(256):
            if chr(code) not in escape_starters:
                check('\\' + chr(code), u'\\' + unichr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Escapes cut short after 0 .. digits-1 hex digits.
        for letter, digits in (('x', 2), ('u', 4), ('U', 4)):
            for count in range(digits):
                truncated = "\\" + letter + "0"*count
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # A complete \U escape whose value is above 0x10FFFF.
        too_large = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), (u"", 10))
        self.assertEqual(decode(too_large, "replace"), (u"\ufffd", 10))
1937
1938
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and its decode twin."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Every code point below 256 is emitted as the byte of equal value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but 'u' or 'U' passes through.
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check(u'\\' + unichr(code), '\\' + chr(code))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but 'u' or 'U' passes through.
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Escapes cut short after 0 .. digits-1 hex digits.
        for letter, digits in (('u', 4), ('U', 4)):
            for count in range(digits):
                truncated = "\\" + letter + "0"*count
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # A complete \U escape whose value is above 0x10FFFF.
        too_large = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), (u"", 10))
        self.assertEqual(decode(too_large, "replace"), (u"\ufffd", 10))
1987
1988
class BomTest(unittest.TestCase):
    """Check BOM handling around seek() for files opened via codecs.open()."""

    def test_seek0(self):
        text = u"1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.writer.write(text[0])
                self.assertNotEqual(stream.writer.tell(), 0)
                stream.writer.seek(0)
                stream.writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.writer.write(text)
                stream.writer.seek(stream.writer.tell())
                stream.writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002044
Victor Stinner262be5e2010-05-22 02:11:07 +00002045
def test_main():
    """Run the test case classes listed below via test_support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002078
2079
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()