import codecs
import contextlib
import encodings
import io
import locale
import sys
import unittest

from test import support

# True only when running on Windows with a major version number >= 6;
# always False on every other platform.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# Size in bytes of ctypes.c_wchar, or -1 when ctypes cannot be imported
# (in which case the name ``ctypes`` is bound to None).
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Build an assertion helper for a codec function.

    *self* is the test case whose ``assertEqual`` is used and *coder* is a
    callable taking the input and returning a ``(result, length)`` tuple.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check


class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. bytes or str)
        # determines the type of everything read() returns.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* drains the whole queue; otherwise at most *size*
        items are returned (fewer if the queue holds less).
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s


class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    Meant to be mixed into a unittest.TestCase, whose assert* methods are
    used.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* may be interrupted and resumed anywhere.

        For every split point of the encoded input *s*, the first part is
        decoded, the decoder state is transferred to a brand-new decoder,
        and the combined output must equal the expected text *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* may be interrupted and resumed anywhere.

        For every split point of the text *u*, the first part is encoded,
        the encoder state is transferred to a brand-new encoder, and the
        combined output must equal the expected encoded form *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)


class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the codec-specific test cases.

    Concrete subclasses must also derive from unittest.TestCase and define
    the class attributes ``encoding`` (codec name) and, for
    test_lone_surrogates(), ``ill_formed_sequence`` (the encoded bytes of
    the lone surrogate U+DC80).
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and check the output.

        After the n-th byte has been fed, the text decoded so far must equal
        ``partialresults[n]``.  The check is done with a StreamReader, with
        an incremental decoder, with the same decoder after reset(), and
        finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line and join them with "|" so that the split
            # points are visible in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were joined and encoded below.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so the reader only
        # sees what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever encoding the empty string yields (a BOM, if any) is
        # stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)


class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"
    # Lone surrogate U+DC80 as encoded in the platform's native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])


class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"
    # Lone surrogate U+DC80 as a little-endian 32-bit code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One new character becomes available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])


class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"
    # Lone surrogate U+DC80 as a big-endian 32-bit code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One new character becomes available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-16" codec."""

    encoding = "utf-16"
    # Lone surrogate U+DC80 as encoded in the platform's native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the codecs.open() call sits inside check_warnings(), which is
        # set up here for a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""

    encoding = "utf-16-le"
    # Lone surrogate U+DC80 as a little-endian 16-bit code unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected text when decoded with "replace")
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest() reports every failing row instead of stopping at
            # the first one.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec (big-endian UTF-16, no BOM)."""

    encoding = "utf-16-be"
    # U+DC80, a lone low surrogate, in big-endian byte order.
    # NOTE(review): presumably consumed by tests inherited from ReadTest.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # NOTE(review): check_partial() is inherited.  The 12 expected
        # entries line up with the 12 encoded bytes of the input, so it
        # presumably feeds the data one byte at a time -- confirm against
        # ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected text when decoding with
        # errors='replace').  The same input must raise in strict mode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # One subTest per input: a failure names the offending bytes
            # and the remaining inputs are still checked.
            with self.subTest(raw=raw, expected=expected):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 lies outside the BMP, so it is stored as the surrogate
        # pair D800 DE03, each unit in big-endian byte order.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
756
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec.

    BOM is the byte prefix expected in front of encoder output; it is
    empty here.
    """

    encoding = "utf-8"
    # The three-byte UTF-8 form of the lone surrogate U+DC80, and the text
    # expected for it under errors='replace' (one U+FFFD per byte).
    # NOTE(review): presumably consumed by tests inherited from ReadTest.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # NOTE(review): check_partial() is inherited.  The 15 expected
        # entries line up with the 15 encoded bytes of the input, so it
        # presumably feeds the data one byte at a time -- confirm against
        # ReadTest.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_decoder_state(self):
        # Sample text covering the boundaries between 1-, 2-, 3- and
        # 4-byte UTF-8 sequences.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # (input bytes, error handler name, expected decoded text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # U+D800 and U+DFFF cannot be produced by surrogateescape, so the
        # encoder must fail and report exactly that span.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = caught.exception
        self.assertEqual(error.object[error.start:error.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        codec = self.encoding

        # Encoding: surrogates are written out as three-byte sequences.
        encode_cases = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        )
        for text, payload in encode_cases:
            self.assertEqual(text.encode(codec, "surrogatepass"),
                             self.BOM + payload)

        # Decoding: the same byte sequences are accepted again.
        decode_cases = (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        )
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(codec, "surrogatepass"), text)

        self.assertTrue(codecs.lookup_error("surrogatepass"))

        # Truncated or interrupted surrogate sequences are still errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(codec, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000832
Victor Stinnerf96418d2015-09-21 23:06:27 +0200833
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec.

    Several expectations depend on VISTA_OR_LATER, because the handling of
    lone surrogates differs between Windows versions.
    """

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means "must raise".
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            tests += [('\udc80', 'strict', b'\xed\xb2\x80')]

        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                # Report an unexpected encoding error as a test failure
                # that names the input, not as a bare exception.
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            message = ('%a.encode("cp65001", %r)=%a != %a'
                       % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, message)

    def test_decode(self):
        # (raw bytes, error handler, expected text); None means "must raise".
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ]

        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                # Report an unexpected decoding error as a test failure
                # that names the input, not as a bare exception.
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            message = ('%a.decode("cp65001", %r)=%a != %a'
                       % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, message)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Without an error handler lone surrogates are rejected both ways.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # (error handler, expected bytes when encoding "[\uDC80]")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text, its cp65001 bytes under 'surrogatepass'); every pair is
        # checked in both directions.
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, payload in pairs:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), payload)
            self.assertEqual(payload.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
935
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200936
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        codec = self.encoding

        def check_direct(chars):
            # Characters that UTF-7 represents as themselves: the encoded
            # form equals the ASCII form, in both directions.
            self.assertEqual(chars.encode(codec), chars.encode('ascii'))
            self.assertEqual(chars.encode('ascii').decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_checked))
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # NOTE(review): check_partial() is inherited.  The expected list
        # has 32 entries, presumably one per byte of the encoded input --
        # confirm against ReadTest.  Repeated entries are written as runs.
        expected_steps = (
            ['a'] * 2
            + ['a+']
            + ['a+-']
            + ['a+-b'] * 5
            + ['a+-b\x00']
            + ['a+-b\x00c'] * 5
            + ['a+-b\x00c\x80']
            + ['a+-b\x00c\x80d'] * 5
            + ['a+-b\x00c\x80d\u0100']
            + ['a+-b\x00c\x80d\u0100e'] * 8
            + ['a+-b\x00c\x80d\u0100e\U00010000']
            + ['a+-b\x00c\x80d\u0100e\U00010000f']
        )
        self.check_partial('a+-b\x00c\x80d\u0100e\U00010000f', expected_steps)

    def test_errors(self):
        # (malformed input, expected text when decoding with 'replace')
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                # Strict decoding must reject the input outright ...
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                # ... while 'replace' substitutes U+FFFD for the bad part.
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        codec = self.encoding
        # An explicit surrogate pair encodes like the character it denotes.
        self.assertEqual('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        # (text, encoded form including the closing '-')
        samples = (
            ('\U000104A0', b'+2AHcoA-'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        )
        for text, encoded in samples:
            self.assertEqual(text.encode(codec), encoded)
            self.assertEqual(encoded.decode(codec), text)
            # The same bytes without the closing '-' decode identically.
            self.assertEqual(encoded[:-1].decode(codec), text)

    def test_lone_surrogates(self):
        # (input, expected text when decoding with 'replace')
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001068
1069
class UTF16ExTest(unittest.TestCase):
    """Error-path checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A lone byte cannot be decoded when final=True in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1077
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is returned as the equivalent bytes
        # object together with the number of bytes read.
        source = array.array("b", b"spam")
        outcome = codecs.readbuffer_encode(source)
        self.assertEqual(outcome, (b"spam", 4))

    def test_empty(self):
        # An empty str is accepted and gives an empty result.
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # A missing argument and an int argument are both rejected.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1093
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with a leading BOM).

    Inherits the UTF8Test cases; BOM is the prefix expected in front of
    encoder output.
    """

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # The text starts with an explicit U+FEFF, so the encoded data
        # holds two BOM sequences: the signature and the character itself.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the signature written by the
        # encoder and return only the text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader in chunks of
        # several sizes (None means "read everything at once") and check
        # that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One subTest per chunk size so a failure names the size and
            # the other sizes are still exercised.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the signature: it must not show up in
        # the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1178
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # Empty bytes and an empty bytearray both give an empty result.
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        # Any byte other than a backslash is copied through unchanged.
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash followed by any byte that does not start a known
        # escape is left untouched.
        escape_starters = b'\n"\'\\abtnvfr01234567x'
        for value in range(256):
            if value in escape_starters:
                continue
            sequence = b'\\' + bytes([value])
            check(sequence, sequence)

    def test_errors(self):
        decode = codecs.escape_decode
        # A "\x" escape with no hex digits, then with only one hex digit:
        # strict mode raises, 'ignore' drops it, 'replace' emits "?".
        for digits in (b"", b"0"):
            bad = b"\\x" + digits
            bracketed = b"[" + bad + b"]"
            combined = bracketed + bad
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(combined, "ignore"),
                             (b"[]", len(combined)))
            self.assertEqual(decode(combined, "replace"),
                             (b"[?]?", len(combined)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001231
Victor Stinnerf96418d2015-09-21 23:06:27 +02001232
class RecodingTest(unittest.TestCase):
    """Regression test for writing through and closing a codecs.EncodedFile."""

    def test_recoding(self):
        # Wrap an in-memory byte stream, write one character through the
        # recoding wrapper, then close the wrapper.
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also have closed the wrapped stream.
        self.assertTrue(f.closed)
1243
Martin v. Löwis2548c732003-04-18 10:39:54 +00001244# From RFC 3492
1245punycode_testcases = [
1246 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001247 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1248 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001249 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001250 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001251 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001252 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001253 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001254 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001255 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001257 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1258 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1259 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001260 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001261 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001262 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1263 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1264 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001265 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001266 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001267 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001268 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1269 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1270 "\u0939\u0948\u0902",
1271 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001272
1273 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001274 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001275 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1276 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001277
1278 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001279 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1280 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1281 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001282 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1283 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001284
1285 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001286 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1287 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1288 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1289 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001290 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001291
1292 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001293 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1294 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1295 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1296 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1297 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001298 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001299
1300 # (K) Vietnamese:
1301 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1302 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001303 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1304 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1305 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1306 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001307 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001308
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001310 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001311 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001312
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1315 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1316 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001317 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001318
1319 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001320 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1321 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1322 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001323 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001324
1325 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001326 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001327 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328
1329 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001330 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1331 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001332 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001333
1334 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001335 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001336 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001337
1338 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001339 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001340 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341
1342 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001343 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1344 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001345 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346 ]
1347
# Import-time sanity check of the table above: every entry must be a
# (unicode text, punycode bytes) pair.  Anything with a different length
# (for example after a missing comma) is printed so it can be spotted.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001351
Victor Stinnerf96418d2015-09-21 23:06:27 +02001352
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353class PunycodeTest(unittest.TestCase):
1354 def test_encode(self):
1355 for uni, puny in punycode_testcases:
1356 # Need to convert both strings to lower case, since
1357 # some of the extended encodings use upper case, but our
1358 # code produces only lower case. Converting just puny to
1359 # lower is also insufficient, since some of the input characters
1360 # are upper case.
Ezio Melottib3aedd42010-11-20 19:04:17 +00001361 self.assertEqual(
Walter Dörwalda4c61282007-05-10 12:36:25 +00001362 str(uni.encode("punycode"), "ascii").lower(),
1363 str(puny, "ascii").lower()
1364 )
Martin v. Löwis2548c732003-04-18 10:39:54 +00001365
1366 def test_decode(self):
1367 for uni, puny in punycode_testcases:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001368 self.assertEqual(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +00001369 puny = puny.decode("ascii").encode("ascii")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001370 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001371
Victor Stinnerf96418d2015-09-21 23:06:27 +02001372
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated ``unicode_internal`` codec.

    Most tests only apply when ``wchar_t`` is 32 bits wide and are skipped
    otherwise.  Calls into the codec run under ``support.check_warnings()``
    because using the codec emits a DeprecationWarning.
    """

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # All byte strings below are written big-endian and reversed on
        # little-endian hosts before decoding.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        # Values above 0x10ffff, plus inputs whose length is not a
        # multiple of four bytes.
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the valid range; check how the
        # different error handlers treat it.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
                             invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The first four bytes decode fine; the second four-byte unit is
        # invalid, so the reported error range must be bytes 4..8.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises() reports "UnicodeDecodeError not raised" if decoding
        # unexpectedly succeeds, instead of a message-less self.fail().
        with self.assertRaises(UnicodeDecodeError) as cm:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a custom handler name and
        # decode "ab" with four 0x22 bytes spliced in between the two
        # characters; the handler must drop them while all 12 input bytes
        # are reported as consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item returned by an encoder is the number of input
            # items consumed, not the size of the encoded output.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001453
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected), both given as UTF-8 encoded bytes.
# NameprepTest.test_nameprep consumes the table as follows:
#   * expected is None: nameprep() must raise UnicodeError for the input
#     (it contains prohibited characters);
#   * (None, None): the vector is skipped; the placeholder keeps the
#     position in the list equal to the vector number minus one, which is
#     what the test uses to label a vector as "3.<position + 1>".
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1606
1607
1608class NameprepTest(unittest.TestCase):
1609 def test_nameprep(self):
1610 from encodings.idna import nameprep
1611 for pos, (orig, prepped) in enumerate(nameprep_tests):
1612 if orig is None:
1613 # Skipped
1614 continue
1615 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001616 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001617 if prepped is None:
1618 # Input contains prohibited characters
1619 self.assertRaises(UnicodeError, nameprep, orig)
1620 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001621 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001622 try:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001623 self.assertEqual(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001624 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001625 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001626
Victor Stinnerf96418d2015-09-21 23:06:27 +02001627
class IDNACodecTest(unittest.TestCase):
    """Tests for the ``idna`` codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once everything has been consumed, a further read() must return
        # an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The same four names as in
        # test_builtin_decode are used: ASCII and non-ASCII labels, each
        # with and without a trailing dot.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                chunks = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        # A label is only emitted once its terminating dot has been seen;
        # the last label is flushed by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a
        # time.  The same four names as in test_builtin_encode are used.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # As for decoding, output is produced label by label and the last
        # label is only flushed by the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                self.assertRaises(Exception,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(Exception,
                                  b"python.org".decode, "idna", errors)
1713
Victor Stinnerf96418d2015-09-21 23:06:27 +02001714
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001715class CodecsModuleTest(unittest.TestCase):
1716
1717 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001718 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1719 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001720 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001721 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001722 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001723
Victor Stinnera57dfd02014-05-14 17:13:14 +02001724 # test keywords
1725 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1726 '\xe4\xf6\xfc')
1727 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1728 '[]')
1729
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001730 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001731 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1732 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001733 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001734 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001735 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001736 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001737
Victor Stinnera57dfd02014-05-14 17:13:14 +02001738 # test keywords
1739 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1740 b'\xe4\xf6\xfc')
1741 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1742 b'[]')
1743
Walter Dörwald063e1e82004-10-28 13:04:26 +00001744 def test_register(self):
1745 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001746 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001747
1748 def test_lookup(self):
1749 self.assertRaises(TypeError, codecs.lookup)
1750 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001751 self.assertRaises(LookupError, codecs.lookup, " ")
1752
1753 def test_getencoder(self):
1754 self.assertRaises(TypeError, codecs.getencoder)
1755 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1756
1757 def test_getdecoder(self):
1758 self.assertRaises(TypeError, codecs.getdecoder)
1759 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1760
1761 def test_getreader(self):
1762 self.assertRaises(TypeError, codecs.getreader)
1763 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1764
1765 def test_getwriter(self):
1766 self.assertRaises(TypeError, codecs.getwriter)
1767 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001768
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001769 def test_lookup_issue1813(self):
1770 # Issue #1813: under Turkish locales, lookup of some codecs failed
1771 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001772 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001773 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1774 try:
1775 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1776 except locale.Error:
1777 # Unsupported locale on this system
1778 self.skipTest('test needs Turkish locale')
1779 c = codecs.lookup('ASCII')
1780 self.assertEqual(c.name, 'ascii')
1781
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001782 def test_all(self):
1783 api = (
1784 "encode", "decode",
1785 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1786 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1787 "getencoder", "getdecoder", "getincrementalencoder",
1788 "getincrementaldecoder", "getreader", "getwriter",
1789 "register_error", "lookup_error",
1790 "strict_errors", "replace_errors", "ignore_errors",
1791 "xmlcharrefreplace_errors", "backslashreplace_errors",
1792 "namereplace_errors",
1793 "open", "EncodedFile",
1794 "iterencode", "iterdecode",
1795 "BOM", "BOM_BE", "BOM_LE",
1796 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1797 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1798 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1799 "StreamReaderWriter", "StreamRecoder",
1800 )
1801 self.assertCountEqual(api, codecs.__all__)
1802 for api in codecs.__all__:
1803 getattr(codecs, api)
1804
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001805 def test_open(self):
1806 self.addCleanup(support.unlink, support.TESTFN)
1807 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1808 with self.subTest(mode), \
1809 codecs.open(support.TESTFN, mode, 'ascii') as file:
1810 self.assertIsInstance(file, codecs.StreamReaderWriter)
1811
1812 def test_undefined(self):
1813 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1814 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1815 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1816 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1817 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1818 self.assertRaises(UnicodeError,
1819 codecs.encode, 'abc', 'undefined', errors)
1820 self.assertRaises(UnicodeError,
1821 codecs.decode, b'abc', 'undefined', errors)
1822
Victor Stinnerf96418d2015-09-21 23:06:27 +02001823
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # Two Hangul syllables encoded as UTF-8, separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        # The line terminator is kept; the last line has none.
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001833
Victor Stinnerf96418d2015-09-21 23:06:27 +02001834
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the wrapped file holds UTF-8, read() returns UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        reading = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(reading.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data passed to write() reaches the wrapped file
        # as Latin-1.
        sink = io.BytesIO()
        writing = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        writing.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001846
# Names of the text encodings that the generic tests iterate over (for
# example BasicUnicodeTest.test_basics, which round-trips a short ASCII
# string through every codec listed here).
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of codecs exposes mbcs_encode
# (presumably Windows-only -- confirm against the codecs documentation).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (tests skip their stream reader/writer checks for these names).
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal"
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001964
Victor Stinnerf96418d2015-09-21 23:06:27 +02001965
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run over every name in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, modulo
            # "_" vs "-" and the two special cases handled here.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "encoding=%r" % encoding)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the reader one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The assertion itself is deliberately trivial.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
Victor Stinnerf96418d2015-09-21 23:06:27 +02002128
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with str, int->str and int->int maps."""

    # Input shared by the fixed-size cases: three bytes, one per map slot.
    DATA = b"\x00\x01\x02"

    # (error handler, expected text) when byte 0x02 cannot be mapped.
    UNMAPPED_RESULTS = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _check_unmapped(self, mappings):
        # Each mapping decodes 0x00 and 0x01 to "a" and "b" but has no usable
        # entry for 0x02: "strict" must raise, the other handlers substitute.
        decode = codecs.charmap_decode
        for mapping in mappings:
            with self.subTest(errors="strict", mapping=mapping):
                self.assertRaises(UnicodeDecodeError,
                                  decode, self.DATA, "strict", mapping)
            for errors, expected in self.UNMAPPED_RESULTS:
                with self.subTest(errors=errors, mapping=mapping):
                    self.assertEqual(decode(self.DATA, errors, mapping),
                                     (expected, 3))

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        self.assertEqual(decode(self.DATA, "strict", "abc"), ("abc", 3))
        self.assertEqual(decode(self.DATA, "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # Map too short, or U+FFFE in the slot for 0x02.
        self._check_unmapped(["ab", "ab\ufffe"])

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""), ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        self.assertEqual(
            decode(self.DATA, "strict", {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3))
        self.assertEqual(
            decode(self.DATA, "strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3))
        self.assertEqual(
            decode(self.DATA, "strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3))
        self.assertEqual(
            decode(self.DATA, "strict", {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3))

        self._check_unmapped([
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},  # Issue #14850
        ])

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}), ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(decode(self.DATA, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(
            decode(self.DATA, "strict", {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(self.DATA, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # A code point past sys.maxunicode is rejected with TypeError.
        self.assertRaises(TypeError,
                          decode, self.DATA, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        self._check_unmapped([
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ])
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002364
class WithStmtTest(unittest.TestCase):
    """Check that codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the block must close the wrapped stream.
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same close-on-exit check as test_encodedfile.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002378
Victor Stinnerf96418d2015-09-21 23:06:27 +02002379
class TypesTest(unittest.TestCase):
    """Check which low-level decoders accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002419
Serhiy Storchakad6793772013-01-29 10:20:44 +02002420
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126, except the backslash, encode to themselves.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by any other byte is passed through unchanged.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # c is the escape letter; i counts the hex digits supplied, always
        # fewer than the escape needs.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2497
2498
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / raw_unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the single matching byte.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # Backslash + anything except "u"/"U" round-trips byte for byte.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # c is the escape letter; i counts the hex digits supplied, always
        # fewer than the escape needs.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2547
2548
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2581
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002582
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open() around seeks."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a
                # position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a
                # seek() at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002638
Victor Stinner3fed0872010-05-22 02:16:27 +00002639
# Bytes-to-bytes codecs that are always present.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Alias names listed per transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# The compression codecs are registered only when their module imports.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002669
Victor Stinnerf96418d2015-09-21 23:06:27 +02002670
class TransformCodecTest(unittest.TestCase):
    """Tests for the non-text transform codecs.

    The bytes-to-bytes codecs under test are those listed in
    bytes_transform_encodings; the aliases checked by test_aliases come
    from transform_aliases.
    """

    def test_basics(self):
        # Round-trip every possible byte value through each codec
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # A StreamReader's read() must decode back to the original byte
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # Same as test_read, but going through readline()
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Every fragment is a raw string: the pattern contains the
                # regex escapes \( and \), which are not valid str escapes
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper must chain an exception of its own type as the cause
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2802
Nick Coghlan8b097b42013-11-13 23:49:21 +10002803
# The codec machinery tries to wrap exceptions raised by a codec so that
# the error names the operation being performed and the codec involved.
# For now that wrapping is *only* wanted for relatively stateless
# exceptions, i.e. those whose only significant information is their type
# and a single str argument.

# The test codecs live in a local registry, which avoids appearing to leak
# objects when registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Search function: return the entry for codec_name, or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Do nothing; stand-in for when _codecs lacks _forget_codec."""
2824
2825
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised by a codec get wrapped.

    Each test installs a throwaway codec under self.codec_name (through
    the module-level _TEST_CODECS registry) whose encode and decode
    functions raise a chosen object.  The tests then check whether the
    resulting error was wrapped with the operation and codec name, or
    propagated unchanged.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # The codec was never looked up, so there is nothing to forget
            pass

    def set_codec(self, encode, decode):
        # Publish a codec built from the given callables under the name
        # reserved for this test
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager asserting that the body raises exc_type with the
        # wrapped message format, chained to an exc_type cause that still
        # carries its traceback.  msg is interpolated into a regex as-is.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Install a codec raising obj_to_raise and check that all four
        # entry points (str/bytes methods and codecs functions) wrap it
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Install a codec raising obj_to_raise and check that all four
        # entry points let the RuntimeError through with message msg
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # (every fragment is raw: \( and \) are regex escapes, not valid
        # str escapes)
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2985
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002986
Georg Brandl02524622010-12-02 18:06:51 +00002987
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Exercise codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 is rejected with ValueError, 123 with OSError
        for bad_cp, exc_type in ((-1, ValueError), (123, OSError)):
            self.assertRaises(exc_type, codecs.code_page_encode, bad_cp, 'a')
            self.assertRaises(exc_type, codecs.code_page_decode, bad_cp, b'a')

    def test_code_page_name(self):
        # The error message has to mention the code page by name
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests holds (raw, errors, expected) triples; an expected value of
        # None means that decoding must raise UnicodeDecodeError
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # tests holds (text, errors, expected) triples; an expected value
        # of None means that encoding must raise UnicodeEncodeError
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = result[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Non-final decoding: each case pairs the input with the expected
        # (text, bytes consumed) result
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
3146
3147
class ASCIITest(unittest.TestCase):
    """Encoding and decoding with the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3185
3186
class Latin1Test(unittest.TestCase):
    """Encoding and decoding with the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                encoded = text.encode('latin1')
                self.assertEqual(encoded, wanted)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('latin1', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                decoded = raw.decode('latin1')
                self.assertEqual(decoded, wanted)
3222
3223
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()