blob: 45a19876113f281f7371404066af1e0bb6a36a40 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major OS version of 6 or higher
# (Vista and later); always False on every other platform.  The `and`
# short-circuits, so sys.getwindowsversion() is only called on win32, where
# it exists.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
16
# ctypes is an optional module: when it cannot be imported, `ctypes` is bound
# to None and SIZEOF_WCHAR_T to the sentinel -1 so that code in this file can
# test for either value instead of re-attempting the import.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    # Size in bytes of the C wchar_t type as reported by ctypes.
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build an assertion helper for a stateless encode/decode function.

    *self* is the test case whose assertEqual() is used; *coder* is a
    callable taking the input and returning an ``(output, consumed)`` pair.
    The returned ``check(input, expect)`` asserts that *coder* produced
    *expect* and reported the whole input as consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
Victor Stinnerf96418d2015-09-21 23:06:27 +020030
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer's type: every later
        # write() is concatenated onto it and every read() slices it.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.  Reading
        from an empty queue returns an empty sequence of the buffer's type.
        """
        if size < 0:
            size = len(self._buffer)
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
50
Victor Stinnerf96418d2015-09-21 23:06:27 +020051
class MixInCheckStateHandling:
    """Mixin checking getstate()/setstate() of incremental codecs.

    The host class must provide the unittest assertion methods used below
    (assertEqual, assertFalse, assertIsInstance).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that a decoder's state can be transplanted at any offset.

        For every split point of the encoded input *s*, the first part is
        decoded, the decoder state is moved to a brand new decoder, and the
        remainder is decoded there; the concatenation must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that an encoder's state can be transplanted at any offset.

        For every split point of the text *u*, the first part is encoded,
        the encoder state is moved to a brand new encoder, and the remainder
        is encoded there; the concatenation must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
84
Victor Stinnerf96418d2015-09-21 23:06:27 +020085
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the concrete codec test classes.

    Subclasses must set ``encoding`` and, for test_lone_surrogates(),
    ``ill_formed_sequence`` (the bytes that encode the lone surrogate
    U+DC80 in that encoding).
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and check the output.

        *partialresults* lists the text expected to have been decoded after
        each successive byte.  The same check is run through a StreamReader,
        an incremental decoder, the same decoder after reset(), and finally
        codecs.iterdecode().
        """
        # Encode once and pre-split into single-byte chunks; every pass
        # below replays exactly the same byte sequence.
        encoded = input.encode(self.encoding)
        chunks = [bytes([c]) for c in encoded]

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for chunk, partialresult in zip(chunks, partialresults):
            q.write(chunk)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_decoder(decoder):
            result = ""
            for chunk, partialresult in zip(chunks, partialresults):
                result += decoder.decode(chunk)
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_decoder(d)

        # Check whether the reset method works properly
        d.reset()
        feed_decoder(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        count = 0
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])
            count = i + 1
        # Without this, a reader that stops early (or yields nothing at
        # all) would make the loop above pass vacuously.
        self.assertEqual(count, len(s))

    def test_readlinequeue(self):
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of the ill-formed sequence when decoding with
    # the "replace" error handler in test_lone_surrogates().
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding the empty string yields whatever prefix the codec always
        # emits; it is stripped from the individually encoded pieces below
        # and prepended once to the assembled test sequence.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200394
Victor Stinnerf96418d2015-09-21 23:06:27 +0200395
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM handling for the "utf-32" codec."""
    encoding = "utf-32"
    # The lone surrogate U+DC80 as a 4-byte unit, laid out in the byte
    # order of the machine running the tests.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
490
Victor Stinnerf96418d2015-09-21 23:06:27 +0200491
class UTF32LETest(ReadTest, unittest.TestCase):
    """ReadTest checks for the BOM-less little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"
    # The lone surrogate U+DC80 as a little-endian 4-byte unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # No BOM: a character appears as soon as its fourth byte is fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
536
Victor Stinnerf96418d2015-09-21 23:06:27 +0200537
class UTF32BETest(ReadTest, unittest.TestCase):
    """ReadTest checks for the BOM-less big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"
    # The lone surrogate U+DC80 as a big-endian 4-byte unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # No BOM: a character appears as soon as its fourth byte is fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
582
583
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM handling for the "utf-16" codec."""
    encoding = "utf-16"
    # The lone surrogate U+DC80 as a 2-byte unit, laid out in the byte
    # order of the machine running the tests.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the open() call is placed under check_warnings(), which is
        # told to expect a DeprecationWarning; the read itself happens
        # outside of it.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000669
class UTF16LETest(ReadTest, unittest.TestCase):
    """ReadTest checks for the BOM-less little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"
    # The lone surrogate U+DC80 as a little-endian 2-byte unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected text when decoded with 'replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest makes a failure report which input triggered it and
            # lets the remaining inputs still run.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
713
class UTF16BETest(ReadTest, unittest.TestCase):
    # Big-endian UTF-16 codec tests.
    encoding = "utf-16-be"
    # A lone low surrogate (U+DC80) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected result per input byte fed to the decoder; a
        # character only shows up once its last byte has arrived.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one bad row from hiding the remaining rows.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # A non-BMP character must round-trip through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
757
class UTF8Test(ReadTest, unittest.TestCase):
    # UTF-8 codec tests.
    encoding = "utf-8"
    # UTF-8-style encoding of the lone surrogate U+DC80 and the text the
    # 'replace' handler is expected to produce for it.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoded output; empty for this codec.
    BOM = b''

    def test_partial(self):
        # One expected result per input byte fed to the decoder; a
        # character only shows up once its last byte has arrived.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text mixing characters of every encoded length.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Two invalid bytes between brackets, decoded with each handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # surrogateescape handles U+DC80 but not the two surrogates that
        # follow; the error must point at exactly that remaining span.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates are written out as three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same byte sequences yield the surrogates back.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted sequences must still be rejected.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000833
Victor Stinnerf96418d2015-09-21 23:06:27 +0200834
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the Windows code page 65001 codec.
    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); None as the
        # expected value means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest keeps one bad row from hiding the remaining rows.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); None as the
        # expected value means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            # subTest keeps one bad row from hiding the remaining rows.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Each error handler has its own rendering of U+DC80.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # surrogatepass must round-trip surrogates in both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
936
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200937
class UTF7Test(ReadTest, unittest.TestCase):
    # UTF-7 codec tests.
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters: everything below 0x80 not covered above,
        # sorted so the expected byte string is deterministic.
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and the equivalent surrogate pair must
        # encode identically; decoding accepts the sequence with or
        # without the terminating '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected result with the 'replace' handler)
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001069
1070
class UTF16ExTest(unittest.TestCase):
    # Tests for the low-level codecs.utf_16_ex_decode() function.

    def test_errors(self):
        # A lone byte in final input must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the required data argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1078
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        # Any object exposing the buffer interface is accepted; the result
        # is the raw bytes plus the number of bytes consumed.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str yields empty bytes and a consumed length of 0.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument and a non-buffer argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1094
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Runs the UTF-8 tests against the "utf-8-sig" codec, whose encoded
    # output is expected to start with the UTF-8 BOM.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the BOM added by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader using a range
        # of read sizes (None means one unbounded read() call per loop
        # iteration) and check that the concatenated chunks equal
        # unistring each time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty result marks the end of the stream.
                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not reach the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1179
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode().

    def test_empty(self):
        # Empty input of either bytes-like type decodes to empty bytes.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte other than a backslash passes through unchanged.
        decode = codecs.escape_decode
        for i in range(256):
            b = bytes([i])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is a line continuation and vanishes.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes take up to three digits; '8' is not an octal digit.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hex escapes take exactly two digits; an upper-case X is literal.
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # A backslash before any other byte is kept as-is.
        for i in range(256):
            if i not in b'\n"\'\\abtnvfr01234567x':
                b = bytes([i])
                check(b'\\' + b, b'\\' + b)

    def test_errors(self):
        # Truncated \x escapes: strict raises, 'ignore' drops them and
        # 'replace' substitutes b'?'; the consumed length covers the
        # whole input either way.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001232
Victor Stinnerf96418d2015-09-21 23:06:27 +02001233
class RecodingTest(unittest.TestCase):
    # Regression test for codecs.EncodedFile().

    def test_recoding(self):
        # The scenario depends on the "unicode_internal" codec.  Report a
        # skip instead of a LookupError on interpreters that lack it.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest('unicode_internal codec is not available')

        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must close the underlying stream as well.
        self.assertTrue(f.closed)
1244
Martin v. Löwis2548c732003-04-18 10:39:54 +00001245# From RFC 3492
1246punycode_testcases = [
1247 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001248 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1249 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001250 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001252 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001253 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001255 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001256 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001258 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1259 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1260 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001261 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001262 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001263 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1264 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1265 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001266 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001267 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001268 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001269 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1270 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1271 "\u0939\u0948\u0902",
1272 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273
1274 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001275 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001276 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1277 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001278
1279 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001280 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1281 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1282 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001283 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1284 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001285
1286 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001287 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1288 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1289 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1290 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001291 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292
1293 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001294 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1295 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1296 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1297 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1298 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001299 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001300
1301 # (K) Vietnamese:
1302 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1303 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001304 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1305 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1306 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1307 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001308 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001311 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001312 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001313
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001315 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1316 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1317 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001318 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001319
1320 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001321 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1322 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1323 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001324 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325
1326 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001327 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001328 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001329
1330 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001331 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1332 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001333 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334
1335 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001336 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001337 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001338
1339 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001340 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001341 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001342
1343 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001344 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1345 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001346 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001347 ]
1348
# Import-time sanity check of the table above: each entry is unpacked as a
# (unicode, punycode) pair by PunycodeTest, so print any entry that is not
# exactly two items long.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352
Victor Stinnerf96418d2015-09-21 23:06:27 +02001353
class PunycodeTest(unittest.TestCase):
    """Encode and decode every pair in punycode_testcases."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Compare case-insensitively on both sides: some of the
            # expected encodings use upper case while the codec emits only
            # lower case, and lowering just the expected value is not
            # enough because some input characters are upper case too.
            produced = text.encode("punycode").decode("ascii")
            wanted = expected.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more after an ASCII str round trip of the input.
            roundtripped = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, roundtripped.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001372
Victor Stinnerf96418d2015-09-21 23:06:27 +02001373
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated ``unicode_internal`` codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"

        def native(data):
            # The samples below are written big-endian; reverse them on
            # little-endian hosts.
            return bytes(reversed(data)) if little_endian else data

        accepted = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in accepted:
            raw = native(raw)
            with support.check_warnings():
                self.assertEqual(expected, raw.decode("unicode_internal"))
        for raw in rejected:
            raw = native(raw)
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")

        # One more undecodable unit, checked against several error handlers.
        if little_endian:
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            escaped = invalid.decode("unicode_internal", "backslashreplace")
            self.assertEqual(escaped, invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # The decode must raise; the attributes of the error are then
        # inspected below.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                payload.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the stock "ignore" handler under a private name and use
        # it to skip four junk bytes spliced between the two characters.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            spliced = ab[:4] + "\x22" * 4 + ab[4:]
            ignored = decoder(spliced.encode("ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001454
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes.  An expected
# value of None means nameprep must reject the input; (None, None) marks a
# vector that is skipped.  The position in the list gives the vector
# number (first entry is 3.1), so skipped vectors must keep their slot.
nameprep_tests = [
    # 3.1 Map to nothing.
    (
        b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
        b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
        b'\xb8\x8f\xef\xbb\xbf',
        b'foobarbaz',
    ),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (
        b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
        b'\xaa\xce\xb0\xe2\x80\x80',
        b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 ',
    ),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (
        b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
        b'\x80',
        b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
        b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
        b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88',
    ),
]
1607
1608
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        # Every vector runs in its own subTest labelled with its number
        # ("3.<n>", where n is the 1-based position in nameprep_tests).
        # That way a failure names the vector, keeps its real exception
        # type and traceback, and does not stop the remaining vectors
        # from being checked.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            with self.subTest(test="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8; surrogatepass
                # lets the encoded lone-surrogate vector through.
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001627
Victor Stinnerf96418d2015-09-21 23:06:27 +02001628
class IDNACodecTest(unittest.TestCase):
    """Tests for the ``idna`` codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, a further read() must
        # return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed iterdecode() one byte at a time.  The four inputs mirror
        # test_builtin_decode: ASCII and non-ASCII names, each with and
        # without a trailing dot.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                pieces = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(pieces, "idna")),
                    expected
                )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is held back until final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot nothing is left over for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() walks the string character by character.  The four
        # inputs mirror test_builtin_encode.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        # The last label is held back until final=True.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot nothing is left over for the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1714
Victor Stinnerf96418d2015-09-21 23:06:27 +02001715
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001716class CodecsModuleTest(unittest.TestCase):
1717
1718 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001719 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1720 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001721 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001722 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001723 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001724
Victor Stinnera57dfd02014-05-14 17:13:14 +02001725 # test keywords
1726 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1727 '\xe4\xf6\xfc')
1728 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1729 '[]')
1730
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001731 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001732 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1733 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001734 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001735 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001736 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001737 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001738
Victor Stinnera57dfd02014-05-14 17:13:14 +02001739 # test keywords
1740 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1741 b'\xe4\xf6\xfc')
1742 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1743 b'[]')
1744
Walter Dörwald063e1e82004-10-28 13:04:26 +00001745 def test_register(self):
1746 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001747 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001748
1749 def test_lookup(self):
1750 self.assertRaises(TypeError, codecs.lookup)
1751 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001752 self.assertRaises(LookupError, codecs.lookup, " ")
1753
1754 def test_getencoder(self):
1755 self.assertRaises(TypeError, codecs.getencoder)
1756 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1757
1758 def test_getdecoder(self):
1759 self.assertRaises(TypeError, codecs.getdecoder)
1760 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1761
1762 def test_getreader(self):
1763 self.assertRaises(TypeError, codecs.getreader)
1764 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1765
1766 def test_getwriter(self):
1767 self.assertRaises(TypeError, codecs.getwriter)
1768 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001769
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001770 def test_lookup_issue1813(self):
1771 # Issue #1813: under Turkish locales, lookup of some codecs failed
1772 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001773 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001774 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1775 try:
1776 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1777 except locale.Error:
1778 # Unsupported locale on this system
1779 self.skipTest('test needs Turkish locale')
1780 c = codecs.lookup('ASCII')
1781 self.assertEqual(c.name, 'ascii')
1782
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001783 def test_all(self):
1784 api = (
1785 "encode", "decode",
1786 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1787 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1788 "getencoder", "getdecoder", "getincrementalencoder",
1789 "getincrementaldecoder", "getreader", "getwriter",
1790 "register_error", "lookup_error",
1791 "strict_errors", "replace_errors", "ignore_errors",
1792 "xmlcharrefreplace_errors", "backslashreplace_errors",
1793 "namereplace_errors",
1794 "open", "EncodedFile",
1795 "iterencode", "iterdecode",
1796 "BOM", "BOM_BE", "BOM_LE",
1797 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1798 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1799 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1800 "StreamReaderWriter", "StreamRecoder",
1801 )
1802 self.assertCountEqual(api, codecs.__all__)
1803 for api in codecs.__all__:
1804 getattr(codecs, api)
1805
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001806 def test_open(self):
1807 self.addCleanup(support.unlink, support.TESTFN)
1808 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1809 with self.subTest(mode), \
1810 codecs.open(support.TESTFN, mode, 'ascii') as file:
1811 self.assertIsInstance(file, codecs.StreamReaderWriter)
1812
1813 def test_undefined(self):
1814 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1815 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1816 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1817 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1818 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1819 self.assertRaises(UnicodeError,
1820 codecs.encode, 'abc', 'undefined', errors)
1821 self.assertRaises(UnicodeError,
1822 codecs.decode, b'abc', 'undefined', errors)
1823
Victor Stinnerf96418d2015-09-21 23:06:27 +02001824
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 for '\ud55c', a newline, then '\uae00' with no final newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001834
Victor Stinnerf96418d2015-09-21 23:06:27 +02001835
class EncodedFileTest(unittest.TestCase):
    """Recoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller gets UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001847
# Encodings iterated over by BasicUnicodeTest, one line per family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u", "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1955
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001956# The following encoding is not tested, because it's not supposed
1957# to work:
1958# "undefined"
1959
# Encodings excluded from the stream reader/writer (stateful) checks
# because they don't work in stateful mode.
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001965
Victor Stinnerf96418d2015-09-21 23:06:27 +02001966
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Smoke tests run against every name in ``all_unicode_encodings``.

    Each encoding is exercised inside its own ``subTest`` so that a failure
    is reported together with the encoding name and does not prevent the
    remaining encodings from being checked.
    """

    def test_basics(self):
        """Round-trip a short ASCII string through every codec interface."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                msg = "encoding=%r" % encoding

                # Normalise the name reported by lookup() for "*_codec"
                # entries and for latin_1 before comparing it with the
                # list entry (ignoring "_" versus "-").
                name = codecs.lookup(encoding).name
                if encoding.endswith("_codec"):
                    name += "_codec"
                elif encoding == "latin_1":
                    name = "latin_1"
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

                with support.check_warnings():
                    # unicode-internal has been deprecated
                    (b, size) = codecs.getencoder(encoding)(s)
                    self.assertEqual(size, len(s), msg)
                    (chars, size) = codecs.getdecoder(encoding)(b)
                    self.assertEqual(chars, s, msg)

                # Everything below needs the stateful interfaces.
                if encoding in broken_unicode_with_stateful:
                    continue

                # check stream reader/writer, one character/byte at a time
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, msg)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, msg)

                # check incremental decoder/encoder and
                # iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, msg)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        """Repeat the incremental round-trips with objects from _testcapi."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                msg = "encoding=%r" % encoding

                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        """A stream reader must return the full text again after seek(0)."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for _ in range(5):
                    # Test that calling seek resets the internal codec state
                    # and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError for missing or non-bytes input."""
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                # idna and punycode are excluded from the wrong-type check.
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without arguments."""
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                encoded = u.encode(encoding)
                self.check_state_handling_decode(encoding, u, encoded)
                self.check_state_handling_encode(encoding, u, encoded)
2128
Victor Stinnerf96418d2015-09-21 23:06:27 +02002129
class CharmapTest(unittest.TestCase):
    """Checks codecs.charmap_decode() with str, int->str and int->int maps."""

    # Input decoded by every table-driven case below.
    _DATA = b"\x00\x01\x02"

    def _run_cases(self, cases):
        """Evaluate ``(errors, mapping, outcome)`` rows in the given order.

        When ``outcome`` is an exception class the call must raise it;
        otherwise ``outcome`` is the exact value the call must return.
        """
        for errors, mapping, outcome in cases:
            if isinstance(outcome, type):
                self.assertRaises(outcome, codecs.charmap_decode,
                                  self._DATA, errors, mapping)
            else:
                self.assertEqual(
                    codecs.charmap_decode(self._DATA, errors, mapping),
                    outcome)

    def test_decode_with_string_map(self):
        cases = [
            ("strict", "abc", ("abc", 3)),
            ("strict", "\U0010FFFFbc", ("\U0010FFFFbc", 3)),
            ("strict", "ab", UnicodeDecodeError),
            ("strict", "ab\ufffe", UnicodeDecodeError),
        ]
        # A map that is too short and a map holding U+FFFE for the last
        # byte must give the same result under each lenient handler.
        lenient = (
            ("replace", ("ab\ufffd", 3)),
            ("backslashreplace", ("ab\\x02", 3)),
            ("ignore", ("ab", 3)),
        )
        for handler, outcome in lenient:
            for mapping in ("ab", "ab\ufffe"):
                cases.append((handler, mapping, outcome))
        self._run_cases(cases)

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte)))

    def test_decode_with_int2str_map(self):
        absent = {0: 'a', 1: 'b'}
        mapped_to_none = {0: 'a', 1: 'b', 2: None}
        mapped_to_fffe = {0: 'a', 1: 'b', 2: '\ufffe'}  # Issue #14850

        cases = [
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, ("abc", 3)),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, ("AaBbCc", 3)),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'},
             ("\U0010FFFFbc", 3)),
            ("strict", {0: 'a', 1: 'b', 2: ''}, ("ab", 3)),
            ("strict", absent, UnicodeDecodeError),
            ("strict", mapped_to_none, UnicodeDecodeError),
            ("strict", mapped_to_fffe, UnicodeDecodeError),
        ]
        lenient = (
            ("replace", ("ab\ufffd", 3)),
            ("backslashreplace", ("ab\\x02", 3)),
            ("ignore", ("ab", 3)),
        )
        for handler, outcome in lenient:
            for mapping in (absent, mapped_to_none, mapped_to_fffe):
                cases.append((handler, mapping, outcome))
        self._run_cases(cases)

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", {}),
            ("", len(every_byte)))

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')
        absent = {0: a, 1: b}
        mapped_to_fffe = {0: a, 1: b, 2: 0xFFFE}

        cases = [
            ("strict", {0: a, 1: b, 2: c}, ("abc", 3)),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, ("\U0010FFFFbc", 3)),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             (chr(sys.maxunicode) + "bc", 3)),
            # A code point beyond sys.maxunicode is rejected with TypeError.
            ("strict", {0: sys.maxunicode + 1, 1: b, 2: c}, TypeError),
            ("strict", absent, UnicodeDecodeError),
            ("strict", mapped_to_fffe, UnicodeDecodeError),
        ]
        lenient = (
            ("replace", ("ab\ufffd", 3)),
            ("backslashreplace", ("ab\\x02", 3)),
            ("ignore", ("ab", 3)),
        )
        for handler, outcome in lenient:
            for mapping in (absent, mapped_to_fffe):
                cases.append((handler, mapping, outcome))
        self._run_cases(cases)
2364
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002365
class WithStmtTest(unittest.TestCase):
    """Verify that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        """EncodedFile recodes UTF-8 data to Latin-1 and closes the file."""
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the with block must close the wrapped stream.
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        """StreamReaderWriter decodes on read and closes the file on exit."""
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same contract as EncodedFile: the underlying stream is closed.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002379
Victor Stinnerf96418d2015-09-21 23:06:27 +02002380
class TypesTest(unittest.TestCase):
    """Checks which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        """Decoders that only take bytes must reject str with TypeError."""
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            # Label each check so a failure names the offending decoder.
            with self.subTest(decoder=decoder.__name__):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        """The escape decoders accept str as well as bytes input."""
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 lies outside the Unicode range and must be reported
        # through the selected error handler.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002420
Serhiy Storchakad6793772013-01-29 10:20:44 +02002421
class UnicodeEscapeTest(unittest.TestCase):
    """Encoder and decoder checks for the unicode_escape codec functions."""

    def test_empty(self):
        encoded = codecs.unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Values 32..126 other than the backslash are emitted unchanged.
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            result = codecs.unicode_escape_encode(chr(code))
            self.assertEqual(result, (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte except the backslash decodes to the same code point.
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            result = codecs.unicode_escape_decode(bytes([code]) + b'0')
            self.assertEqual(result, (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for text, escaped in named:
            check(text, escaped)
        # Remaining control characters, then 127..255, use the \xNN form.
        for code in list(range(32)) + list(range(127, 256)):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        table = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in table:
            check(raw, text)
        # A backslash before any other byte is kept together with that byte.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for marker, digits in ((b'x', 2), (b'u', 4), (b'U', 4)):
            for count in range(digits):
                # An escape followed by too few hex digits.
                truncated = b"\\" + marker + b"0" * count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                size = len(data)
                self.assertEqual(decode(data, "ignore"), ("[]", size))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", size))
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2498
2499
class RawUnicodeEscapeTest(unittest.TestCase):
    """Encoder and decoder checks for the raw_unicode_escape codec functions."""

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is emitted as the byte of that value.
        for code in range(256):
            result = codecs.raw_unicode_escape_encode(chr(code))
            self.assertEqual(result, (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        for code in range(256):
            result = codecs.raw_unicode_escape_decode(bytes([code]) + b'0')
            self.assertEqual(result, (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash before anything except "u" or "U" is left untouched.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash before anything except "u" or "U" is left untouched.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for marker, digits in ((b'u', 4), (b'U', 4)):
            for count in range(digits):
                # An escape followed by too few hex digits.
                truncated = b"\\" + marker + b"0" * count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                size = len(data)
                self.assertEqual(decode(data, "ignore"), ("[]", size))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", size))
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2548
2549
class SurrogateEscapeTest(unittest.TestCase):
    """Checks the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        """Decode ``raw`` to ``text`` and encode it back with surrogateescape."""
        decoded = raw.decode(encoding, "surrogateescape")
        self.assertEqual(decoded, text)
        encoded = text.encode(encoding, "surrogateescape")
        self.assertEqual(encoded, raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80",
                              "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2582
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002583
class BomTest(unittest.TestCase):
    """Checks when codecs.open() files write a byte order mark."""

    def test_seek0(self):
        text = "1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file in 'w+' mode with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                # Read the content back twice from the start.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002639
Victor Stinner3fed0872010-05-22 02:16:27 +00002640
# Bytes-to-bytes transform codecs that are listed unconditionally.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# Alias names registered for each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# The compression codecs are registered only when their modules import.
try:
    import zlib
except ImportError:
    # Keep the name bound so later code can test it for truth.
    zlib = None
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002670
Victor Stinnerf96418d2015-09-21 23:06:27 +02002671
class TransformCodecTest(unittest.TestCase):
    """Tests for the binary transform codecs (and the rot_13 text transform).

    The codecs exercised are taken from the module-level
    ``bytes_transform_encodings`` list; the alias checks iterate over the
    module-level ``transform_aliases`` mapping.
    """

    def test_basics(self):
        # Round-trip all 256 byte values through each codec
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # A StreamReader wrapped around the encoded data must give back
        # the original byte from read()
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # Same as test_read(), but going through readline()
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Both fragments are raw strings: the pattern contains
                # regex escapes (\( and \)) that are not valid string
                # escape sequences.
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2803
Nick Coghlan8b097b42013-11-13 23:49:21 +10002804
2805# The codec system tries to wrap exceptions in order to ensure the error
2806# mentions the operation being performed and the codec involved. We
2807# currently *only* want this to happen for relatively stateless
2808# exceptions, where the only significant information they contain is their
2809# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002810
# A module-local codec registry: a single search function is registered
# once and consults this dict, which avoids appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Search function handed to codecs.register(): unknown names yield None
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # No such helper available: fall back to a no-op
    def _forget_codec(codec_name):
        pass
2825
2826
class ExceptionChainingTest(unittest.TestCase):
    """Check when exceptions raised by a codec get wrapped, and when not.

    Each test installs a throwaway codec (via the module-level
    ``_TEST_CODECS`` registry) under a name unique to the test instance and
    then drives it through str.encode/bytes.decode and codecs.encode/decode.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            pass

    def set_codec(self, encode, decode):
        # Publish a codec under this test's unique name in the local registry
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager: the body must raise exc_type with the wrapper
        # message, chained (__cause__) to an exc_type carrying a traceback
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # (every fragment of the patterns below is a raw string, since
        # \( and \) are regex escapes, not valid string escapes)
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2986
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002987
Georg Brandl02524622010-12-02 18:06:51 +00002988
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is expected to fail with OSError
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the name of the code page
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each entry of tests is (raw bytes, error handler, expected text);
        # an expected value of None means decoding has to fail.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= decoded[1] <= len(raw)
            consumed = decoded[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry of tests is (text, error handler, expected bytes);
        # an expected value of None means encoding has to fail.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            output = encoded[0]
            self.assertEqual(output, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, output, expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        # The CP_UTF8 encode cases only run when VISTA_OR_LATER is true
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Non-final decoding (last argument False): each entry pairs the
        # input with the exact (text, bytes consumed) result expected.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
3147
3148
class ASCIITest(unittest.TestCase):
    """Encoding and decoding with the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded by the handler, but not
        # the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3186
3187
class Latin1Test(unittest.TestCase):
    """Encoding and decoding with the 'latin1' codec and its error handlers."""

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded by the handler, but not
        # the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # (bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3223
3224
if __name__ == "__main__":
    # Run the test suite when this file is executed as a script.
    unittest.main()