blob: f61cc33a4f664023aaaef9d2d32073422428cb9f [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Guido van Rossum5c4501a2007-05-09 23:47:07 +00005from StringIO import StringIO
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    Queue: write bytes at one end, read bytes from the other end.

    The object passed to the constructor is the initial buffer; its type
    (for example ``b""``) determines what can be written, because writes
    are appended with ``+=``.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
26
class MixInCheckStateHandling:
    """Mix-in with helpers that check getstate()/setstate() round trips.

    The helpers call ``self.assertTrue`` and ``self.assertEqual``, so the
    class must be combined with ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check incremental-decoder state transfer at every split of *s*.

        For each split point the first part is decoded, the decoder state
        is moved to a fresh decoder, and the rest is decoded there; the
        concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertTrue(isinstance(state[1], int))
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check incremental-encoder state transfer at every split of *u*.

        Mirrors check_state_handling_decode(): the two encoded parts,
        produced by two encoders sharing one state, must add up to *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Shared stream-reader / incremental-decoder tests.

    Every test reads ``self.encoding``; concrete subclasses are expected
    to provide that attribute.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare results.

        After each byte, the text decoded so far must equal the matching
        entry of *partialresults*.  The check is done with a StreamReader,
        with an incremental decoder, again after reset(), and finally with
        codecs.iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """Check StreamReader.readline() for every supported line ending."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line endings must be listed explicitly: every one of them is
        # whitespace, so "\n \r\n \r \u2028".split() yields an empty list
        # and the loops below would never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty; an empty line
            # would end readalllines() early when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expectations must as well.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must return each line unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """Check readline() when data arrives piecewise through a Queue."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return three consecutive lines, then ""."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """readline() must return five consecutive lines, then ""."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000280
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM handling tests for the "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" with a leading BOM, little-endian and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must raise."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A single byte decoded with final=True must raise."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state must be transferable for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000337
class UTF16LETest(ReadTest):
    """ReadTest checks plus partial/error tests for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A single byte decoded with final=True must raise."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000359
class UTF16BETest(ReadTest):
    """ReadTest checks plus partial/error tests for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A single byte decoded with final=True must raise."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000381
class UTF8Test(ReadTest):
    """ReadTest checks plus partial/state tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        """Decoder state must be transferable at every byte boundary."""
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))
407
class UTF7Test(ReadTest):
    """Run the shared ReadTest checks against the "utf-7" codec."""
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
412
class UTF16ExTest(unittest.TestCase):
    """Argument and error handling of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """A single byte decoded with final=True must raise."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling without arguments must raise TypeError."""
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
420
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of bytes is returned as a bytes object plus its length."""
        import array
        # The "c" typecode does not exist on Python 3; build the array
        # from bytes with the signed-char typecode instead.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        """Empty input gives an empty bytes result."""
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        """Missing or non-buffer arguments must raise TypeError."""
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
436
# codecs.charbuffer_encode() is not available on every Python version;
# skip the whole class instead of failing with AttributeError.
@unittest.skipUnless(hasattr(codecs, "charbuffer_encode"),
                     "codecs.charbuffer_encode() is not available")
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode()."""

    def test_string(self):
        """A string is returned unchanged together with its length."""
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        """Empty input gives an empty result of length 0."""
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        """Missing or non-buffer arguments must raise TypeError."""
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
448
class UTF8SigTest(ReadTest):
    """ReadTest checks plus BOM tests for the "utf-8-sig" codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Byte-by-byte decoding: the leading BOM is skipped, a second
        U+FEFF in the text is returned as a character."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        str(b"\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The BOM written by the encoder must not show up when decoding."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_decoder_state(self):
        """Decoder state must be transferable at every byte boundary."""
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))
489
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty bytes with zero length consumed."""
        # escape_decode() produces bytes, so both the input and the
        # expected result are spelled as bytes literals.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
493
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        """Write through an EncodedFile wrapper and close it."""
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            # The codec is not provided by every Python version.
            self.skipTest("unicode_internal codec is not available")
        # EncodedFile writes encoded bytes to the wrapped stream, so the
        # stream has to be a bytes buffer.
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000502
# From RFC 3492
# Each entry is a pair (text, expected Punycode encoding as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check at import time: print any entry that is not a pair
# (for example because a comma is missing between two literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000610
class PunycodeTest(unittest.TestCase):
    """Encode/decode the RFC 3492 samples in punycode_testcases."""

    def test_encode(self):
        """Encoding each sample must give the expected bytes, ignoring case."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        """Decoding each expected byte string must give the sample text."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Round-trip through text: str has no decode() method, so the
            # ASCII text is turned back into bytes before decoding.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000628
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000629class UnicodeInternalTest(unittest.TestCase):
630 def test_bug1251300(self):
631 # Decoding with unicode_internal used to not correctly handle "code
632 # points" above 0x10ffff on UCS-4 builds.
633 if sys.maxunicode > 0xffff:
634 ok = [
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000635 ("\x00\x10\xff\xff", "\U0010ffff"),
636 ("\x00\x00\x01\x01", "\U00000101"),
637 ("", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000638 ]
639 not_ok = [
640 "\x7f\xff\xff\xff",
641 "\x80\x00\x00\x00",
642 "\x81\x00\x00\x00",
643 "\x00",
644 "\x00\x00\x00\x00\x00",
645 ]
646 for internal, uni in ok:
647 if sys.byteorder == "little":
648 internal = "".join(reversed(internal))
649 self.assertEquals(uni, internal.decode("unicode_internal"))
650 for internal in not_ok:
651 if sys.byteorder == "little":
652 internal = "".join(reversed(internal))
653 self.assertRaises(UnicodeDecodeError, internal.decode,
654 "unicode_internal")
655
656 def test_decode_error_attributes(self):
657 if sys.maxunicode > 0xffff:
658 try:
659 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000660 except UnicodeDecodeError as ex:
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000661 self.assertEquals("unicode_internal", ex.encoding)
662 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
663 self.assertEquals(4, ex.start)
664 self.assertEquals(8, ex.end)
665 else:
666 self.fail()
667
668 def test_decode_callback(self):
669 if sys.maxunicode > 0xffff:
670 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
671 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000672 ab = "ab".encode("unicode_internal")
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000673 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
674 "UnicodeInternalTest")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000675 self.assertEquals(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000676
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks an input that nameprep has to reject.
# A (None, None) entry is a placeholder for a skipped vector; it keeps the
# list position in step with the 3.x section numbers used in the comments.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
829
830
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Each vector must either prep to the expected text or be rejected.

        Entries whose input is None are skipped. An expected value of None
        means nameprep must raise UnicodeError. Any failure in a comparison
        is re-raised as test_support.TestFailed carrying the 3.x vector
        number.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8.  "surrogatepass" lets
            # the encoded-surrogate vector reach nameprep instead of being
            # rejected by the UTF-8 decoder; the handler is only consulted
            # when the decoder actually hits such a sequence.
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number so the failing entry
                    # can be located in the table.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
849
class IDNACodecTest(unittest.TestCase):
    """Checks for the "idna" codec through the str/bytes, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes plain and xn-- labels, with or without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") produces the matching ASCII-compatible form."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """A second read() after the data has been consumed returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """iterdecode() and the incremental decoder handle input split mid-label."""
        samples = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in samples:
            # Feed the decoder one byte at a time.  Iterating bytes yields
            # ints, so each chunk is rebuilt with bytes([c]).
            chunks = (bytes([c]) for c in encoded)
            self.assertEqual("".join(codecs.iterdecode(chunks, "idna")), expected)

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is only produced by the
        # final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is produced as soon as the dot
        # arrives, leaving nothing for the final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """iterencode() and the incremental encoder handle input split mid-label."""
        samples = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in samples:
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")), expected)

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000925
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level codecs functions."""

    def test_decode(self):
        """codecs.decode() decodes bytes and reports bad calls and bad data."""
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode() encodes text and reports bad calls, names and data."""
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000967
class StreamReaderTest(unittest.TestCase):
    """Checks line splitting of a UTF-8 StreamReader."""

    def setUp(self):
        # Reader factory plus a two-line UTF-8 byte stream; the second line
        # has no trailing newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        """readlines() returns the decoded lines, keeping the line ending."""
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000977
class EncodedFileTest(unittest.TestCase):
    """Checks recoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        """Reading recodes the file's UTF-8 content; writing recodes into the file."""
        # Read side: the underlying file holds UTF-8, the wrapper hands out
        # UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Write side: UTF-8 data written to the wrapper ends up as latin1
        # in the underlying file.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +0000989
class Str2StrTest(unittest.TestCase):
    """Stream-reader checks for base64_codec used as a str-to-str codec."""
    # NOTE(review): both tests call str.encode("base64_codec") and wrap the
    # result in StringIO; confirm this is still the intended input type for
    # the interpreter this file targets.

    def test_read(self):
        """read() on a base64_codec reader gives back the original str."""
        encoded = "\x80".encode("base64_codec")
        stream = StringIO(encoded)
        make_reader = codecs.getreader("base64_codec")
        sout = make_reader(stream).read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        """readline() on a base64_codec reader gives back the original str."""
        encoded = "\x80".encode("base64_codec")
        stream = StringIO(encoded)
        make_reader = codecs.getreader("base64_codec")
        sout = make_reader(stream).readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))
1005
# Codecs exercised by BasicUnicodeTest.  Optional codecs are appended
# further down, depending on what this interpreter provides.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213", "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only listed when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal"
]
# Built from a copy of the list above, so the bz2/zlib entries appended
# to that list further down do not show up here; both are named
# explicitly instead.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna", "zlib_codec", "bz2_codec",
]

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are only listed when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1158
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip an ASCII sample through each codec interface.

        Covers the stateless encoder/decoder, the stream reader/writer, the
        incremental coders (obtained through codecs and through _testcapi)
        and iterencode()/iterdecode().  Codecs named in
        broken_unicode_with_streams, broken_incremental_coders and
        only_strict_mode are left out of the corresponding part.
        """
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The list entry must agree with the name reported by
            # codecs.lookup(), treating "_" and "-" as equal.  "_codec" is
            # re-appended and latin_1 is special-cased before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encoder/decoder.
            (b, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    # The decoder consumes bytes, so the final flush is
                    # given an empty bytes object.
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """After seek(0, 0) a stream reader must read the first line again."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError when called without or with bad input."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-bytes argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without input."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Comparing the type of a charmap encoding table must not crash."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        """getstate()/setstate() must work for every usable incremental coder."""
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1290
class BasicStrTest(unittest.TestCase):
    """Round-trip checks for the codecs named in all_string_encodings."""

    def test_basics(self):
        """Encode and then decode a short string with each codec and expect it back."""
        text = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(text)
            # The encoder must report that it consumed the whole input.
            self.assertEqual(consumed, len(text))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, encoding)
            )
1299
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() when the mapping is a plain string."""

    def test_decode_with_string_map(self):
        """Decode bytes through string maps under strict, replace and ignore.

        Each input byte is used as an index into the mapping string.  The
        cases below pin what happens when the index is past the end of the
        map or the map entry is "\\ufffe".
        """
        # Every byte has an entry in the map.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Byte 2 is past the end of the map: "replace" yields U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A "\ufffe" map entry gives the same result as a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore" drops the unmappable byte but still counts it as consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # With an empty map every byte is unmappable and gets ignored.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1332
class WithStmtTest(unittest.TestCase):
    """Check that codecs stream wrappers can be used in a ``with`` statement."""

    def test_encodedfile(self):
        """EncodedFile as a context manager recodes UTF-8 file data to Latin-1."""
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            # b"\xc3\xbc" is U+00FC in UTF-8; read() returns it as Latin-1.
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter as a context manager decodes UTF-8 data on read."""
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001345
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001346
def test_main():
    """Run all test case classes of this module through test_support.run_unittest()."""
    test_classes = (
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Run the suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()