blob: 666f0dff95c0412d4d41b3c20c3c20d1db2c4e17 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00004import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    Minimal file-like object: write() appends to an internal buffer and
    read() removes data from the front of that buffer.
    """
    def __init__(self, buffer):
        # ``buffer`` is the initial content of the queue.
        self._buffer = buffer

    def write(self, chars):
        # Rebind instead of using ``+=`` so that a mutable buffer object
        # handed to __init__ is never modified in place.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return up to ``size`` items from the front.

        A negative ``size`` (the default) or ``None`` drains the whole
        buffer.
        """
        if size is None or size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keep the type
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
25
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    Meant to be mixed into a unittest.TestCase: the methods rely on the
    assert* methods of ``self``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split the encoded input ``s`` at every possible position, move the
        # decoder state to a fresh decoder at the split point and check that
        # both parts together still decode to ``u``.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertTrue(isinstance(state[1], int))
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split the text ``u`` at every position,
        # hand the encoder state over to a fresh encoder and check that the
        # concatenated output equals ``s``.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
58
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Tests shared by the per-encoding test classes.  Every method reads
    # ``self.encoding``, which the subclasses define as a class attribute.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")
        self.assertEqual(r.charbuffer, "")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until the reader returns an empty string and join
            # them with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and the loops below would never execute.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps the first line non-empty: an empty line read
            # with keepends=False would end readalllines() prematurely.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # every other line of the input is "xxx\n"
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a reader must return exactly the lines that were
        # written, "\r\n" terminators included.
        # NOTE(review): runs of spaces inside these literals may have been
        # collapsed in the copy this block was restored from -- compare with
        # upstream if the exact byte offsets matter.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" encoded with a single leading BOM, little and big endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True this
        # must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000336
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM for this encoding: every second byte completes a character.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True this
        # must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000358
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM for this encoding: every second byte completes a character.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True this
        # must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000380
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: the characters of the input take
        # 1, 2, 2, 3 and 3 bytes respectively.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))
406
class UTF7Test(ReadTest):
    # Only the tests inherited from ReadTest are run for this encoding.
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
411
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True this
        # must raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
419
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # The "c" typecode does not exist in Python 3; use a byte array.
        # readbuffer_encode() returns the raw buffer content as bytes.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
435
# codecs.charbuffer_encode() is not provided by every Python version; skip
# the whole class instead of erroring out where it is missing.
@unittest.skipUnless(hasattr(codecs, "charbuffer_encode"),
                     "codecs.charbuffer_encode is not available")
class CharBufferTest(unittest.TestCase):

    # NOTE(review): the expected values assume that charbuffer_encode()
    # returns the buffer content as bytes, like readbuffer_encode() --
    # confirm against the interpreter this file targets.
    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode(b"spam"), (b"spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(b""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
447
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The encoded input starts with the signature written by the codec,
        # followed by the encoded "\ufeff" that is part of the text itself.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        # Input consisting of nothing but the signature decodes to "".
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder must not show up in the
        # decoded text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))
488
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # The unicode_internal codec is not available in every Python
        # version; there is nothing to test without it.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("unicode_internal codec is not available")
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        try:
            f2.write("a")
        finally:
            # close the recoder even if write() fails
            f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the recoder closes the underlying stream as well.
        self.assertTrue(f.closed)
Fred Drake2e2be372001-09-20 21:33:42 +0000497
# From RFC 3492
# Each entry is a pair (text, expected punycode encoding of the text).
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check at import time: report every entry that is not a pair (a
# missing comma in the table above silently merges two strings).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000605
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # str objects have no decode() method, so the second check goes
            # through an ASCII round trip and decodes a fresh bytes object.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000623
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the ``unicode_internal`` codec.

    Every test body is guarded by ``sys.maxunicode > 0xffff`` and is a
    no-op on builds where that does not hold.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                (b"\x00\x10\xff\xff", "\U0010ffff"),
                (b"\x00\x00\x01\x01", "\U00000101"),
                (b"", ""),
            ]
            not_ok = [
                b"\x7f\xff\xff\xff",
                b"\x80\x00\x00\x00",
                b"\x81\x00\x00\x00",
                b"\x00",
                b"\x00\x00\x00\x00\x00",
            ]
            # The samples above are written big-endian; flip them on
            # little-endian machines before decoding.
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = bytes(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        """The UnicodeDecodeError raised for bad input carries the codec
        name, the offending object and the start/end offsets."""
        if sys.maxunicode > 0xffff:
            try:
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        """A registered error callback (here: ignore) is honoured by the
        decoder and the reported length covers the whole input."""
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = "ab".encode("unicode_internal")
            # Splice four invalid bytes between the two encoded characters.
            # This is done with bytes concatenation: formatting bytes into
            # a str with %s and calling bytes() on the result does not
            # reproduce the raw bytes.
            ignored = decoder(ab[:4] + b"\x22\x22\x22\x22" + ab[4:],
                "UnicodeInternalTest")
            self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000671
# Test vectors taken from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# expected is None when the input must be rejected; (None, None) marks a
# vector that is skipped.  The position in the list is the vector number
# (entry 0 is vector 3.1).
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar\xe2\x80\x8b\xe2\x81\xa0baz'
     b'\xef\xb8\x80\xef\xb8\x88\xef\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2\xaa'
     b'\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3\x83\x88'
     b'\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82\xa2\xe3\x83\x91'
     b'\xe3\x83\xbc\xe3\x83\x88'),
]
824
825
class NameprepTest(unittest.TestCase):
    """Run the module-level ``nameprep_tests`` vectors through
    ``encodings.idna.nameprep``."""

    def test_nameprep(self):
        """Check every vector: skipped ones are ignored, vectors without an
        expected value must raise UnicodeError, the rest must match."""
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8.  One vector encodes a
            # lone surrogate, which the strict UTF-8 decoder rejects before
            # nameprep is ever called; "surrogatepass" lets it through so
            # that nameprep itself is what gets tested.
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (3.<pos+1>) so the
                    # failing entry can be identified.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
844
class IDNACodecTest(unittest.TestCase):
    """Tests for the ``idna`` codec through the one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        """str(bytes, "idna") handles plain and "xn--" labels, with and
        without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") is the inverse of the decode cases above."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna StreamReader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Feed the decoder one byte at a time via iterdecode(), then in
        label-splitting chunks via an explicit incremental decoder."""
        for encoded, expected in (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ):
            # Iterating over bytes yields ints; bytes([c]) turns each one
            # back into a single-byte chunk.
            chunks = (bytes([c]) for c in encoded)
            self.assertEqual("".join(codecs.iterdecode(chunks, "idna")), expected)

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The last label is only emitted once final=True is passed.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        # A trailing dot completes the label, so it is emitted right away.
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Same as test_incremental_decode, in the encoding direction."""
        for text, expected in (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ):
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")), expected)

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The last label is only emitted once final=True is passed.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000920
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode(): explicit encoding, default encoding, missing
        argument and undecodable input."""
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit encoding, default encoding, missing
        argument, unknown codec and unencodable input."""
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000962
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 StreamReader on a small two-line byte stream."""

    def setUp(self):
        # Reader factory plus a stream holding two UTF-8 encoded
        # characters separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        """readlines() splits the decoded text at the newline and keeps it."""
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000972
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding in both directions."""

    def test_basic(self):
        """Read UTF-8 file content as UTF-16-LE, then write UTF-8 data
        into a Latin-1 file."""
        # Reading: the underlying file holds UTF-8, the caller sees UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file gets Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +0000984
# Names of the codecs exercised by the generic tests that iterate over
# this list (one name per whitespace-separated word).
all_unicode_encodings = """
    ascii big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp737 cp775
    cp850 cp852 cp855 cp856 cp857
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk
    hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3
    iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape
    shift_jis shift_jis_2004 shift_jisx0213
    tis_620 unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]
1087
# The "undefined" encoding is deliberately left untested, because it is
# not supposed to work.

# Encodings that don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings whose incremental coders are excluded: the ones above plus idna
broken_incremental_coders = broken_unicode_with_streams + ["idna"]

# Encodings that only support "strict" mode
only_strict_mode = ["idna"]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001105
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec named in
    ``all_unicode_encodings``."""

    def test_basics(self):
        """Round-trip an ASCII sample through the stateless, stream,
        incremental (Python and C API) and iterator interfaces."""
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must agree with the requested
            # one once "_" and "-" are treated alike.
            canonical = codecs.lookup(encoding).name
            if encoding == "latin_1":
                canonical = "latin_1"
            elif encoding.endswith("_codec"):
                canonical += "_codec"
            self.assertEqual(encoding.replace("_", "-"), canonical.replace("_", "-"))

            # stateless encoder/decoder
            encoded, consumed = codecs.getencoder(encoding)(text)
            if encoding != "unicode_internal":
                self.assertEqual(consumed, len(text), "%r != %r (encoding=%r)" % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, "%r != %r (encoding=%r)" % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # stream writer: write one character at a time and collect
                # whatever arrives in the queue after each write
                queue = Queue(b"")
                writer = codecs.getwriter(encoding)(queue)
                written = []
                for char in text:
                    writer.write(char)
                    written.append(queue.read())
                encodedresult = b"".join(written)

                # stream reader: feed one byte at a time and collect
                # whatever the reader can decode after each byte
                queue = Queue(b"")
                reader = codecs.getreader(encoding)(queue)
                read_back = []
                for byte in encodedresult:
                    queue.write(bytes([byte]))
                    read_back.append(reader.read())
                decodedresult = "".join(read_back)
                self.assertEqual(decodedresult, text, "%r != %r (encoding=%r)" % (decodedresult, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # Incremental decoder/encoder (fetched via the Python and the
            # C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # Python API
                encodedresult = b"".join(encoder.encode(char) for char in text)
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = "".join(decoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, text, "%r != %r (encoding=%r)" % (decodedresult, text, encoding))

                # C API
                encodedresult = b"".join(cencoder.encode(char) for char in text)
                encodedresult += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = "".join(cdecoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, text, "%r != %r (encoding=%r)" % (decodedresult, text, encoding))

                # iterencode()/iterdecode()
                result = "".join(codecs.iterdecode(codecs.iterencode(text, encoding), encoding))
                self.assertEqual(result, text, "%r != %r (encoding=%r)" % (result, text, encoding))

                # iterencode()/iterdecode() with empty string
                result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                self.assertEqual(result, "")

            if encoding not in only_strict_mode:
                # Incremental decoder/encoder with an errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b""
                    for char in text:
                        encodedresult += encoder.encode(char)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = ""
                    for byte in encodedresult:
                        decodedresult += decoder.decode(bytes([byte]))
                    self.assertEqual(decodedresult, text, "%r != %r (encoding=%r)" % (decodedresult, text, encoding))

                    encodedresult = b""
                    for char in text:
                        encodedresult += cencoder.encode(char)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = ""
                    for byte in encodedresult:
                        decodedresult += cdecoder.decode(bytes([byte]))
                    self.assertEqual(decodedresult, text, "%r != %r (encoding=%r)" % (decodedresult, text, encoding))

    def test_seek(self):
        """Re-reading a StreamReader after seek(0, 0) gives the full text
        again each time."""
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # idna is skipped as well. FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and, apart from idna and
        punycode, an int argument."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        sample = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, sample, sample.encode(encoding))
            self.check_state_handling_encode(encoding, sample, sample.encode(encoding))
1237
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() when the mapping is a string."""

    def test_decode_with_string_map(self):
        """Decode bytes through string maps under different error handlers.

        Covers a complete map, a map that is too short for the input, and a
        map whose entry is "\\ufffe", under the "strict", "replace" and
        "ignore" error handlers.  The reported length is always the number
        of input bytes.
        """
        # assertEqual is used instead of the deprecated assertEquals alias
        # (removed from unittest in Python 3.12).

        # Complete map: every byte has an entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Byte 2 is past the end of the map: "replace" yields U+FFFD.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A "\ufffe" map entry gives the same result as a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore" drops the byte that has no usable mapping.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # Empty map: every possible byte is dropped but still counted.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )
1270
class WithStmtTest(unittest.TestCase):
    """Tests that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        """EncodedFile works in a with statement and recodes on read.

        The wrapped stream holds the UTF-8 bytes of U+00FC; reading through
        the wrapper must return the latin-1 byte, and leaving the block
        must close the wrapped stream.
        """
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            # assertEqual replaces the deprecated assertEquals alias
            # (removed from unittest in Python 3.12).
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        """StreamReaderWriter works in a with statement and decodes on read.

        Reading must return the decoded text, and leaving the block must
        close the wrapped stream.
        """
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001283
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001284
def test_main():
    """Run the codec test case classes listed below via test_support."""
    test_classes = (
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00001308
1309
# Call test_main() only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    test_main()