blob: 5b61e7e857a425775c6e121c62e220b357fe15a1 [file] [log] [blame]
# test_multibytecodec.py
# Unit test for multibytecodec itself
#
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00004
5from test import test_support
Hye-Shik Chang84392be2006-07-06 15:21:52 +00006from test.test_support import TESTFN
7import unittest, StringIO, codecs, sys, os
Georg Brandlb9b68ae2008-07-16 22:04:20 +00008import _multibytecodec
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00009
# Names of all CJK codecs exercised by the tests in this file, grouped
# under the _codecs_* module each group is labelled with.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]
26
class Test_MultibyteCodec(unittest.TestCase):
    # Checks run against every codec in ALL_CJKENCODINGS, plus regression
    # tests for error-handler and stream-constructor bugs.

    def test_nullcoding(self):
        # Empty input must give empty output in both directions.
        for codec_name in ALL_CJKENCODINGS:
            self.assertEqual(''.decode(codec_name), u'')
            self.assertEqual(unicode('', codec_name), u'')
            self.assertEqual(u''.encode(codec_name), '')

    def test_str_decode(self):
        # Calling .encode() on a plain ASCII str hands it back unchanged.
        for codec_name in ALL_CJKENCODINGS:
            encoded = 'abcd'.encode(codec_name)
            self.assertEqual(encoded, 'abcd')

    def test_errorcallback_longindex(self):
        def myreplace(exc):
            # The resume position is deliberately larger than sys.maxint.
            return (u'', sys.maxint + 1)

        decode = codecs.getdecoder('euc-kr')
        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, decode,
                          'apple\x92ham\x93spam', 'test.cjktest')

    def test_errorcallback_custom_ignore(self):
        # Issue #23215: MemoryError with custom error handlers and
        # multibyte codecs
        surrogates = 100 * unichr(0xdc00)
        codecs.register_error("test.ignore", codecs.ignore_errors)
        for codec_name in ALL_CJKENCODINGS:
            encoded = surrogates.encode(codec_name, "test.ignore")
            self.assertEqual(encoded, b'')

    def test_codingspec(self):
        # Source consisting only of a coding declaration must execute
        # for every codec.
        for codec_name in ALL_CJKENCODINGS:
            exec('# coding: {}\n'.format(codec_name))

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        for stream_class in (_multibytecodec.MultibyteStreamReader,
                             _multibytecodec.MultibyteStreamWriter):
            self.assertRaises(AttributeError, stream_class, None)
64
65
class Test_IncrementalEncoder(unittest.TestCase):
    # Incremental encoder behaviour for a stateless codec (cp949) and a
    # stateful one (jisx0213).  Expected output is written with explicit
    # b'' literals throughout, matching test_issue5640.

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        encoder = codecs.getincrementalencoder('cp949')()
        self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode(u'', True), b'')
        self.assertEqual(encoder.encode(u'', False), b'')
        self.assertIsNone(encoder.reset())

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few code points. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC

        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode(u'\u00e6\u0300'), b'\xab\xc4')
        # U+00E6 alone produces no output until the next call.
        self.assertEqual(encoder.encode(u'\u00e6'), b'')
        self.assertEqual(encoder.encode(u'\u0300'), b'\xab\xc4')
        self.assertEqual(encoder.encode(u'\u00e6', True), b'\xa9\xdc')

        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode(u'\u0300'), b'\xab\xdc')

        self.assertEqual(encoder.encode(u'\u00e6'), b'')
        # The empty input is intentionally left unprefixed, as in the
        # original test.
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
        self.assertEqual(encoder.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() must not discard the pending U+00E6.
        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode(u'\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertEqual(encoder.encode(u'\u0300\u00e6'), b'\xab\xc4')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode(u'\u0300'), b'\xab\xdc')
        self.assertEqual(encoder.encode(u'\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertEqual(encoder.encode(u'', True), b'\xa9\xdc')

    def test_issue5640(self):
        # After a character is handled by 'backslashreplace', the next
        # call must still encode normally.
        encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
        self.assertEqual(encoder.encode(u'\n'), b'\n')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000116
class Test_IncrementalDecoder(unittest.TestCase):
    # Incremental decoder behaviour for cp949 and iso2022-jp.  All
    # decoder input is written with explicit b'' literals.

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        decoder = codecs.getincrementaldecoder('cp949')()
        # The trailing 0xBD is completed by the 0xE3 in the next call.
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0\xcc\xbd'),
                         u'\ud30c\uc774')
        self.assertEqual(decoder.decode(b'\xe3 \xb8\xb6\xc0\xbb'),
                         u'\uc36c \ub9c8\uc744')
        self.assertEqual(decoder.decode(b''), u'')

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the pending byte in place so
        # that a later call can still complete the character.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), u'\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'\xcc'), u'\uc774')

        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), u'\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode,
                          b'\xcc\xbd', True)
        self.assertEqual(decoder.decode(b'\xcc'), u'\uc774')

    def test_iso2022(self):
        # Escape sequences may be split across decode() calls.
        decoder = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = b'\x1b'
        self.assertEqual(decoder.decode(ESC + b'('), u'')
        self.assertEqual(decoder.decode(b'B', True), u'')
        self.assertEqual(decoder.decode(ESC + b'$'), u'')
        self.assertEqual(decoder.decode(b'B@$'), u'\u4e16')
        self.assertEqual(decoder.decode(b'@$@'), u'\u4e16')
        self.assertEqual(decoder.decode(b'$', True), u'\u4e16')
        self.assertIsNone(decoder.reset())
        # After reset() the same bytes decode as plain ASCII.
        self.assertEqual(decoder.decode(b'@$'), u'@$')
        self.assertEqual(decoder.decode(ESC + b'$'), u'')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'B@$'), u'\u4e16')
152
class Test_StreamReader(unittest.TestCase):
    def test_bug1728403(self):
        # Reading through a cp949 stream reader from a file that holds
        # only the single byte 0xA1 must raise UnicodeDecodeError.
        try:
            # Close the write handle before reopening the file to read.
            with open(TESTFN, 'wb') as raw:
                raw.write(b'\xa1')
            reader = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, reader.read, 2)
            finally:
                reader.close()
        finally:
            # test_support.unlink() tolerates a file that was never
            # created, so a failure above is not masked by cleanup.
            test_support.unlink(TESTFN)
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000163
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000164class Test_StreamWriter(unittest.TestCase):
Zachary Ware1f702212013-12-10 14:09:20 -0600165 @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
166 def test_gb18030(self):
167 s = StringIO.StringIO()
168 c = codecs.getwriter('gb18030')(s)
169 c.write(u'123')
170 self.assertEqual(s.getvalue(), '123')
171 c.write(u'\U00012345')
172 self.assertEqual(s.getvalue(), '123\x907\x959')
173 c.write(u'\U00012345'[0])
174 self.assertEqual(s.getvalue(), '123\x907\x959')
175 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
176 self.assertEqual(s.getvalue(),
177 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
178 c.write(u'\U00012345'[0])
179 self.assertEqual(s.getvalue(),
180 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
181 self.assertRaises(UnicodeError, c.reset)
182 self.assertEqual(s.getvalue(),
183 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
184
185 @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
186 def test_utf_8(self):
187 s= StringIO.StringIO()
188 c = codecs.getwriter('utf-8')(s)
189 c.write(u'123')
190 self.assertEqual(s.getvalue(), '123')
191 c.write(u'\U00012345')
192 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
193
194 # Python utf-8 codec can't buffer surrogate pairs yet.
195 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000196 c.write(u'\U00012345'[0])
Zachary Ware1f702212013-12-10 14:09:20 -0600197 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000198 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
199 self.assertEqual(s.getvalue(),
Zachary Ware1f702212013-12-10 14:09:20 -0600200 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
201 '\xea\xb0\x80\xc2\xac')
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000202 c.write(u'\U00012345'[0])
203 self.assertEqual(s.getvalue(),
Zachary Ware1f702212013-12-10 14:09:20 -0600204 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
205 '\xea\xb0\x80\xc2\xac')
206 c.reset()
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000207 self.assertEqual(s.getvalue(),
Zachary Ware1f702212013-12-10 14:09:20 -0600208 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
209 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
210 c.write(u'\U00012345'[1])
211 self.assertEqual(s.getvalue(),
212 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
213 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000214
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000215 def test_streamwriter_strwrite(self):
216 s = StringIO.StringIO()
217 wr = codecs.getwriter('gb18030')(s)
218 wr.write('abcd')
219 self.assertEqual(s.getvalue(), 'abcd')
220
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000221class Test_ISO2022(unittest.TestCase):
222 def test_g2(self):
223 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
224 uni = u':hu4:unit\xe9 de famille'
225 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
226
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000227 def test_iso2022_jp_g0(self):
Ezio Melottiaa980582010-01-23 23:04:36 +0000228 self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000229 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
230 e = u'\u3406'.encode(encoding)
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000231 self.assertFalse(filter(lambda x: x >= '\x80', e))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000232
Hye-Shik Changb7883462006-10-08 13:48:34 +0000233 def test_bug1572832(self):
234 if sys.maxunicode >= 0x10000:
235 myunichr = unichr
236 else:
237 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
238
239 for x in xrange(0x10000, 0x110000):
240 # Any ISO 2022 codec will cause the segfault
241 myunichr(x).encode('iso_2022_jp', 'ignore')
242
class TestStateful(unittest.TestCase):
    # Subclasses override these four attributes to run the same checks
    # against another codec.
    text = u'\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    expected = b'\x1b$B@$@$'
    expected_reset = b'\x1b$B@$@$\x1b(B'

    def test_encode(self):
        # One-shot encoding produces the expected_reset form.
        encoded = self.text.encode(self.encoding)
        self.assertEqual(encoded, self.expected_reset)

    def test_incrementalencoder(self):
        # Without a final call the output equals `expected`.
        incremental = codecs.getincrementalencoder(self.encoding)()
        pieces = [incremental.encode(ch) for ch in self.text]
        self.assertEqual(b''.join(pieces), self.expected)

    def test_incrementalencoder_final(self):
        # Passing final=True with the last character gives the
        # expected_reset form.
        incremental = codecs.getincrementalencoder(self.encoding)()
        final_position = len(self.text) - 1
        pieces = []
        for position, ch in enumerate(self.text):
            is_final = position == final_position
            pieces.append(incremental.encode(ch, is_final))
        self.assertEqual(b''.join(pieces), self.expected_reset)
266
class TestHZStateful(TestStateful):
    # Runs the inherited TestStateful checks against the 'hz' codec.
    encoding = 'hz'
    text = u'\u804a' * 2
    expected = b'~{ADAD'
    expected_reset = expected + b'~}'
272
def test_main():
    """Run this module's tests via test_support.run_unittest."""
    this_module = __name__
    test_support.run_unittest(this_module)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000275
# Run the tests when this file is executed as a script.
if "__main__" == __name__:
    test_main()