#!/usr/bin/env python
#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00006
7from test import test_support
8from test import test_multibytecodec_support
Hye-Shik Chang84392be2006-07-06 15:21:52 +00009from test.test_support import TESTFN
10import unittest, StringIO, codecs, sys, os
Georg Brandlb9b68ae2008-07-16 22:04:20 +000011import _multibytecodec
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +000012
# Names of all CJK codecs that the generic tests iterate over.  Each group
# is labelled with the `_codecs_*` module name it is filed under.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]
29
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000030class Test_MultibyteCodec(unittest.TestCase):
31
32 def test_nullcoding(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000033 for enc in ALL_CJKENCODINGS:
34 self.assertEqual(''.decode(enc), u'')
35 self.assertEqual(unicode('', enc), u'')
36 self.assertEqual(u''.encode(enc), '')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000037
38 def test_str_decode(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000039 for enc in ALL_CJKENCODINGS:
40 self.assertEqual('abcd'.encode(enc), 'abcd')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000041
Hye-Shik Chang9f4b6322006-03-26 06:21:34 +000042 def test_errorcallback_longindex(self):
43 dec = codecs.getdecoder('euc-kr')
44 myreplace = lambda exc: (u'', sys.maxint+1)
45 codecs.register_error('test.cjktest', myreplace)
46 self.assertRaises(IndexError, dec,
47 'apple\x92ham\x93spam', 'test.cjktest')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000048
Hye-Shik Chang84392be2006-07-06 15:21:52 +000049 def test_codingspec(self):
Hye-Shik Chang84392be2006-07-06 15:21:52 +000050 try:
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000051 for enc in ALL_CJKENCODINGS:
52 print >> open(TESTFN, 'w'), '# coding:', enc
53 exec open(TESTFN)
Hye-Shik Chang84392be2006-07-06 15:21:52 +000054 finally:
55 os.unlink(TESTFN)
56
Georg Brandlb9b68ae2008-07-16 22:04:20 +000057 def test_init_segfault(self):
58 # bug #3305: this used to segfault
59 self.assertRaises(AttributeError,
60 _multibytecodec.MultibyteStreamReader, None)
61 self.assertRaises(AttributeError,
62 _multibytecodec.MultibyteStreamWriter, None)
63
64
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000065class Test_IncrementalEncoder(unittest.TestCase):
66
67 def test_stateless(self):
68 # cp949 encoder isn't stateful at all.
69 encoder = codecs.getincrementalencoder('cp949')()
70 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
71 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
72 self.assertEqual(encoder.reset(), None)
73 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
74 '\xa1\xd9\xa1\xad\xa1\xd9')
75 self.assertEqual(encoder.reset(), None)
76 self.assertEqual(encoder.encode(u'', True), '')
77 self.assertEqual(encoder.encode(u'', False), '')
78 self.assertEqual(encoder.reset(), None)
79
80 def test_stateful(self):
81 # jisx0213 encoder is stateful for a few codepoints. eg)
82 # U+00E6 => A9DC
83 # U+00E6 U+0300 => ABC4
84 # U+0300 => ABDC
85
86 encoder = codecs.getincrementalencoder('jisx0213')()
87 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
88 self.assertEqual(encoder.encode(u'\u00e6'), '')
89 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
90 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
91
92 self.assertEqual(encoder.reset(), None)
93 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
94
95 self.assertEqual(encoder.encode(u'\u00e6'), '')
96 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
97 self.assertEqual(encoder.encode('', True), '')
98
99 def test_stateful_keep_buffer(self):
100 encoder = codecs.getincrementalencoder('jisx0213')()
101 self.assertEqual(encoder.encode(u'\u00e6'), '')
102 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
103 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
104 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
105 self.assertEqual(encoder.reset(), None)
106 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
107 self.assertEqual(encoder.encode(u'\u00e6'), '')
108 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
109 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
110
Victor Stinner9a4a01d2010-05-21 22:55:31 +0000111 def test_issue5640(self):
112 encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
113 self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
114 self.assertEqual(encoder.encode(u'\n'), b'\n')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000115
116class Test_IncrementalDecoder(unittest.TestCase):
117
118 def test_dbcs(self):
119 # cp949 decoder is simple with only 1 or 2 bytes sequences.
120 decoder = codecs.getincrementaldecoder('cp949')()
121 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
122 u'\ud30c\uc774')
123 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
124 u'\uc36c \ub9c8\uc744')
125 self.assertEqual(decoder.decode(''), u'')
126
127 def test_dbcs_keep_buffer(self):
128 decoder = codecs.getincrementaldecoder('cp949')()
129 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
130 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
131 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
132
133 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
134 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
135 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
136
137 def test_iso2022(self):
138 decoder = codecs.getincrementaldecoder('iso2022-jp')()
139 ESC = '\x1b'
140 self.assertEqual(decoder.decode(ESC + '('), u'')
141 self.assertEqual(decoder.decode('B', True), u'')
142 self.assertEqual(decoder.decode(ESC + '$'), u'')
143 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
144 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
145 self.assertEqual(decoder.decode('$', True), u'\u4e16')
146 self.assertEqual(decoder.reset(), None)
147 self.assertEqual(decoder.decode('@$'), u'@$')
148 self.assertEqual(decoder.decode(ESC + '$'), u'')
149 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
150 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
151
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000152class Test_StreamReader(unittest.TestCase):
153 def test_bug1728403(self):
154 try:
155 open(TESTFN, 'w').write('\xa1')
156 f = codecs.open(TESTFN, encoding='cp949')
157 self.assertRaises(UnicodeDecodeError, f.read, 2)
158 finally:
Hye-Shik Changf9a0ea82007-06-05 19:28:15 +0000159 try: f.close()
160 except: pass
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000161 os.unlink(TESTFN)
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000162
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000163class Test_StreamWriter(unittest.TestCase):
164 if len(u'\U00012345') == 2: # UCS2
165 def test_gb18030(self):
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000166 s = StringIO.StringIO()
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000167 c = codecs.getwriter('gb18030')(s)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000168 c.write(u'123')
169 self.assertEqual(s.getvalue(), '123')
170 c.write(u'\U00012345')
171 self.assertEqual(s.getvalue(), '123\x907\x959')
172 c.write(u'\U00012345'[0])
173 self.assertEqual(s.getvalue(), '123\x907\x959')
174 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
175 self.assertEqual(s.getvalue(),
176 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
177 c.write(u'\U00012345'[0])
178 self.assertEqual(s.getvalue(),
179 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
180 self.assertRaises(UnicodeError, c.reset)
181 self.assertEqual(s.getvalue(),
182 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
183
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000184 def test_utf_8(self):
185 s= StringIO.StringIO()
186 c = codecs.getwriter('utf-8')(s)
187 c.write(u'123')
188 self.assertEqual(s.getvalue(), '123')
189 c.write(u'\U00012345')
190 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
191
192 # Python utf-8 codec can't buffer surrogate pairs yet.
193 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000194 c.write(u'\U00012345'[0])
195 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
196 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
197 self.assertEqual(s.getvalue(),
198 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
199 '\xea\xb0\x80\xc2\xac')
200 c.write(u'\U00012345'[0])
201 self.assertEqual(s.getvalue(),
202 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
203 '\xea\xb0\x80\xc2\xac')
204 c.reset()
205 self.assertEqual(s.getvalue(),
206 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
207 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
208 c.write(u'\U00012345'[1])
209 self.assertEqual(s.getvalue(),
210 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
211 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
212
213 else: # UCS4
214 pass
215
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000216 def test_streamwriter_strwrite(self):
217 s = StringIO.StringIO()
218 wr = codecs.getwriter('gb18030')(s)
219 wr.write('abcd')
220 self.assertEqual(s.getvalue(), 'abcd')
221
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000222class Test_ISO2022(unittest.TestCase):
223 def test_g2(self):
224 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
225 uni = u':hu4:unit\xe9 de famille'
226 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
227
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000228 def test_iso2022_jp_g0(self):
229 self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
230 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
231 e = u'\u3406'.encode(encoding)
232 self.failIf(filter(lambda x: x >= '\x80', e))
233
Hye-Shik Changb7883462006-10-08 13:48:34 +0000234 def test_bug1572832(self):
235 if sys.maxunicode >= 0x10000:
236 myunichr = unichr
237 else:
238 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
239
240 for x in xrange(0x10000, 0x110000):
241 # Any ISO 2022 codec will cause the segfault
242 myunichr(x).encode('iso_2022_jp', 'ignore')
243
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000244def test_main():
Collin Winterc2898c52007-04-25 17:29:52 +0000245 test_support.run_unittest(__name__)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000246
247if __name__ == "__main__":
248 test_main()