#!/usr/bin/env python
#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

7from test import test_support
Hye-Shik Chang84392be2006-07-06 15:21:52 +00008from test.test_support import TESTFN
9import unittest, StringIO, codecs, sys, os
Georg Brandlb9b68ae2008-07-16 22:04:20 +000010import _multibytecodec
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +000011
# Names of the CJK encodings exercised by the tests below, grouped by the
# codec module noted above each group.
ALL_CJKENCODINGS = (
    # _codecs_cn
    ['gb2312', 'gbk', 'gb18030', 'hz'] +
    # _codecs_hk
    ['big5hkscs'] +
    # _codecs_jp
    ['cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
     'euc_jis_2004', 'shift_jis_2004'] +
    # _codecs_kr
    ['cp949', 'euc_kr', 'johab'] +
    # _codecs_tw
    ['big5', 'cp950'] +
    # _codecs_iso2022
    ['iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr']
)
28
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000029class Test_MultibyteCodec(unittest.TestCase):
30
31 def test_nullcoding(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000032 for enc in ALL_CJKENCODINGS:
33 self.assertEqual(''.decode(enc), u'')
34 self.assertEqual(unicode('', enc), u'')
35 self.assertEqual(u''.encode(enc), '')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000036
37 def test_str_decode(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000038 for enc in ALL_CJKENCODINGS:
39 self.assertEqual('abcd'.encode(enc), 'abcd')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000040
Hye-Shik Chang9f4b6322006-03-26 06:21:34 +000041 def test_errorcallback_longindex(self):
42 dec = codecs.getdecoder('euc-kr')
43 myreplace = lambda exc: (u'', sys.maxint+1)
44 codecs.register_error('test.cjktest', myreplace)
45 self.assertRaises(IndexError, dec,
46 'apple\x92ham\x93spam', 'test.cjktest')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000047
Hye-Shik Chang84392be2006-07-06 15:21:52 +000048 def test_codingspec(self):
Hye-Shik Chang84392be2006-07-06 15:21:52 +000049 try:
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000050 for enc in ALL_CJKENCODINGS:
51 print >> open(TESTFN, 'w'), '# coding:', enc
52 exec open(TESTFN)
Hye-Shik Chang84392be2006-07-06 15:21:52 +000053 finally:
54 os.unlink(TESTFN)
55
Georg Brandlb9b68ae2008-07-16 22:04:20 +000056 def test_init_segfault(self):
57 # bug #3305: this used to segfault
58 self.assertRaises(AttributeError,
59 _multibytecodec.MultibyteStreamReader, None)
60 self.assertRaises(AttributeError,
61 _multibytecodec.MultibyteStreamWriter, None)
62
63
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000064class Test_IncrementalEncoder(unittest.TestCase):
65
66 def test_stateless(self):
67 # cp949 encoder isn't stateful at all.
68 encoder = codecs.getincrementalencoder('cp949')()
69 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
70 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
71 self.assertEqual(encoder.reset(), None)
72 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
73 '\xa1\xd9\xa1\xad\xa1\xd9')
74 self.assertEqual(encoder.reset(), None)
75 self.assertEqual(encoder.encode(u'', True), '')
76 self.assertEqual(encoder.encode(u'', False), '')
77 self.assertEqual(encoder.reset(), None)
78
79 def test_stateful(self):
80 # jisx0213 encoder is stateful for a few codepoints. eg)
81 # U+00E6 => A9DC
82 # U+00E6 U+0300 => ABC4
83 # U+0300 => ABDC
84
85 encoder = codecs.getincrementalencoder('jisx0213')()
86 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
87 self.assertEqual(encoder.encode(u'\u00e6'), '')
88 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
89 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
90
91 self.assertEqual(encoder.reset(), None)
92 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
93
94 self.assertEqual(encoder.encode(u'\u00e6'), '')
95 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
96 self.assertEqual(encoder.encode('', True), '')
97
98 def test_stateful_keep_buffer(self):
99 encoder = codecs.getincrementalencoder('jisx0213')()
100 self.assertEqual(encoder.encode(u'\u00e6'), '')
101 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
102 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
103 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
104 self.assertEqual(encoder.reset(), None)
105 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
106 self.assertEqual(encoder.encode(u'\u00e6'), '')
107 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
108 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
109
Victor Stinnerd6703b52010-05-21 22:50:28 +0000110 def test_issue5640(self):
111 encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
112 self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
113 self.assertEqual(encoder.encode(u'\n'), b'\n')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000114
115class Test_IncrementalDecoder(unittest.TestCase):
116
117 def test_dbcs(self):
118 # cp949 decoder is simple with only 1 or 2 bytes sequences.
119 decoder = codecs.getincrementaldecoder('cp949')()
120 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
121 u'\ud30c\uc774')
122 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
123 u'\uc36c \ub9c8\uc744')
124 self.assertEqual(decoder.decode(''), u'')
125
126 def test_dbcs_keep_buffer(self):
127 decoder = codecs.getincrementaldecoder('cp949')()
128 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
129 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
130 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
131
132 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
133 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
134 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
135
136 def test_iso2022(self):
137 decoder = codecs.getincrementaldecoder('iso2022-jp')()
138 ESC = '\x1b'
139 self.assertEqual(decoder.decode(ESC + '('), u'')
140 self.assertEqual(decoder.decode('B', True), u'')
141 self.assertEqual(decoder.decode(ESC + '$'), u'')
142 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
143 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
144 self.assertEqual(decoder.decode('$', True), u'\u4e16')
145 self.assertEqual(decoder.reset(), None)
146 self.assertEqual(decoder.decode('@$'), u'@$')
147 self.assertEqual(decoder.decode(ESC + '$'), u'')
148 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
149 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
150
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000151class Test_StreamReader(unittest.TestCase):
152 def test_bug1728403(self):
153 try:
154 open(TESTFN, 'w').write('\xa1')
155 f = codecs.open(TESTFN, encoding='cp949')
156 self.assertRaises(UnicodeDecodeError, f.read, 2)
157 finally:
Hye-Shik Changf9a0ea82007-06-05 19:28:15 +0000158 try: f.close()
159 except: pass
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000160 os.unlink(TESTFN)
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000161
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000162class Test_StreamWriter(unittest.TestCase):
163 if len(u'\U00012345') == 2: # UCS2
164 def test_gb18030(self):
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000165 s = StringIO.StringIO()
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000166 c = codecs.getwriter('gb18030')(s)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000167 c.write(u'123')
168 self.assertEqual(s.getvalue(), '123')
169 c.write(u'\U00012345')
170 self.assertEqual(s.getvalue(), '123\x907\x959')
171 c.write(u'\U00012345'[0])
172 self.assertEqual(s.getvalue(), '123\x907\x959')
173 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
174 self.assertEqual(s.getvalue(),
175 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
176 c.write(u'\U00012345'[0])
177 self.assertEqual(s.getvalue(),
178 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
179 self.assertRaises(UnicodeError, c.reset)
180 self.assertEqual(s.getvalue(),
181 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
182
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000183 def test_utf_8(self):
184 s= StringIO.StringIO()
185 c = codecs.getwriter('utf-8')(s)
186 c.write(u'123')
187 self.assertEqual(s.getvalue(), '123')
188 c.write(u'\U00012345')
189 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
190
191 # Python utf-8 codec can't buffer surrogate pairs yet.
192 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000193 c.write(u'\U00012345'[0])
194 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
195 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
196 self.assertEqual(s.getvalue(),
197 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
198 '\xea\xb0\x80\xc2\xac')
199 c.write(u'\U00012345'[0])
200 self.assertEqual(s.getvalue(),
201 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
202 '\xea\xb0\x80\xc2\xac')
203 c.reset()
204 self.assertEqual(s.getvalue(),
205 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
206 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
207 c.write(u'\U00012345'[1])
208 self.assertEqual(s.getvalue(),
209 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
210 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
211
212 else: # UCS4
213 pass
214
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000215 def test_streamwriter_strwrite(self):
216 s = StringIO.StringIO()
217 wr = codecs.getwriter('gb18030')(s)
218 wr.write('abcd')
219 self.assertEqual(s.getvalue(), 'abcd')
220
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000221class Test_ISO2022(unittest.TestCase):
222 def test_g2(self):
223 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
224 uni = u':hu4:unit\xe9 de famille'
225 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
226
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000227 def test_iso2022_jp_g0(self):
Ezio Melottiaa980582010-01-23 23:04:36 +0000228 self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000229 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
230 e = u'\u3406'.encode(encoding)
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000231 self.assertFalse(filter(lambda x: x >= '\x80', e))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000232
Hye-Shik Changb7883462006-10-08 13:48:34 +0000233 def test_bug1572832(self):
234 if sys.maxunicode >= 0x10000:
235 myunichr = unichr
236 else:
237 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
238
239 for x in xrange(0x10000, 0x110000):
240 # Any ISO 2022 codec will cause the segfault
241 myunichr(x).encode('iso_2022_jp', 'ignore')
242
def test_main():
    """Run this module's tests through test_support.run_unittest()."""
    test_support.run_unittest(__name__)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000245
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()