Fix a few bugs on cjkcodecs found by Oren Tirosh:
- gbk and gb18030 codec now handle U+30FB KATAKANA MIDDLE DOT correctly.
- iso2022_jp_2 codec now encodes into G0 for KS X 1001, GB2312
codepoints to conform the standard.
- iso2022_jp_3 and iso2022_jp_2004 codec can encode JIS X 2013:2
codepoints now.
diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py
index 1bf8583..c558f1b 100644
--- a/Lib/test/test_codecencodings_cn.py
+++ b/Lib/test/test_codecencodings_cn.py
@@ -32,6 +32,7 @@
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
("\x83\x34\x83\x31", "strict", None),
+ (u"\u30fb", "strict", None),
)
class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
@@ -45,6 +46,7 @@
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"),
+ (u"\u30fb", "strict", "\x819\xa79"),
)
has_iso10646 = True
diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py
index 397ebeb..800456e 100644
--- a/Lib/test/test_multibytecodec.py
+++ b/Lib/test/test_multibytecodec.py
@@ -202,6 +202,12 @@
uni = u':hu4:unit\xe9 de famille'
self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
+ def test_iso2022_jp_g0(self):
+ self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
+ for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
+ e = u'\u3406'.encode(encoding)
+ self.failIf(filter(lambda x: x >= '\x80', e))
+
def test_main():
suite = unittest.TestSuite()
suite.addTest(unittest.makeSuite(Test_MultibyteCodec))