#
# genmap_schinese.py: Simplified Chinese Codecs Map Generator
#
# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
#
7import os
8import re
9
10from genmap_support import *
11
12
# Byte-range pairs handed to DecodeMapWriter.update_decode_map() in main().
# Each constant is a two-element (low, high) tuple; the *_C1 value is passed
# as the first argument and the *_C2 value as the second.
# NOTE(review): presumably C1 is the lead byte and C2 the trail byte of a
# double-byte sequence, with inclusive bounds -- confirm in genmap_support.

# GB2312 (one pair, both bytes share the same range).
GB2312_C1 = GB2312_C2 = (0x21, 0x7E)

# GBK extension, levels 1 and 2.
GBKL1_C1 = (0x81, 0xA8)
GBKL1_C2 = (0x40, 0xFE)
GBKL2_C1 = (0xA9, 0xFE)
GBKL2_C2 = (0x40, 0xA0)

# GB18030 extension, parts 1 through 5.
GB18030EXTP1_C1 = (0xA1, 0xA9)
GB18030EXTP1_C2 = (0x40, 0xFE)
GB18030EXTP2_C1 = (0xAA, 0xAF)
GB18030EXTP2_C2 = (0xA1, 0xFE)
GB18030EXTP3_C1 = (0xD7, 0xD7)
GB18030EXTP3_C2 = (0xFA, 0xFE)
GB18030EXTP4_C1 = (0xF8, 0xFD)
GB18030EXTP4_C2 = (0xA1, 0xFE)
GB18030EXTP5_C1 = (0xFE, 0xFE)
GB18030EXTP5_C2 = (0x50, 0xFE)

# Source URLs passed to open_mapping_file() together with the local paths.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = (
    'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/'
    'CP936.TXT'
)
MAPPINGS_GB18030 = (
    'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/'
    'gb-18030-2000.xml'
)
33
# Matches one ``<a u="XXXX" b="HH HH ..."/>`` element of the GB18030 XML
# mapping.  Group 1 is a four-digit uppercase hex code point, group 2 the
# space-separated uppercase hex bytes of the native encoding.
re_gb18030ass = re.compile(r'<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping read from the file object *fo*.

    Returns a ``(decmap, unilinear)`` tuple:

    * ``decmap`` is a nested dict ``{byte1: {byte2: codepoint}}`` holding
      every assignment whose native encoding is exactly two bytes long.
    * ``unilinear`` is the ascending list of BMP code points (surrogate
      area excluded) that have no one- or two-byte assignment in the file.

    Raises ``KeyError`` if a one- or two-byte assignment names a code point
    that is a surrogate or that was already removed by an earlier
    assignment.
    """
    decmap = {}
    # Every non-surrogate BMP code point starts out as a candidate.
    remaining = {cp for cp in range(0x10000) if cp < 0xd800 or cp > 0xdfff}

    for uni_hex, native_hex in re_gb18030ass.findall(fo.read()):
        # int(..., 16) rather than eval(): the text comes from a downloaded
        # file, and parsing hex digits needs no code evaluation.
        codepoint = int(uni_hex, 16)
        native = [int(byte_hex, 16) for byte_hex in native_hex.split()]

        if len(native) <= 2:
            # set.remove() raises KeyError on a missing element, exactly as
            # ``del`` on the dict-based bookkeeping did.
            remaining.remove(codepoint)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            decmap.setdefault(native[0], {})[native[1]] = codepoint

    return decmap, sorted(remaining)
53
def main():
    """Generate ``mappings_cn.h`` from the GB2312, CP936 and GB18030 tables.

    The three mapping files are obtained through ``open_mapping_file()``
    (local path plus source URL) and reduced so that each table only holds
    what the previous one does not: the GB18030 table loses every GBK
    entry, and the GBK table loses every entry it shares with GB2312.
    The resulting decode/encode maps and the GB18030 linear BMP ranges are
    then written, in a fixed order, to ``mappings_cn.h`` in the current
    directory.
    """
    print("Loading Mapping File...")
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Keep in the GB18030 table only what GBK does not already cover.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]

    # Drop from GBK the entries that equal their GB2312 counterpart; the
    # GBK position is the GB2312 one with the high bit set on both bytes.
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    # Encode maps are keyed by the high and low byte of the code point.
    # GB2312 entries are stored after the GBK ones, so they win whenever
    # both tables map the same code point.
    gb2312_gbkencmap, gb18030encmap = {}, {}
    for c1, m in gbkdecmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB set
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB unset
    for c1, m in gb18030decmap.items():
        for c2, code in m.items():
            gb18030encmap.setdefault(code >> 8, {})
            gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    # Named explicitly (parts 1..5, in order) instead of being looked up
    # through eval() on a formatted identifier.
    gb18030ext_ranges = (
        (GB18030EXTP1_C1, GB18030EXTP1_C2),
        (GB18030EXTP2_C1, GB18030EXTP2_C2),
        (GB18030EXTP3_C1, GB18030EXTP3_C2),
        (GB18030EXTP4_C1, GB18030EXTP4_C2),
        (GB18030EXTP5_C1, GB18030EXTP5_C2),
    )

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        for c1range, c2range in gb18030ext_ranges:
            writer.update_decode_map(c1range, c2range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        # Each range is [first, last, base]; the leading [-1, -1, -1] is a
        # sentinel so the first real code point always opens a new range.
        ranges = [[-1, -1, -1]]
        gblinnum = 0
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4 first, last;
    DBCHAR base;
} gb18030_to_unibmp_ranges[] = {
""")

        for uni in gb18030unilinear:
            if uni == ranges[-1][1] + 1:
                # Contiguous with the current range: extend it.
                ranges[-1][1] = uni
            else:
                # Gap: open a new range based at the running linear index.
                ranges.append([uni, uni, gblinnum])
            gblinnum += 1

        filler = BufferedFiller()
        for first, last, base in ranges[1:]:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminating entry: first/last are 0 and base is one past the
        # linear index covered by the final range.
        filler.write('{', '0,', '0,', str(
            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
        filler.printout(fp)

    print("Done!")
146
147
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()