Dong-hee Na | 113feb3 | 2020-04-30 02:34:24 +0900 | [diff] [blame^] | 1 | # |
| 2 | # genmap_schinese.py: Simplified Chinese Codecs Map Generator |
| 3 | # |
| 4 | # Original Author: Hye-Shik Chang <perky@FreeBSD.org> |
| 5 | # Modified Author: Dong-hee Na <donghee.na92@gmail.com> |
| 6 | # |
| 7 | import os |
| 8 | import re |
| 9 | |
| 10 | from genmap_support import * |
| 11 | |
| 12 | |
# Byte-range constants.  Each value is a (low, high) pair; main() hands the
# matching *_C1 / *_C2 pair to DecodeMapWriter.update_decode_map().
# Presumably C1 bounds the first byte and C2 the second byte of a two-byte
# code, with inclusive bounds -- confirm against genmap_support.
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)
# The GBK extension decode map is registered in main() as two regions
# (GBKL1 and GBKL2).
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)
# The GB18030 extension decode map is registered in main() as five regions
# (EXTP1 .. EXTP5).
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# Upstream locations of the mapping files.  main() passes each URL to
# open_mapping_file() together with a local path under python-mappings/.
# NOTE(review): presumably the URL is only fetched when the local copy is
# missing -- confirm in genmap_support.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
| 33 | |
# Matches one <a u="XXXX" b="HH HH ..."/> assignment element: group 1 is the
# code point as exactly four uppercase hex digits, group 2 the
# space-separated native bytes in uppercase hex.
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping into a decode map and a linear list.

    Args:
        fo: Readable object whose read() returns the XML text as a str.

    Returns:
        A tuple (m, gbuni):
          * m is a two-level dict where m[byte1][byte2] is the code point
            of every two-byte assignment found in the file.
          * gbuni is the ascending list of code points below 0x10000,
            surrogates excluded, that have no one- or two-byte assignment
            in the file.

    Raises:
        KeyError: If the file assigns a one- or two-byte sequence to the
            same code point twice, or to a surrogate code point.
    """
    m = {}
    # Start from every BMP code point except the Unicode surrogate area.
    gbuni = set(range(0xd800)) | set(range(0xe000, 0x10000))
    for uni, native in re_gb18030ass.findall(fo.read()):
        # int(..., 16) parses the hex text without evaluating file content
        # as Python code.
        uni = int(uni, 16)
        native = [int(u, 16) for u in native.split()]
        if len(native) <= 2:
            gbuni.remove(uni)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            m.setdefault(native[0], {})[native[1]] = uni
    return m, sorted(gbuni)
| 53 | |
def _add_encode_entries(encmap, decmap):
    """Fold a two-level decode map into a two-level encode map, in place.

    For every decmap[c1][c2] == code, the two-byte value (c1 << 8 | c2) is
    stored at encmap[code >> 8][code & 0xff]; an existing entry for the
    same code is overwritten.
    """
    for c1, row in decmap.items():
        for c2, code in row.items():
            encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2


def _linear_ranges(codepoints):
    """Group ascending code points into [first, last, base] runs.

    Consecutive code points are merged into one run; base is the index of
    the run's first code point within codepoints.  No sentinel entry is
    used, so a leading code point 0 is kept as part of the first run.
    """
    ranges = []
    for index, uni in enumerate(codepoints):
        if ranges and uni == ranges[-1][1] + 1:
            ranges[-1][1] = uni
        else:
            ranges.append([uni, uni, index])
    return ranges


def main():
    """Generate mappings_cn.h from the GB2312, CP936 and GB18030 mappings.

    Loads the three mapping files, removes from each larger map the entries
    already covered by the smaller one, builds the reverse (encode) maps,
    and writes all tables to 'mappings_cn.h' in the current directory.
    open_mapping_file, loadmap, print_autogen, DecodeMapWriter,
    EncodeMapWriter and BufferedFiller are not defined in this file; they
    are presumably provided by the star import from genmap_support.
    """
    print("Loading Mapping File...")
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Keep only the GB18030 two-byte entries that GBK does not define.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]

    # Drop the GBK entries that duplicate GB2312 (same code point at the
    # same position once the high bit of both bytes is set).
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    gb2312_gbkencmap, gb18030encmap = {}, {}
    # GBK entries go in first (MSB set); GB2312 entries (MSB unset) are
    # added second and therefore win where both map the same code point.
    _add_encode_entries(gb2312_gbkencmap, gbkdecmap)
    _add_encode_entries(gb2312_gbkencmap, gb2312decmap)
    _add_encode_entries(gb18030encmap, gb18030decmap)

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        # Explicit table instead of eval()-based name lookup.
        gb18030ext_regions = (
            (GB18030EXTP1_C1, GB18030EXTP1_C2),
            (GB18030EXTP2_C1, GB18030EXTP2_C2),
            (GB18030EXTP3_C1, GB18030EXTP3_C2),
            (GB18030EXTP4_C1, GB18030EXTP4_C2),
            (GB18030EXTP5_C1, GB18030EXTP5_C2),
        )
        for c1range, c2range in gb18030ext_regions:
            writer.update_decode_map(c1range, c2range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4 first, last;
    DBCHAR base;
} gb18030_to_unibmp_ranges[] = {
""")

        ranges = _linear_ranges(gb18030unilinear)

        filler = BufferedFiller()
        for first, last, base in ranges:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminating entry: first == last == 0, base == linear index just
        # past the final run (0 when there are no runs at all).
        if ranges:
            first, last, base = ranges[-1]
            end_base = base + last - first + 1
        else:
            end_base = 0
        filler.write('{', '0,', '0,', str(end_base), '}', '};')
        filler.printout(fp)

    print("Done!")
| 146 | |
| 147 | |
# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()