Dong-hee Na | 113feb3 | 2020-04-30 02:34:24 +0900 | [diff] [blame] | 1 | # |
| 2 | # genmap_ja_codecs.py: Japanese Codecs Map Generator |
| 3 | # |
| 4 | # Original Author: Hye-Shik Chang <perky@FreeBSD.org> |
| 5 | # Modified Author: Dong-hee Na <donghee.na92@gmail.com> |
| 6 | # |
| 7 | import os |
| 8 | |
| 9 | from genmap_support import * |
| 10 | |
# (first, last) byte bounds passed in main() to DecodeMapWriter.update_decode_map()
# as (C1 range, C2 range).  NOTE(review): presumably inclusive bounds for the
# first and second byte of a double-byte code -- confirm against genmap_support.
JISX0208_C1, JISX0208_C2 = (0x21, 0x74), (0x21, 0x7e)
JISX0212_C1, JISX0212_C2 = (0x22, 0x6d), (0x21, 0x7e)
JISX0213_C1, JISX0213_C2 = (0x21, 0x7e), (0x21, 0x7e)
# patches between shift-jis and cp932
CP932P0_C1, CP932P0_C2 = (0x81, 0x81), (0x5f, 0xca)
# CP932 P1
CP932P1_C1, CP932P1_C2 = (0x87, 0x87), (0x40, 0x9c)
# CP932 P2
CP932P2_C1, CP932P2_C2 = (0xed, 0xfc), (0x40, 0xfc)

# Source locations of the mapping tables; main() passes each one to
# open_mapping_file() together with the local path of the table.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
| 28 | |
| 29 | |
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode maps.

    Everything from the first ``#`` on a line is discarded.  A remaining line
    needs at least two whitespace-separated columns, otherwise it is skipped:

    * column 0: ``<level digit><separator><hex code>`` (e.g. ``3-2121``);
      the first character is the level and everything from index 2 on is
      the hexadecimal native code.
    * column 1: a two-character prefix followed by either one hexadecimal
      code point (e.g. ``U+3000``) or a four-digit base and a four-digit
      modifier separated by one character (e.g. ``U+304B+309A``).

    Args:
        fo: Iterable of text lines (for example an open text file).

    Returns:
        A tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four have the shape ``{code >> 8: {code & 0xff: value}}``:
        ``decmap3``/``decmap4`` hold level 3/4 entries below U+10000, and
        ``decmap3_2``/``decmap4_2`` hold level 3/4 entries in
        U+20000..U+2FFFF with 0x20000 subtracted from the value.
        ``decmap3_pair`` has the shape
        ``{base: {code >> 8: {code & 0xff: modifier}}}``.

    Raises:
        ValueError: If a field is not a valid number, if a single code point
            has a level other than 3 or 4 or lies outside the two supported
            ranges, or if a pair entry has a level other than 3.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() rather than eval(): the fields come straight from the data
        # file, and malformed input should surface as ValueError.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
            else:
                m = None
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before storing, so an unsupported level or code point
        # raises ValueError instead of failing on a None target map.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
| 73 | |
| 74 | |
def main():
    """Build the Japanese codec tables and write them out as C headers.

    Reads the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables
    from ``python-mappings/``, derives the encode maps from the decode maps,
    and writes ``mappings_jp.h`` and ``mappings_jisx0213_pair.h`` into the
    current working directory.

    Decode maps are nested dicts ``{c1: {c2: unicode}}``; encode maps are
    nested dicts ``{unicode >> 8: {unicode & 0xff: value}}``.

    Raises:
        SystemExit: If the JIS X 0213 table does not map 0x2124 to U+FF0C.
        ValueError: If a code present in both JIS X 0208 and JIS X 0213
            plane 1 maps to different code points and is not a pair base,
            or if the JIS X 0213 table is malformed.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice with different native columns.
    # NOTE(review): this relies on loadmap() coping with an already-consumed
    # file object -- confirm in genmap_support.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # Sanity check on the loaded JIS X 0213 table.
    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # First native code seen for a code point wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Keep only the CP932 encodings that differ from Shift JIS; iterate over
    # copies because entries are deleted along the way.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    # Pair tables: decode value packs base and modifier into one integer,
    # encode list holds (base, modifier, native code) tuples.
    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # Report (but still overwrite) code points already claimed by
            # JIS X 0208.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: entries identical to JIS X 0208 are removed from
    # jis3decmap inside the loop.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Create the row on demand; it may not exist yet when this
                    # is the first entry seen for this high byte.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Level 4 entries are stored with the MSB set.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Emit the pair table ordered by (base, modifier, native code).
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
| 249 | |
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()