blob: 1e5aced63aa0bb04a1ab549bea35bc3b349c6688 [file] [log] [blame]
Marc-André Lemburgc5694c82005-10-21 13:45:17 +00001""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000018Written by Marc-Andre Lemburg (mal@lemburg.com).
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000019
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000022
23Table generation:
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000024(c) Copyright Marc-Andre Lemburg, 2005.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000025 Licensed to PSF under a Contributor Agreement.
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000026
27"""#"
28
Christian Heimes05e8be12008-02-23 18:30:17 +000029import re, os, marshal, codecs
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000030
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing code point
MISSING_CODE = -1

# Matches one line of a mapping file:
#   group 1: the encoded side, one or more '+'-joined hex codes (0x...)
#   group 2: the Unicode side, zero or more '+'-joined hex codes or
#            meta-codes in angular brackets (e.g. <LR>)
#   group 3: an optional trailing comment, including its leading '#'
# Both sides accept hexadecimal digits only, so a malformed code such
# as 0xGG is never captured and handed on to int(..., 16).
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000045
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x0041' or '0x0041+0x0301'.

        Meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other parts that are not valid hexadecimal numbers are
        ignored, no matter whether they appear alone or as part of a
        combination.

        Empty input, or input without any valid code, is returned as
        MISSING_CODE.  A single remaining code is returned as integer,
        several codes as tuple of integers.

        The len and range parameters are only kept for backward
        compatibility of the signature; range is not used.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code such as <LR>, or the empty part left behind
            # by a trailing '+': not a code, so drop it.
            continue
    if not values:
        # Nothing but meta-codes: treat like an empty code instead
        # of raising or returning an empty tuple.
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72
def readmap(filename):

    """ Reads the mapping file filename and returns its mapping as
        dictionary.

        Keys are the encoded codes as returned by parsecodes(), values
        are (unicode_code, comment) tuples.  The dictionary may also
        carry the special key 'IDENTITY' (see below).

    """
    with open(filename) as mapfile:
        rawlines = mapfile.readlines()

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    control_codes = list(range(32)) + [127]
    enc2uni = {code: (code, 'CONTROL CHARACTER') for code in control_codes}
    identity = list(control_codes)
    unmapped = [code for code in range(256) if code not in enc2uni]

    for rawline in rawlines:
        text = rawline.strip()
        if text == '' or text.startswith('#'):
            continue
        match = mapRE.match(text)
        if match is None:
            continue
        enc = parsecodes(match.group(1))
        uni = parsecodes(match.group(2))
        comment = match.group(3)
        # Strip the leading '#' from the comment, if there is one
        comment = '' if comment is None else comment[1:].strip()
        if not isinstance(enc, tuple) and enc < 256:
            # Single byte code: keep track of what is still unmapped
            # and of what maps to itself
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # MISSING_CODE mappings for the rest
    if len(unmapped) <= len(identity):
        enc2uni.update((code, (MISSING_CODE, "")) for code in unmapped)
        enc2uni['IDENTITY'] = 256

    return enc2uni
122
def hexrepr(t, precision=4):

    """ Returns t formatted as source text using hexadecimal literals.

        t may be None (giving 'None'), a single integer, or a sequence
        of integers, which is formatted as a parenthesized,
        comma-separated list.  precision is the minimum number of hex
        digits per literal.

        Negative integers are written with the sign in front of the
        '0x' prefix (e.g. '-0x0001'), so that the result always is a
        valid Python expression.

        Raises TypeError if t, or an item of t, is not an integer; for
        sequences a diagnostic is printed before re-raising.

    """
    def hexliteral(value):
        # '%X' places the minus sign of a negative number directly
        # before the digits, which would give '0x-001' when simply
        # prefixed with '0x'.
        if value < 0:
            return '-0x%0*X' % (precision, -value)
        return '0x%0*X' % (precision, value)

    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code
        return hexliteral(t)
    try:
        return '(' + ', '.join([hexliteral(item) for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
137
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of Python source lines defining the dictionary
        varname from map.

        map maps codes to codes; keys as well as values may also be
        (code, comment) tuples, in which case the comment is emitted
        as trailing comment if comments is true.  precisions gives
        the minimum number of hex digits used for keys and values.

        If map has an 'IDENTITY' entry, the generated code starts out
        with codecs.make_identity_dict(range(<that value>)) and only
        adds the non-identity mappings.  The 'IDENTITY' entry is
        deleted from map in place.

        Keys which are None or MISSING_CODE are skipped; values equal
        to MISSING_CODE are written as None.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # Note: this modifies the caller's dictionary
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # There is no code to map from
            continue
        if mapvalue == MISSING_CODE:
            # The placeholder is not a valid code; the charmap codec
            # treats a mapping to None as undefined.
            mapvalue = None
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the dictionary display or the last .update({ call
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of Python source lines defining the decoding
        table varname from map, or None if no table can be built.

        The generated expression consists of one string literal per
        code from 0 up to the highest key in map (at least 255).
        Codes without an entry, or mapped to MISSING_CODE, are
        written as UNI_UNDEFINED.

        map keys and values may be (code, comment) tuples; comments
        are emitted as trailing comments if comments is true.
        key_precision is the minimum number of hex digits used for
        the code in these comments.

        An 'IDENTITY' entry in map pre-fills the first 256 table
        slots with identity mappings and is deleted from map in
        place.

        None is returned if the highest key exceeds MAX_TABLE_SIZE or
        if a code maps to more than one code point.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        # Note: this modifies the caller's dictionary
        del map['IDENTITY']
    # Sort only after the 'IDENTITY' entry is gone: its string key
    # cannot be ordered against the integer codes.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append(' %a \t# %s -> %s' % (mapchar,
                                         hexrepr(key, key_precision),
                                         mapcomment))
        else:
            append(' %a' % mapchar)

    # If every character is below 256, add one UNI_UNDEFINED entry
    if maxchar < 256:
        append(' %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
252
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is mentioned in the docstring of the generated module
        as the source it was generated from; encodingname is
        mentioned there as well and, with underscores replaced by
        hyphens, used as name of the codec's CodecInfo entry.

        The module decodes via a decoding table if one can be built
        for map (see python_tabledef_code) and via a decoding map
        otherwise.

        Comments are included in the source, if comments is true (default).

        Note that map is modified: an 'IDENTITY' entry is removed by
        python_mapdef_code.

    """
    # Generate code
    #
    # The order matters: the decoding map has to be generated first,
    # since this removes the 'IDENTITY' entry from map before the
    # table and the encoding map are built from it.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are Python source themselves, so their
    # indentation is significant.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000357
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for
        map to the file pyfile.

        name, encodingname and comments are passed on to codegen().

    """
    source = codegen(name, map, encodingname, comments)
    with open(pyfile, 'w') as out:
        out.write(source)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000363
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair; the pairs are stored
        as 2-tuples.  name is not used.

    """
    pairs = {enc: (uni, comment) for enc, (uni, comment) in map.items()}
    with open(marshalfile, 'wb') as out:
        marshal.dump(pairs, out)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files in directory dir to codec modules
        and marshalled mapping files.

        The output name is built from nameprefix and the first
        dot-separated part of the map's filename, with hyphens
        replaced by underscores and converted to lowercase.  The
        files are written to dirprefix + <name> + '.py' and
        dirprefix + <name> + '.mapping'.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0].lower()
        name = nameprefix + stem
        codepath = dirprefix + name + '.py'
        marshalpath = dirprefix + name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              codepath,
                                              marshalpath))
        try:
            mapping = readmap(mappathname)
            if mapping:
                pymap(mappathname, mapping, codepath, name, comments)
                marshalmap(mappathname, mapping, marshalpath)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the marshalled mapping files
        (*.mapping) found in directory dir.

        The module for <name>.mapping is written to
        dirprefix + <name> + '.py'.

        A ValueError raised during a conversion is reported; the
        remaining files are still processed.

    """
    extension = '.mapping'
    for mapname in os.listdir(dir):
        if not mapname.endswith(extension):
            continue
        name = mapname[:-len(extension)]
        codepath = dirprefix + name + '.py'
        print('converting %s to %s' % (mapname,
                                       codepath))
        try:
            # NOTE: marshal data must come from a trusted source
            with open(os.path.join(dir, mapname), 'rb') as infile:
                mapping = marshal.load(infile)
            if mapping:
                pymap(mapname, mapping, codepath, name, comments)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000419
if __name__ == '__main__':

    import sys
    # Hard-wired mode switch: as written, the command line arguments
    # are passed positionally to convertdir().  Change the 1 to 0 to
    # pass them to rewritepythondir() instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
Neal Norwitzd9108552006-03-17 08:00:19 +0000426 rewritepythondir(*sys.argv[1:])