blob: f5a1af380e0e4f59e1881fe136b1abb924c7d29a [file] [log] [blame]
Marc-André Lemburgc5694c82005-10-21 13:45:17 +00001""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000018Written by Marc-Andre Lemburg (mal@lemburg.com).
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000019
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000022
23Table generation:
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000024(c) Copyright Marc-Andre Lemburg, 2005.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000025 Licensed to PSF under a Contributor Agreement.
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000026
27"""#"
28
Christian Heimes05e8be12008-02-23 18:30:17 +000029import re, os, marshal, codecs
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000030
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing codepoint
MISSING_CODE = -1

# Matches one line of a mapping file.  The groups are: the encoded
# code(s), the Unicode code(s) (possibly empty) and an optional trailing
# '#' comment.  Codes are hex literals joined by '+'; the Unicode side
# may also contain meta-codes in angular brackets.
#
# The fragments are raw strings so that '\+' and '\s' reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
#
# NOTE(review): the Unicode group accepts [A-Z] where [A-F] would be
# expected; the pattern is kept unchanged here -- confirm the intent.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
45
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hex literals joined by '+'.  Meta-codes
        (in angular brackets, e.g. <LR> and <RL>) and any other part
        that is not a valid hex number are ignored.

        Returns MISSING_CODE if codes is empty or holds no valid code,
        the integer itself if exactly one code remains and a tuple of
        integers otherwise.

        The len and range parameters are part of the existing
        signature; range is not needed by the implementation.

    """
    if not codes:
        return MISSING_CODE
    parsed = []
    for part in codes.split('+'):
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code, empty fragment (trailing '+') or garbage
            continue
        if value != MISSING_CODE:
            parsed.append(value)
    if not parsed:
        # Nothing usable left, e.g. the input was just '<LR>'
        return MISSING_CODE
    if len(parsed) == 1:
        return parsed[0]
    return tuple(parsed)
72
def readmap(filename):

    """ Reads the mapping file filename and returns the mapping as
        dictionary.

        The dictionary maps each encoded code (an integer, or a tuple
        of integers for code combinations) to a tuple (unicode, comment),
        where unicode is whatever parsecodes() returned for the Unicode
        column and comment is the text following '#' on that line.

        Blank lines, comment lines and lines not matching mapRE are
        skipped.  If at least as many single-byte codes are identity
        mappings as are unmapped, the unmapped codes get explicit
        (MISSING_CODE, "") entries and the special key 'IDENTITY' is
        added with the value 256.

    """
    # The with block closes the file even if reading it fails
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping line; ignored silently
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        if not isinstance(enc, tuple) and enc < 256:
            # Book-keeping for the identity heuristic below; only
            # single-byte codes take part in it.  A control code that
            # the file lists explicitly is counted a second time here.
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to MISSING_CODE for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
123
def hexrepr(t, precision=4):

    """ Returns the source code representation of t.

        None is written as 'None', an object without a length as a
        hex literal with at least precision digits and anything with a
        length as a parenthesized, comma separated list of such hex
        literals.

        If an item cannot be formatted as hex number, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
138
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines which define varname as
        mapping dictionary.

        Keys and values of map may each be a plain code or a tuple
        (code, comment).  If map has an 'IDENTITY' entry, the generated
        dictionary starts out as
        codecs.make_identity_dict(range(map['IDENTITY'])) and identity
        mappings below 256 are not written again.  The 'IDENTITY' entry
        is removed from map as a side effect.

        Entries with a key of None or MISSING_CODE are left out; values
        equal to MISSING_CODE are written as None.

        Comments are added to the entries if comments is true.
        precisions gives the minimum number of hex digits for keys
        and values.

    """
    l = []
    append = l.append
    identity = "IDENTITY" in map
    if identity:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # The marker is not a mapping and must not reach sorted() below
        del map["IDENTITY"]
    else:
        append("%s = {" % varname)
        splits = 0

    key_precision, value_precision = precisions
    i = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # There is no code to write the entry for
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        if mapvalue == MISSING_CODE:
            # The placeholder is not a code; formatting it as hex
            # literal would not even give valid Python source
            mapvalue = None
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
195
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines which define varname as
        decoding table, or None if no table can be built.

        The table has one character per key from 0 up to the largest
        key in map (at least 255); keys without a mapping and keys
        mapped to MISSING_CODE get UNI_UNDEFINED.  Keys and values of
        map may each be a plain code or a tuple (code, comment).  An
        'IDENTITY' entry pre-fills the first 256 positions with identity
        mappings and is removed from map as a side effect.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a key maps to more than one code point.

        Comments are added to the entries if comments is true, using
        key_precision hex digits for the key.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        # Deal with the marker before sorting: its string key cannot be
        # ordered against the integer codes and it is not a mapping
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        # All characters are below 256: add one wider character
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
253
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is quoted in the docstring of the generated module,
        encodingname is used in that docstring as well and, with
        underscores replaced by hyphens, as name of the codec entry.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code.  The order of these calls matters: the first one
    # removes the 'IDENTITY' entry from map, so the table generator and
    # codecs.make_encoding_map() only get to see real mappings.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    # The table is preferred; the maps are only used if no table
    # could be generated
    use_table = bool(decoding_table_code)
    suffix = 'table' if use_table else 'map'

    module_header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental_classes = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    stream_and_registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [module_header, incremental_classes, stream_and_registry]

    if use_table:
        # Decoding table; the encoding table is built from it at
        # import time of the generated module
        parts.append('''
### Decoding Table
''')
        parts.extend(decoding_table_code)
        parts.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # Decoding map followed by the explicit encoding map
        parts.append('''
### Decoding Map
''')
        parts.extend(decoding_map_code)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000358
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() for map to the
        file pyfile.

        name, map, encodingname and comments are passed on to codegen().

    """
    code = codegen(name,map,encodingname,comments)
    # The with block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
365
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair; it is stored as 2-tuple.
        name is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    # The with block closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
373 f.close()
374
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file a codec module (.py) and a marshalled
        mapping (.mapping) are written.  The output name is nameprefix
        plus the first dot-separated part of the file name, lowercased
        and with hyphens replaced by underscores; dirprefix is put in
        front of the resulting file names.  comments is passed on to
        pymap().

        A ValueError during conversion is reported and re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0].lower()
        name = nameprefix + stem
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            mapping = readmap(mappathname)
            if mapping:
                pymap(mappathname, mapping, dirprefix + codefile,name,comments)
                marshalmap(mappathname, mapping, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
402
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in the directory dir.

        Every file ending in '.mapping' is loaded and written out again
        as codec module with the same base name and the extension .py;
        dirprefix is put in front of that file name.  comments is passed
        on to pymap().

        A ValueError during conversion is reported; processing then
        continues with the next file.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # The with block makes sure the mapping file is closed again
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000422
if __name__ == '__main__':

    import sys

    args = sys.argv[1:]
    if not args:
        # Without a directory there is nothing to convert; show the
        # synopsis instead of failing with a TypeError traceback
        sys.exit('usage: gencodec.py dir codec_prefix')
    # Hard-wired switch between the two modes of operation: converting
    # mapping files (default) or regenerating the codec modules from
    # existing .mapping files
    if 1:
        convertdir(*args)
    else:
        rewritepythondir(*args)