blob: c3846e9fb2bbe6c7be9efc95aa4f72ba0bce13d0 [file] [log] [blame]
Marc-André Lemburgc5694c82005-10-21 13:45:17 +00001""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000018Written by Marc-Andre Lemburg (mal@lemburg.com).
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000019
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000022
23Table generation:
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000024(c) Copyright Marc-Andre Lemburg, 2005.
Marc-André Lemburgbd20ea52005-10-25 11:53:33 +000025 Licensed to PSF under a Contributor Agreement.
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000026
27"""#"
28
Christian Heimes05e8be12008-02-23 18:30:17 +000029import re, os, marshal, codecs
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000030
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Matches one entry line of a Unicode mapping file:
#   group 1 -- the encoded code(s), e.g. "0x80" or "0x81+0x40"
#   group 2 -- the Unicode code(s) and/or <meta> codes (may be empty)
#   group 3 -- the trailing "#..." comment, if present
# Raw string literals are used so that the regex escapes (\+, \s) are not
# interpreted as (invalid) string escape sequences.
# NOTE(review): "A-Z" in the second group looks like a typo for "A-F"; it
# is deliberately left unchanged so the set of matched lines stays the same.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Turns a '+'-separated string of hex codes into integers.

        Returns None for an empty string.  A string without any '+'
        is converted directly with int(..., 16), so a malformed single
        code raises ValueError.

        For '+'-joined combinations, every part that does not parse
        as a hex number (e.g. meta-codes such as <LR> and <RL>, or an
        empty part left by a trailing '+') is dropped.  If exactly
        one code remains it is returned as an integer, otherwise the
        remaining codes are returned as a tuple (possibly empty).

        The len, filter and range parameters only bind the builtins
        as defaults; filter is not used.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Not a hex code (meta-code or empty part): ignore it
            continue
    if len(values) == 1:
        return values[0]
    return tuple(values)
70
def readmap(filename):

    """ Reads the mapping file filename and returns its decoding map.

        The result is a dictionary mapping each encoded code (an
        integer, or a tuple of integers for multi-code keys) to a
        tuple (unicode_code, comment).  unicode_code is whatever
        parsecodes() returns for the second column: an integer, a
        tuple of integers, or None.

        The codes 0x00 - 0x1F and 0x7F are pre-seeded as identity
        mappings with the comment 'CONTROL CHARACTER'; entries in the
        file override them.

        If at least as many single-byte codes are identity mapped as
        are left unmapped, the unmapped codes are added with the
        value (None, "") and the extra key 'IDENTITY' is set to 256.

        Blank lines, lines starting with '#' and lines not matching
        mapRE are ignored.

    """
    # The context manager closes the file even if reading fails
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping entry
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity/unmapped
        # bookkeeping.  Multi-code keys are tuples, and comparing a tuple
        # with an integer raises TypeError in Python 3, so test the type
        # first.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns the source representation of a code or code tuple.

        None is rendered as 'None', a single integer as a zero-padded
        upper-case hex literal with at least precision digits (e.g.
        '0x0041'), and a sequence of integers as a parenthesized,
        comma separated list of such literals.

        If an item of a sequence cannot be formatted, a diagnostic is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code.  Catching only TypeError
        # (instead of everything) keeps unrelated errors such as
        # KeyboardInterrupt from being mistaken for "not a sequence".
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines defining the dictionary varname.

        map maps codes to codes.  A key and/or a value may also be a
        tuple (code, comment); it is then unpacked and the comment is
        emitted after the entry if comments is true.  Entries whose
        key is None are skipped.

        If map contains the key "IDENTITY", the generated code starts
        from codecs.make_identity_dict(range(map["IDENTITY"])) and
        identity entries below 256 are left out.  The "IDENTITY" key
        is deleted from map, i.e. the caller's dictionary is modified.

        precisions is the pair (key_precision, value_precision) of
        minimum hex digits passed to hexrepr().

        The definition is split into several update() calls after
        every 4096 entries.

    """
    def order_key(value):
        # Total order over the key types that can occur here (None,
        # integers, strings, tuples of those).  Python 3 refuses to
        # compare e.g. None with an integer, which made a plain
        # sorted(map.items()) fail for maps with undefined entries.
        # Within one type the natural order is kept; across types the
        # order is None < numbers < strings < tuples.
        if value is None:
            return (0,)
        if isinstance(value, tuple):
            return (3, tuple(order_key(item) for item in value))
        if isinstance(value, str):
            return (2, value)
        return (1, value)

    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    # Dictionary keys are unique, so ordering by key alone is sufficient
    mappings = sorted(map.items(), key=lambda item: order_key(item[0]))
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
193
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines defining the decoding table varname.

        The table is a parenthesized sequence of string literals, one
        per code from 0 up to the largest key in map.  Codes without
        a mapping, or mapped to None, are written as UNI_UNDEFINED.

        A key and/or a value of map may be a tuple (code, comment);
        it is then unpacked and the comment is emitted after the
        entry if comments is true.  Entries whose key is None are
        skipped.

        If map contains the key 'IDENTITY', the first 256 codes start
        out as identity mappings and the key is deleted from map,
        i.e. the caller's dictionary is modified.

        Returns None if the largest key exceeds MAX_TABLE_SIZE or if
        a value is a tuple of codes (1-n mappings are not supported).

    """
    def order_key(value):
        # Total order over the key types that can occur here (None,
        # integers, strings, tuples of those); Python 3 cannot
        # compare these with each other directly.  Within one type
        # the natural order is kept; across types the order is
        # None < numbers < strings < tuples.
        if value is None:
            return (0,)
        if isinstance(value, tuple):
            return (3, tuple(order_key(item) for item in value))
        if isinstance(value, str):
            return (2, value)
        return (1, value)

    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        # Remove the marker before the mappings are collected below;
        # otherwise it would be sorted against the integer keys (a
        # TypeError in Python 3) and processed as if it were a mapping.
        del map['IDENTITY']
    mappings = sorted(map.items(), key=lambda item: order_key(item[0]))
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key, key_precision),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l
248
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is only inserted into the header docstring of the
        generated module ("generated from '<name>'").  encodingname
        is inserted into that docstring as well and, with underscores
        replaced by hyphens, used as the codec name in getregentry().

        The generated module uses a decoding table (plus an encoding
        table built from it with codecs.charmap_build) if
        python_tabledef_code() can produce one for map; otherwise it
        falls back to a decoding map and an encoding map.

        Comments are included in the source, if comments is true (default).

        The result is a single string in which tabs have been
        expanded.

    """
    # Generate code
    # All three variants are generated up front; which ones end up in the
    # output is decided below.  The same map object is handed to each call.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    # python_tabledef_code() returns None when no table can be built; the
    # suffix selects between the *_table and *_map names in the templates.
    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # Module header and the stateless Codec class
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    # Incremental encoder/decoder classes
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    # Stream classes and the getregentry() hook
    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        # With a decoding table the encoder data is derived from it when
        # the generated module is imported, so no explicit map is written.
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000353
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module generated for map to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen(); see there for their meaning.

    """
    code = codegen(name,map,encodingname,comments)
    # The generated source can contain non-ASCII characters (character
    # literals and comments) and carries no coding declaration, so it has
    # to be written as UTF-8, the default source encoding of Python 3,
    # rather than in the locale's encoding.  The context manager closes
    # the file even if the write fails.
    with open(pyfile,'w',encoding='utf-8') as f:
        f.write(code)
360
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair (code, comment); the
        pairs are copied into a plain dictionary which is then dumped
        with marshal.dump().  name is not used.

    """
    d = {e: (u,c) for e,(u,c) in map.items()}
    # The context manager closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
369
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file in dir a codec name is derived from
        the part of the file name before the first '.', with hyphens
        replaced by underscores, converted to lowercase and prefixed
        with nameprefix.  The map is then written as Python module
        <dirprefix><name>.py and as marshalled table
        <dirprefix><name>.mapping.  Files yielding an empty map are
        skipped.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0].lower()
        name = nameprefix + stem
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        targets = (mapname, dirprefix + codefile, dirprefix + marshalfile)
        print('converting %s to %s and %s' % targets)
        try:
            map = readmap(os.path.join(dir,mapname))
            if map:
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
397
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the .mapping files in dir.

        Every file in dir whose name ends in '.mapping' is loaded
        with marshal.load() and written as Python module
        <dirprefix><name>.py, <name> being the file name without the
        '.mapping' extension.  Empty maps are skipped.

        A ValueError raised during a conversion is reported; the
        remaining files are still processed.

        Note that marshal data must come from a trusted source.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # Close the mapping file as soon as it has been loaded
            # instead of leaving that to the garbage collector
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000417
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on positionally:
    #   convertdir(dir, dirprefix, nameprefix, comments)
    #   rewritepythondir(dir, dirprefix, comments)
    cli_args = sys.argv[1:]

    # Source-level switch: the script always converts mapping files;
    # set this to a false value to regenerate the modules from existing
    # .mapping files instead.
    convert_mapping_files = 1
    if convert_mapping_files:
        convertdir(*cli_args)
    else:
        rewritepythondir(*cli_args)