blob: 65fe5e5d82dd579c39a0c3e8da20d05d2a1bf789 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"
28
Christian Heimes05e8be12008-02-23 18:30:17 +000029import re, os, marshal, codecs
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000030
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Matches one line of a mapping file:
#   group 1: the encoded code(s), hex literals joined by '+'
#   group 2: the Unicode code(s) and/or <meta> codes (may be empty)
#   group 3: an optional trailing '#...' comment
# Raw string literals are used so that the regex escapes (\+, \s) are
# not interpreted as (invalid) Python string escapes.
# NOTE(review): the second group accepts [a-fA-Z] rather than [a-fA-F];
# this looks like a typo but is deliberately left unchanged -- confirm.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal literals joined by '+'
        (e.g. '0x41' or '0x0041+0x0300').

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None.

        The len and range parameters are kept for backward
        compatibility of the signature only.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty fragment (trailing '+') or other
            # non-hexadecimal text: ignore it
            continue
    if not values:
        # Nothing usable was found
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
69
def readmap(filename):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded codes to (unicode_code, comment) tuples.

        Codes are integers or tuples of integers as returned by
        parsecodes(); unicode_code is None for undefined mappings.

        Lines which are empty, start with '#' or don't match mapRE
        are skipped.

        If at least as many codes below 256 map to themselves as are
        left unmapped, the unmapped codes are explicitly mapped to
        (None, '') and the special key 'IDENTITY' is set to 256.

    """
    enc2uni = {}
    # Number of identity mappings seen (repeated lines count again)
    identity_count = 0
    unmapped = set(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity_count += 1
        unmapped.discard(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    # The context manager closes the file even if parsing fails
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                # Drop the leading '#'
                comment = comment[1:].strip()
            # Multi-code keys are tuples; comparing a tuple with an
            # integer raises TypeError on Python 3, so check the type
            # first
            if isinstance(enc, int) and enc < 256:
                unmapped.discard(enc)
                if enc == uni:
                    identity_count += 1
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if identity_count >= len(unmapped):
        for enc in sorted(unmapped):
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
120
def hexrepr(t, precision=4):

    """ Returns the source representation of t using hexadecimal
        literals.

        t may be None (returned as 'None'), a single integer or a
        sequence of integers (returned as parenthesized, comma
        separated list). precision is the minimum number of hex
        digits; shorter values are zero-padded.

        Raises TypeError if a sequence item cannot be formatted as
        hexadecimal number; a diagnostic is printed first.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code rather than a sequence
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
135
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines which define varname as a
        dictionary holding the given map.

        Keys and values of map may be plain codes or (code, comment)
        tuples; the comment is emitted as trailing comment if
        comments is true. Entries whose key code is None are skipped.

        If map has an 'IDENTITY' entry, the generated code starts
        with codecs.make_identity_dict() and entries below 256 which
        map to themselves are left out. The 'IDENTITY' entry is
        removed from map as a side effect.

        precisions is a tuple (key_precision, value_precision) giving
        the minimum number of hex digits used for keys and values.

    """
    def _rank(obj):
        # Sort key giving a total order over None, numbers, tuples
        # and strings; only used when the plain sort is not possible
        if obj is None:
            return (0, 0)
        if isinstance(obj, tuple):
            return (2, tuple(_rank(x) for x in obj))
        if isinstance(obj, str):
            return (3, obj)
        return (1, obj)

    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # The marker is not a mapping entry; it must be gone before
        # the entries are sorted and emitted
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    try:
        mappings = sorted(map.items())
    except TypeError:
        # Keys of different kinds (e.g. None next to integers) cannot
        # be compared on Python 3; fall back to a type-ranked order
        mappings = sorted(map.items(), key=lambda item: _rank(item[0]))
    i = 0
    key_precision, value_precision = precisions
    # NOTE(review): runs of whitespace in the format strings below may
    # have been collapsed when this source was extracted -- confirm the
    # intended spacing.
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
192
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines which define varname as a
        table of characters indexed by code, or None if no table can
        be generated.

        Keys and values of map may be plain codes or (code, comment)
        tuples. Codes without mapping and codes mapped to None are
        represented by UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a code maps to more than one code point.

        If map has an 'IDENTITY' entry, the first 256 codes default
        to mapping to themselves. The 'IDENTITY' entry is removed
        from map as a side effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    # The 'IDENTITY' marker has to be removed before sorting: it is a
    # string key, which cannot be ordered against the integer codes,
    # and it is not a mapping entry
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    # NOTE(review): runs of whitespace in the format strings below may
    # have been collapsed when this source was extracted -- confirm the
    # intended spacing.
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        if mapcomment and comments:
            append(' %a \t# %s -> %s' % (mapchar,
                                         hexrepr(key, key_precision),
                                         mapcomment))
        else:
            append(' %a' % mapchar)

    append(')')
    return l
247
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is quoted in the docstring of the generated module as
        the source of the mapping. encodingname is used in that
        docstring and, with underscores replaced by hyphens, as the
        codec name passed to codecs.CodecInfo.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code. The order of these calls is significant:
    # python_mapdef_code() removes the 'IDENTITY' entry from map,
    # which the later calls receive as well.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    # A table is preferred; the maps are only used without one
    suffix = 'table' if decoding_table_code else 'map'

    # NOTE(review): the indentation inside the template literals was
    # reconstructed as 4 spaces per level (the extracted source had its
    # whitespace collapsed) -- confirm.
    header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [header, incremental, registry]

    if decoding_table_code:
        # Decoding table, with the encoding table derived from it
        parts.append('''
### Decoding Table
''')
        parts.extend(decoding_table_code)
        parts.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # Decoding map followed by the explicit encoding map
        parts.append('''
### Decoding Map
''')
        parts.extend(decoding_map_code)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000352
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() for the given
        map to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen().

    """
    code = codegen(name,map,encodingname,comments)
    # The context manager closes the file even if the write fails
    with open(pyfile,'w') as f:
        f.write(code)
359
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be 2-tuples. name is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    # The context manager closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
368
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir to codec modules and
        marshalled mapping files.

        The output names are built from nameprefix plus the part of
        the file name before the first '.', lowercased and with
        hyphens replaced by underscores; dirprefix is prepended to
        both output file names. Entries of dir which are not regular
        files are skipped.

        A ValueError raised during conversion is reported and
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        stem = os.path.basename(mapname).replace('-','_').split('.')[0]
        name = nameprefix + stem.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            mapping = readmap(mappathname)
            if mapping:
                pymap(mappathname, mapping, dirprefix + codefile,name,comments)
                marshalmap(mappathname, mapping, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
396
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the marshalled mapping files
        (*.mapping) found in dir.

        The module for <name>.mapping is written to
        dirprefix + <name>.py. Files with other extensions are
        skipped.

        A ValueError raised during conversion is reported; processing
        then continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal data must come from a trusted source.
            # The context manager makes sure the file gets closed.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000416
if __name__ == '__main__':

    # Script entry point: all command line arguments are passed on
    # positionally to the selected function.
    import sys
    # Manual switch: the first branch is always taken and converts
    # mapping files; change the constant to regenerate the modules
    # from existing .mapping files instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])