""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"
28
Christian Heimesc5f05e42008-02-23 17:40:11 +000029import re, os, marshal, codecs
Marc-André Lemburgc5694c82005-10-21 13:45:17 +000030
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = u'\uFFFE'

# Matches one data line of a Unicode mapping file:
#   group 1: encoded value(s), hex numbers ("0x..") joined by "+"
#   group 2: Unicode code point(s) and/or meta-codes ("<LR>"), joined
#            by "+"; empty if the entry has no Unicode value
#   group 3: optional trailing comment, including the leading "#"
# The pattern is not anchored at the end of the line, so anything
# following the recognized parts is ignored.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal numbers (e.g. '0x20AC'),
        optionally combined with '+' (e.g. '0x0041+0x0301').

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None. This also
        holds if nothing is left after ignoring the meta-codes.

        The len, filter and range parameters are kept for signature
        compatibility only; filter and range are no longer used.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty part (e.g. after a trailing '+') or
            # some other non-hexadecimal text: ignore it
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
70
def readmap(filename):

    """ Reads the Unicode mapping file filename and returns the
        mapping as dictionary.

        The dictionary maps encoded values (as returned by
        parsecodes() for the first column) to tuples
        (unicode, comment). unicode is the parsecodes() result for
        the second column and comment the text following the '#' of
        the line ('' if there is none).

        Empty lines, lines starting with '#' and lines not matching
        mapRE are skipped.

        If the identity heuristic below applies, all unmapped codes
        below 256 are added with value (None, "") and the special
        key 'IDENTITY' is set to 256.

    """
    # The with block closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping line
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity
        # bookkeeping; tuples (code combinations) never do
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are at least as many identity-mapped entries as
    # unmapped entries, it pays to generate an identity dictionary
    # first, and add explicit mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns the source code representation of t using
        hexadecimal numbers.

        t may be None (returned as 'None'), a single integer
        (returned as e.g. '0x0041') or a sequence of integers
        (returned as e.g. '(0x0041, 0x0301)').

        precision is the minimum number of hex digits to use per
        number; shorter numbers are padded with zeros.

        A TypeError is raised (after printing a diagnostic line) in
        case an item of a sequence cannot be formatted as number.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is not a sequence: format it as single number
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the Python source lines (as list of strings) defining
        the dictionary varname with the contents of map.

        Keys and values of map may be plain codes or tuples
        (code, comment); the comment is written as trailing comment
        of the generated line if comments is true.

        precisions is a tuple (key_precision, value_precision) giving
        the minimum number of hex digits used for keys and values.

        If map has an 'IDENTITY' entry, the generated code starts
        with an identity dictionary of that size and identity
        mappings below 256 are left out. Note that the 'IDENTITY'
        entry is deleted from map in this case, i.e. the passed
        dictionary is modified in place.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        # The identity dict counts as first part of the definition
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the last part: a plain dict literal if the definition
    # was never split, an update() call otherwise
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the Python source lines (as list of strings) defining
        the decoding table varname with the contents of map.

        The table is written as sequence of Unicode string literals
        in parentheses, one per key from 0 up to the largest key in
        map. Keys without mapping and keys mapped to None are written
        as UNI_UNDEFINED.

        Keys and values of map may be plain codes or tuples
        (code, comment); the comment is written as trailing comment
        if comments is true. key_precision is the minimum number of
        hex digits used for the key in that comment.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a key maps to more than one code point.

        If map has an 'IDENTITY' entry, the first 256 keys default to
        identity mappings. Note that the 'IDENTITY' entry is deleted
        from map in this case, i.e. the passed dictionary is modified
        in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Take the items only after the 'IDENTITY' marker is gone, so
    # that the marker is never mistaken for a mapping
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t# %s -> %s' % (mapchar,
                                           hexrepr(key, key_precision),
                                           mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
250
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is only used in the docstring of the generated module
        ("generated from '<name>'"). encodingname is used in that
        docstring as well and, with underscores replaced by hyphens,
        as codec name in the generated getregentry().

        The generated module uses a decoding table together with an
        encoding table built by codecs.charmap_build(), if a table
        can be generated for map; otherwise it uses a decoding map
        and an encoding map.

        Comments are included in the source, if comments is true (default).

        Note that an 'IDENTITY' entry in map is removed by the code
        generation, i.e. map is modified in place. Tabs in the
        result are expanded to spaces.

    """
    # Generate code
    #
    # The order of these calls matters: python_mapdef_code() removes
    # the 'IDENTITY' marker from map, so the two calls that follow
    # only get to see real mappings.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000355
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the Python codec source generated by codegen() for
        map to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen() unchanged.

    """
    code = codegen(name,map,encodingname,comments)
    # The with block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
362
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be 2-item sequences
        (unicode, comment); they are stored as tuples. name is
        not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with block closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir to
        Python codec modules and marshalled mapping files.

        The output name is nameprefix plus the first dot-separated
        part of the mapping file name, lowercased and with hyphens
        replaced by underscores. The files are written to
        dirprefix + <name> + '.py' and dirprefix + <name> +
        '.mapping'; dirprefix is prepended as plain string, so it
        needs to end with a path separator to name a directory.

        comments is passed on to the code generation. Entries of dir
        which are not regular files are skipped.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                # pymap() has to run first: the code generation
                # removes the 'IDENTITY' marker from map, which
                # marshalmap() could not unpack as 2-tuple
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the Python codec modules from the marshalled
        mapping files (*.mapping) found in the directory dir.

        For each <name>.mapping the module is written to
        dirprefix + <name> + '.py'; dirprefix is prepended as plain
        string. comments is passed on to the code generation.

        A ValueError raised during a conversion is reported, then
        processing continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal.load() must only be used on trusted
            # files; it is not safe against malformed or malicious
            # data. The with block closes the file in any case.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
419
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on as positional
    # (string) arguments. Change the condition to 0 to regenerate
    # the codec modules from existing .mapping files instead of
    # converting mapping files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])