blob: 7bce3d5f7f09d73029159b46fa98901b24f11747 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
Unicode table maps for decoding.

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.
(c) Copyright Marc-Andre Lemburg, 2005.

"""#"
26
27import re, os, time, marshal, codecs
28
# Largest key a generated decoding table may cover; maps with a
# larger key are not turned into a table.
MAX_TABLE_SIZE = 8192

# Code point stored in decoding tables for undefined entries
UNI_UNDEFINED = u'\ufffe'

# One data line of a mapping file: encoded code(s), whitespace, optional
# Unicode code(s) and/or <meta> markers, optional trailing '#' comment.
# Codes are hex numbers with '0x' prefix; combinations are joined by '+'.
# NOTE(review): 'A-Z' in the second group looks like a typo for 'A-F';
# it is kept as is so that the set of accepted lines does not change.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
40
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        In a combination, parts that are not valid hexadecimal
        numbers -- meta-codes in angular brackets (e.g. <LR> and
        <RL>) or empty parts -- are ignored.  If exactly one code
        remains, it is returned as an integer; otherwise the
        remaining codes are returned as a tuple (which may be empty).

        Empty codes are returned as None.  A lone code (no '+') is
        converted directly, so an invalid one raises ValueError.

        filter and range are no longer used by the body; they remain
        in the signature so that existing calls keep working.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    # Collect into a real list: the result is measured with len()
    # below, which an iterator-returning filter() would not support.
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part: ignore it
            pass
    if len(values) == 1:
        return values[0]
    return tuple(values)
68
def readmap(filename):

    """ Parses the mapping file filename and returns a decoding map.

        The result is a dictionary mapping each encoded code (an
        integer, or a tuple of integers for code combinations) to a
        tuple (unicode, comment).  unicode is whatever parsecodes()
        returns for the second column of the line; comment is the
        text after '#', stripped, or '' if there is none.

        Codes 0x00 - 0x1F and 0x7F are preset to map to themselves
        with the comment 'CONTROL CHARACTER'; lines in the file
        override these entries.

        Lines that are empty, start with '#' or do not match mapRE
        are skipped.

        If there are at least as many identity mappings as codes
        below 256 left unmapped, the unmapped codes are entered as
        (None, '') and the additional key 'IDENTITY' is set to 256.

    """
    enc2uni = {}
    # Number of identity mappings seen.  A code that is preset below
    # and also listed as identity mapping in the file counts twice.
    identity = 0
    unmapped = set(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity += 1
        unmapped.discard(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    # The with block closes the file even if parsing a line fails
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                comment = comment[1:].strip()
            # Code combinations (tuples) are never single byte codes,
            # so only plain integers take part in the bookkeeping
            if not isinstance(enc, tuple) and enc < 256:
                unmapped.discard(enc)
                if enc == uni:
                    identity += 1
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if identity >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
119
def hexrepr(t):

    """ Returns the source code representation of the code(s) t.

        None is returned as 'None', a single integer as '0x%04x' and
        a sequence of integers as parenthesized, comma separated list
        of such hex literals.

        If a sequence contains an item which cannot be formatted, a
        message is printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return '0x%04x' % t
    try:
        return '(' + ', '.join(['0x%04x' % item for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
133
def python_mapdef_code(varname, map, comments=1):

    """ Returns the source lines defining the dictionary varname.

        map maps codes to codes.  Keys and values may also be tuples
        (code, comment); the comment is then split off and, if
        comments is true, written as trailing comment of the entry.
        Entries whose key is None are skipped.

        If map has the key 'IDENTITY', the generated code starts with
        codecs.make_identity_dict(range(n)), n being the value of that
        key, and entries below 256 which map to themselves are left
        out.  The 'IDENTITY' key is removed from map (in place):
        marshalmap(), which unpacks every value as a pair, is run on
        the same dictionary after this function in convertdir().

        The definition is split into several update() calls after
        every 4096 entries.

        The result is a list of strings, one per source line.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey)
        value = hexrepr(mapvalue)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the literal opened last: a plain dictionary if nothing
    # was split off, an update() call otherwise
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
190
def python_tabledef_code(varname, map, comments=1):

    """ Returns the source lines defining the decoding table varname.

        The table is written as a parenthesized sequence of adjacent
        string literals, one line per code from 0 up to the largest
        key in map.  Codes without entry, or mapped to None, are
        written as UNI_UNDEFINED.  If comments is true, entries that
        have a comment get a trailing '# <code> -> <comment>'.

        Keys and values of map may be tuples (code, comment), see
        python_mapdef_code().  Entries whose key is None are skipped.

        If map has the key 'IDENTITY', the first 256 codes are preset
        to map to themselves and the key is removed from map (in
        place).

        Returns None if no table can be generated: the largest key
        exceeds MAX_TABLE_SIZE, or a code maps to more than one
        character (1-n mapping).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Take the snapshot only now: the 'IDENTITY' marker is not a
    # mapping and must not end up in the table or in maxkey
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l
246
def codegen(name, map, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        Comments are included in the source, if comments is true
        (default).

        The generated module defines decoding_map and encoding_map
        and, if python_tabledef_code() can build one, decoding_table.
        The generated decode() method uses decoding_table if
        available and decoding_map otherwise.

        Note that map is handed to python_mapdef_code() first, which
        removes the 'IDENTITY' key from it; the later steps work on
        the dictionary without that key.

    """
    # Generate code; the order of these calls matters, since the
    # first one modifies map
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments)

    # Module header up to the signature of decode(); its body is
    # added below, depending on whether a decoding table exists
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):
''' % name
        ]
    if decoding_table_code:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_table)''')
    else:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_map)''')

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
''')
    l.extend(decoding_map_code)

    # Add optional decoding table
    if decoding_table_code:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    l.append('''
### Encoding Map
''')
    l.extend(encoding_map_code)

    return '\n'.join(l)
324
def pymap(name,map,pyfile,comments=1):

    """ Writes the codec module source for map to the file pyfile.

        name, map and comments are passed on to codegen().  The
        source is generated before pyfile is opened, so the file is
        left untouched if the generation fails.

    """
    code = codegen(name,map,comments)
    # The with block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
331
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair (code, comment); other
        values make the unpacking below fail.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with block closes the file even if marshal.dump() fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
340
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in dir.

        For each file a codec module <prefix><name>.py and a
        marshalled map <prefix><name>.mapping are written, <name>
        being the part of the file name before the first dot,
        lowercased and with hyphens replaced by underscores.

        A ValueError raised during the conversion is reported and
        then re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        basename = os.path.split(mapname)[1]
        name = basename.replace('-','_').split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            map = readmap(mappathname)
            if map:
                # pymap() has to run first: generating the code
                # removes the 'IDENTITY' key from map
                pymap(mappathname, map, prefix + codefile,comments)
                marshalmap(mappathname, map, prefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
365
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the .mapping files in dir.

        Each file <name>.mapping in dir is loaded with marshal and
        written as codec module <prefix><name>.py.  Other files are
        ignored and empty maps are skipped.

        A ValueError raised during the conversion is reported; the
        remaining files are still processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # NOTE: marshal is not safe against malformed data; only
            # use this on .mapping files from a trusted source.
            # The with block closes the file even if loading fails.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, prefix + codefile,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
384
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on as positional
    # arguments (dir, prefix, comments).  Change the condition to
    # regenerate the modules from existing .mapping files instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
391 apply(rewritepythondir,tuple(sys.argv[1:]))