blob: 7e7d6d0661473cb632e2b328a1c0ee601d39e674 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"
28
Christian Heimes05e8be12008-02-23 18:30:17 +000029import re, os, marshal, codecs
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing codepoint
MISSING_CODE = -1

# Matches one mapping line of the form
#
#     <encoded codes> <whitespace> [<unicode codes>] [# comment]
#
# where codes are hexadecimal numbers (0x...) joined by '+'.  The
# Unicode column may also contain meta-codes in angular brackets and
# may be empty.  Groups: 1 = encoded codes, 2 = Unicode codes (possibly
# the empty string), 3 = comment including the leading '#' (or None).
#
# Raw strings are used so that '\+' and '\s' reach the regex engine
# without relying on Python passing unknown string escapes through.
#
# NOTE(review): the Unicode column accepts the character class
# [0-9a-fA-Z], not [0-9a-fA-F]; this looks like a typo but is kept
# unchanged here, since non-hex input currently surfaces as a
# ValueError from int() instead of being silently treated as an empty
# column -- TODO confirm before tightening.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
45
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other parts which cannot be parsed as hexadecimal numbers
        (including the empty part left by a trailing '+') are ignored.

        Returns MISSING_CODE if codes is empty or contains no valid
        code at all, a plain integer if exactly one valid code
        remains, and a tuple of integers otherwise.

        The len and range parameters are part of the historic
        signature and are kept for compatibility; range is not used.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty fragment or otherwise illegal code:
            # skip it, as documented.
            continue
    if not values:
        # Nothing usable was found; report this the same way as an
        # empty code string instead of raising or returning ().
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72
def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result maps encoded codes (integers, or tuples of integers
        for multi-code keys) to (unicode_code, comment) pairs, where
        unicode_code is whatever parsecodes() returned for the Unicode
        column of the line.

        Code points 0x00 - 0x1F and 0x7F are pre-seeded as identity
        mappings.  If at least as many identity mappings as unmapped
        single byte codes were found, the unmapped codes are added
        with MISSING_CODE as value and the extra key 'IDENTITY' is set
        to 256.

        Lines which are empty, start with '#' or do not match mapRE
        are skipped.

    """
    # The context manager closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        # Multi-code keys come back as tuples; those cannot be ordered
        # against an integer and never take part in the single byte
        # bookkeeping.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to the missing marker for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
123
def hexrepr(t, precision=4):

    """ Returns t formatted as Python source using hexadecimal literals.

        None is returned as 'None'.  Objects without a length (plain
        integers) are formatted as a single 0x... literal zero-padded
        to precision digits.  Sequences are formatted as a
        parenthesized, comma separated list of such literals.

        If an item of a sequence cannot be formatted, a diagnostic
        line is printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code.  Only TypeError is
        # caught so that unrelated errors are not swallowed.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
138
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines defining the dict varname from map.

        map keys and values may be plain codes or (code, comment)
        pairs; the comment is appended to the generated line if
        comments is true.  precisions is the (key, value) pair of
        minimum hex digit counts passed on to hexrepr().

        If map contains the key 'IDENTITY', the generated code starts
        from codecs.make_identity_dict(range(map['IDENTITY'])) and
        identity entries below 256 are left out.  The 'IDENTITY' key
        is deleted from map, i.e. the caller's dictionary is modified.

        Entries whose key is None or MISSING_CODE are skipped, values
        equal to MISSING_CODE are written as None.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # NOTE: removes the marker from the dictionary passed in
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # Missing code points cannot be used as keys; skip them
            # like python_tabledef_code() does.
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        if mapvalue == MISSING_CODE:
            # The placeholder has no valid hex literal form; emit an
            # explicit None entry for the undefined mapping instead.
            value = 'None'
        else:
            value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the dict literal or the last .update({ call
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
195
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines defining the tuple varname from map.

        The generated tuple has one character per code from 0 up to
        the largest key in map; codes without a mapping and codes
        mapped to MISSING_CODE get UNI_UNDEFINED.  map keys and values
        may be plain codes or (code, comment) pairs; comments are
        added to the generated lines if comments is true.

        If map contains the key 'IDENTITY', the first 256 codes start
        out as identity mappings and the key is deleted from map, i.e.
        the caller's dictionary is modified.

        Returns None if the largest key exceeds MAX_TABLE_SIZE or if a
        code maps to more than one code point (1-n mapping).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    # The marker has to be taken out before the items are sorted: its
    # string key cannot be ordered against the integer keys and it is
    # not a mapping entry.
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        if mapcomment and comments:
            append('    %a \t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    append(')')
    return l
250
def codegen(name, map, encodingname, comments=1):

    """ Builds the source text of a charmap codec module for map.

        name is inserted into the docstring of the generated module as
        the file the codec was generated from.  encodingname is
        inserted into that docstring as well and, with underscores
        replaced by hyphens, used as the CodecInfo name.

        A decoding table is used whenever python_tabledef_code() can
        build one; the module then derives its encoding table via
        codecs.charmap_build().  Otherwise decoding and encoding maps
        are written out as dictionaries.

        Comments are included in the source, if comments is true
        (default).

    """
    # Generate the code fragments.  The calls are kept in this order:
    # python_mapdef_code() deletes the 'IDENTITY' marker from map, so
    # the later calls work on the dictionary without it.
    map_lines = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    table_lines = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_lines = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    # Name suffix of the objects used by the generated codec classes
    suffix = 'table' if table_lines else 'map'

    module_header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental_classes = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    stream_and_registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    chunks = [module_header, incremental_classes, stream_and_registry]

    if table_lines:
        # Decoding table plus the encoding table derived from it
        chunks.append('''
### Decoding Table
''')
        chunks.extend(table_lines)
        chunks.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # No table available: write both maps explicitly
        chunks.append('''
### Decoding Map
''')
        chunks.extend(map_lines)
        chunks.append('''
### Encoding Map
''')
        chunks.extend(encoding_lines)

    # Final new-line
    chunks.append('')

    return '\n'.join(chunks).expandtabs()
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000355
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module generated by codegen() to pyfile.

        name, map, encodingname and comments are passed on to
        codegen() unchanged.

    """
    code = codegen(name,map,encodingname,comments)
    # The context manager closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
362
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to marshalfile.

        Every value of map must be a pair; it is stored as a 2-tuple.
        name is accepted for symmetry with pymap() and is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    # The context manager closes the file even if dumping fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir to codec modules.

        For every regular file in dir a codec name is derived from the
        part of the file name before the first '.', with hyphens
        replaced by underscores, lower-cased and prefixed with
        nameprefix.  The codec source is written to
        dirprefix + <name>.py and the marshalled mapping to
        dirprefix + <name>.mapping.

        A ValueError raised during the conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue

        # Derive the codec name from the file name
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0]
        name = nameprefix + stem.lower()

        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if map:
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the .mapping files found in dir.

        Every file in dir whose name ends in '.mapping' is loaded with
        marshal and passed to pymap(), which writes the codec source
        to dirprefix + <name>.py.  Other files are ignored.

        A ValueError raised during the conversion is reported and the
        remaining files are still processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # The context manager closes the file right after loading
            # instead of leaving that to the garbage collector
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
Marc-André Lemburgc5694c82005-10-21 13:45:17 +0000419
if __name__ == '__main__':

    import sys

    # All command line arguments are passed on positionally
    cli_args = sys.argv[1:]

    # Hard-wired mode switch: the first branch converts mapping files,
    # the second one regenerates the modules from .mapping files.
    if 1:
        convertdir(*cli_args)
    else:
        rewritepythondir(*cli_args)
Neal Norwitzd9108552006-03-17 08:00:19 +0000426 rewritepythondir(*sys.argv[1:])