blob: 75337d6dbb2c96629f4b89f8ad13119bde900675 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

"""#"
24
Walter Dörwaldaaab30e2002-09-11 20:36:02 +000025import re,os,time,marshal
Guido van Rossum34a79112000-03-10 22:36:57 +000026
# Create numeric tables or character based ones ?
# A true value makes the repr helpers in this file emit hexadecimal
# integer literals; a false value makes them emit character literals.
numeric = 1

# Matches one data line of a mapping file:
#   group 1: encoded value -- one or more 0x... codes joined by '+'
#   group 2: Unicode value -- zero or more 0x... codes or <META> codes
#            joined by '+'
#   group 3: optional trailing comment, including the leading '#'
# Raw strings are used so that the regex escapes (\+ and \s) do not rely
# on Python passing unknown string escapes through unchanged.
# NOTE(review): the 'a-fA-Z' class in group 2 looks like a typo for
# 'a-fA-F'; it is left as is because narrowing it would change which
# lines match -- confirm before fixing.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
35
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of base-16 codes (e.g. '0x41') joined by
        '+'.

        In a combination (more than one '+'-separated part), parts
        that are not valid base-16 integers -- such as meta-codes in
        angular brackets, e.g. <LR> and <RL> -- are ignored. If exactly
        one part survives it is returned as a plain integer, otherwise
        the surviving parts are returned as a tuple (possibly empty).

        An empty or None codes argument returns None.

        A single part (no '+') is converted directly, so an illegal
        single code raises ValueError.

        The len, filter and range parameters are kept only for
        backward compatibility of the signature; filter and range are
        no longer used.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    # Build the list of valid codes directly instead of relying on
    # filter() returning a list, which is not true on every Python.
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or otherwise unparsable part: skip it.
            pass
    if len(values) == 1:
        return values[0]
    return tuple(values)
63
def readmap(filename):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded values to (unicode value, comment) tuples.

        Blank lines, lines starting with '#' and lines not matched by
        mapRE are skipped. Encoded and Unicode values are converted
        with parsecodes(); a ValueError raised there propagates to the
        caller. The comment has its leading '#' removed and is '' when
        the line has none.

        If at least as many codes below 256 map to themselves as are
        left without any mapping, the identity entries are left out of
        the dictionary, every unmapped code below 256 is added with the
        value (None, "") and the extra key 'IDENTITY' is set to 256.
        Otherwise the identity entries are stored like any other
        mapping.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        f.close()
    enc2uni = {}
    # (code, comment) pairs for codes below 256 that map to themselves
    identity = []
    # Codes below 256 for which no mapping line has been seen yet
    unmapped = list(range(256))
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            comment = comment[1:]
        # Multi-code keys are tuples and never count as single bytes;
        # the explicit type test avoids comparing a tuple with an int.
        if not isinstance(enc, tuple) and enc < 256:
            # A code may be listed more than once; only the first
            # occurrence is still present in unmapped.
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append((enc, comment))
            else:
                enc2uni[enc] = (uni,comment)
        else:
            enc2uni[enc] = (uni,comment)
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity)>=len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # mappings have to be listed explicitly or they would be lost.
        for enc, comment in identity:
            if enc not in enc2uni:
                enc2uni[enc] = (enc, comment)

    return enc2uni
106
def hexrepr(t):

    """ Returns t formatted as source text using '0x%04x' hex literals.

        None gives 'None'. An object without a length (e.g. an
        integer) gives a single literal such as '0x0041'. A sequence
        gives its items formatted the same way, joined by ', ' and
        wrapped in parentheses, e.g. '(0x0041, 0x0042)'.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code.
        return '0x%04x' % t
    return '(' + ', '.join(['0x%04x' % code for code in t]) + ')'
Guido van Rossum34a79112000-03-10 22:36:57 +0000116
def unicoderepr(t):

    """ Returns source text for the Unicode side of a mapping entry.

        None gives 'None'. When the module-level flag numeric is true
        the result is hexrepr(t). Otherwise an object without a length
        is converted with unichr() and a sequence is converted item by
        item with unichr() and joined; the repr() of the resulting
        string is returned.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code point.
        return repr(unichr(t))
    return repr(''.join(map(unichr, t)))
Guido van Rossum34a79112000-03-10 22:36:57 +0000129
def keyrepr(t):

    """ Returns source text for the encoded (key) side of a mapping
        entry.

        None gives 'None'. When the module-level flag numeric is true
        the result is hexrepr(t). Otherwise an object without a length
        is converted with chr() if it is below 256 and with unichr()
        if not, and a sequence is converted item by item with chr()
        and joined; the repr() of the resulting string is returned.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code.
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    return repr(''.join(map(chr, t)))
Guido van Rossum34a79112000-03-10 22:36:57 +0000145
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.

        map maps encoded values to either (unicode value, comment)
        tuples or to bare unicode values. If it has the key "IDENTITY",
        the generated decoding map starts out as
        codecs.make_identity_dict(range(map["IDENTITY"])) and the
        entries of map are applied on top of it.

        Comments are included in the source, if comments is true (default).

        Side effect: the "IDENTITY" key, if present, is deleted from
        the map object passed in.

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
''' % name,
        ]

    # splits counts the dictionary literals opened so far beyond the
    # initial "decoding_map = {"; it decides whether the currently open
    # literal has to be closed with '}' or with '})'.
    if "IDENTITY" in map:
        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
                 % map["IDENTITY"])
        l.append("decoding_map.update({")
        splits = 1
        # Removed so that the loop below only sees real mapping entries.
        del map["IDENTITY"]
    else:
        l.append("decoding_map = {")
        splits = 0

    mappings = list(map.items())
    mappings.sort()
    append = l.append
    i = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            # Bare value without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)
''')
    return '\n'.join(l)
Guido van Rossum34a79112000-03-10 22:36:57 +0000235
def pymap(name,map,pyfile,comments=1):

    """ Writes the source returned by codegen(name,map,comments) to
        the file pyfile, replacing any existing content.

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Close the file even if the write fails.
        f.close()
242
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map is unpacked as a (unicode value, comment)
        pair and stored again as a tuple under the same key; a value
        that cannot be unpacked that way raises an exception.

        name is accepted but not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Close the file even if marshalling fails.
        f.close()
251
def convertdir(dir,prefix='',comments=1):

    """ Converts every mapping file found in the directory dir.

        For each regular file, the output name is the part of the file
        name before the first '.', lower-cased and with '-' replaced by
        '_'. The codec source is written to prefix + name + '.py' and
        the marshalled map to prefix + name + '.mapping'. comments is
        passed on to pymap().

        Directory entries that are not regular files are skipped. A
        ValueError raised during conversion is reported on standard
        output and the next file is processed.

    """
    import sys
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir,mapname)
        if not os.path.isfile(mappathname):
            # Sub-directories and the like cannot be opened as maps.
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(mappathname)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            # sys.exc_info() keeps this independent of the
            # version-specific "except ... as/," binding syntax.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
274
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec sources from the marshalled maps in dir.

        Every directory entry whose name ends in '.mapping' is loaded
        with marshal.load() and, unless the loaded map is empty, passed
        to pymap() to write prefix + <name without '.mapping'> + '.py'.
        comments is passed on to pymap().

        A ValueError raised while loading or converting is reported on
        standard output and the next file is processed.

    """
    import sys
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            f = open(os.path.join(dir,mapname),
                     'rb')
            try:
                mapping = marshal.load(f)
            finally:
                # Do not leave the handle open until garbage collection.
                f.close()
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError:
            # sys.exc_info() keeps this independent of the
            # version-specific "except ... as/," binding syntax.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
293
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on positionally, as
    # strings: dir, then optionally prefix and comments.
    # Manual switch: change the condition to 0 to regenerate the .py
    # files from existing .mapping files instead of converting maps.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
300 apply(rewritepythondir,tuple(sys.argv[1:]))