blob: fdaa8fefbb54cdaf0a516801629f8c389e34e6bb [file] [log] [blame]
Guido van Rossum34a79112000-03-10 22:36:57 +00001""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
Marc-André Lemburga866df82001-01-03 21:29:14 +00004site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
Guido van Rossum34a79112000-03-10 22:36:57 +00007
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
Fred Drakebae57a82000-03-17 16:56:23 +000015The tool also writes marshalled versions of the mapping tables to the
Guido van Rossum34a79112000-03-10 22:36:57 +000016same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
Marc-André Lemburga866df82001-01-03 21:29:14 +000021(c) Copyright Guido van Rossum, 2000.
Guido van Rossum34a79112000-03-10 22:36:57 +000022
23"""#"
24
Walter Dörwaldaaab30e2002-09-11 20:36:02 +000025import re,os,time,marshal
Guido van Rossum34a79112000-03-10 22:36:57 +000026
# Create numeric tables or character based ones ?
numeric = 1

# Matches one data line of a Unicode mapping file:
#   group 1: one or more '+'-joined hex codes of the source encoding
#   group 2: zero or more '+'-joined hex codes and/or <meta> codes
#            (the Unicode side; may be empty)
#   group 3: optional trailing comment, including the leading '#'
# Raw strings are used so that the regex escapes (\s, \+) are not
# interpreted as string-literal escapes.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
35
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None; this also
        holds when nothing is left after dropping the meta-codes.

        The len, filter and range parameters are only kept so that
        the signature stays compatible with existing callers.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part,16))
        except ValueError:
            # Meta-code, empty fragment (trailing '+') or illegal
            # code: leave it out.
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
63
def readmap(filename):

    """ Parses the Unicode mapping file filename and returns the
        mapping as dictionary.

        Keys are the codes of the source encoding as returned by
        parsecodes(); values are (unicode, comment) tuples, where
        unicode is the parsecodes() result for the Unicode side and
        comment is the trailing comment without its leading '#'
        ('' if there is none).  Blank lines, comment lines and lines
        not matched by mapRE are skipped.

        If at least as many of the codes 0-255 are mapped to
        themselves as are left unmapped, the identity entries are
        left out, the unmapped codes are entered as (None, "") and
        the special key 'IDENTITY' is set to 256.  Otherwise the
        identity entries are stored like all other entries.

    """
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    # Entries of the range 0-255 which map a code to itself; kept
    # separately until it is known whether they have to be emitted.
    identity = {}
    unmapped = set(range(256))
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print('* not matched: %s' % repr(line))
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            comment = comment[1:]
        # Multi-code entries are tuples and never belong to the
        # single byte range.
        if isinstance(enc, int) and enc < 256:
            # discard() tolerates codes which are listed more than once
            unmapped.discard(enc)
            if enc == uni:
                identity[enc] = (uni,comment)
                enc2uni.pop(enc, None)
                continue
            identity.pop(enc, None)
        enc2uni[enc] = (uni,comment)
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # entries have to be part of the explicit mapping.
        enc2uni.update(identity)

    return enc2uni
106
def hexrepr(t):

    """ Returns t formatted as Python source text.

        None gives 'None', a single integer code gives a hex literal
        with at least four digits (e.g. '0x0041') and a sequence of
        codes gives these literals separated by ', ' and enclosed in
        parentheses.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        return '0x%04x' % t
    return '(' + ', '.join(['0x%04x' % code for code in t]) + ')'
Guido van Rossum34a79112000-03-10 22:36:57 +0000116
def unicoderepr(t):

    """ Returns the Python source text for the Unicode side t of a
        mapping entry.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise it is the repr() of the
        character for a single code, or of the string built from all
        characters for a sequence of codes.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        to_unichr = unichr
    except NameError:
        # Python 3 has no unichr(); chr() accepts all code points there.
        to_unichr = chr
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        return repr(to_unichr(t))
    return repr(''.join(map(to_unichr, t)))
Guido van Rossum34a79112000-03-10 22:36:57 +0000129
def keyrepr(t):

    """ Returns the Python source text for the dictionary key t of
        a mapping entry.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise a single code below 256
        gives the repr() of chr(t), a larger single code the repr()
        of the corresponding Unicode character, and a sequence of
        codes the repr() of the string built by applying chr() to
        every code.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        to_unichr = unichr
    except NameError:
        # Python 3 has no unichr(); chr() accepts all code points there.
        to_unichr = chr
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        if t < 256:
            return repr(chr(t))
        return repr(to_unichr(t))
    return repr(''.join(map(chr, t)))
Guido van Rossum34a79112000-03-10 22:36:57 +0000145
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map maps codes to (unicode, comment) tuples; a value which
        cannot be unpacked that way is used as the unicode value
        itself, with an empty comment.

        If map has the key 'IDENTITY', the generated decoding map
        starts out as codecs.make_identity_dict(range(n)), n being
        the value stored under that key, and the entries of map are
        applied as updates.  The key is deleted from map in place;
        convertdir() hands the same dictionary to marshalmap()
        afterwards, which unpacks every value as a pair.

        Comments are included in the source, if comments is true (default).

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
''' % name,
        ]

    if "IDENTITY" in map:
        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
                 % map["IDENTITY"])
        l.append("decoding_map.update({")
        splits = 1
        del map["IDENTITY"]
    else:
        l.append("decoding_map = {")
        splits = 0

    # Single codes first, then code tuples, each group in ascending
    # order.  The explicit key keeps integer and tuple keys sortable
    # on Python 3, which refuses to compare them directly.
    mappings = sorted(map.items(),
                      key=lambda item: (isinstance(item[0], tuple), item[0]))
    append = l.append
    i = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    # Close the dictionary literal (no split, no identity dictionary)
    # or the pending update() call.
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)
''')
    return '\n'.join(l)
Guido van Rossum34a79112000-03-10 22:36:57 +0000240
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python source which codegen() returns for name,
        map and comments to the file pyfile.

        An existing file is overwritten.

    """
    code = codegen(name,map,comments)
    # The with statement closes the file even if writing fails.
    with open(pyfile,'w') as f:
        f.write(code)
247
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a (unicode, comment) pair; the
        pairs are stored as tuples.  name is not used.

    """
    d = dict([(e, (u,c)) for e,(u,c) in map.items()])
    # The with statement closes the file even if dumping fails.
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
256
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every file the map name is the part of the filename
        before the first '.', converted to lowercase with hyphens
        replaced by underscores.  The codec source is written to
        <prefix><mapname>.py and the marshalled mapping to
        <prefix><mapname>.mapping.  comments is passed on to pymap().

        Files giving an empty mapping are skipped; a ValueError
        raised during the conversion of a file is reported and the
        next file is processed.

    """
    for mapname in os.listdir(dir):
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        # print() with one pre-formatted string gives the same output
        # on Python 2 and Python 3.
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir,mapname))
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')
279
def rewritepythondir(dir,prefix='',comments=1):

    """ Recreates the codec sources from the marshalled mappings
        found in the directory dir.

        Every file named <mapname>.mapping is loaded with marshal
        and its codec source is written to <prefix><mapname>.py;
        other files are ignored.  comments is passed on to pymap().

        Empty mappings are skipped; files which cannot be loaded or
        converted (EOFError, ValueError) are reported and the next
        file is processed.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        # print() with one pre-formatted string gives the same output
        # on Python 2 and Python 3.
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # The with statement closes the file even if loading fails.
            with open(os.path.join(dir,mapname),'rb') as f:
                mapping = marshal.load(f)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
        except (EOFError, ValueError) as why:
            # marshal.load() raises EOFError for truncated or empty files
            print('* conversion failed: %s' % why)
298
if __name__ == '__main__':

    import sys
    # Manual switch: change the condition to regenerate the codec
    # sources from existing .mapping files instead of converting the
    # mapping files.  The command line arguments are passed on as
    # positional arguments (dir, prefix, comments).
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
305 apply(rewritepythondir,tuple(sys.argv[1:]))