blob: 69b6ede6c287824f7012737700e08d3ab4bc2c59 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

"""#"
24
25import string,re,os,time,marshal
26
# Create numeric tables or character based ones ?
numeric = 1

# Pattern for one line of a mapping file:
#   <encoded code(s)> <whitespace> <Unicode code(s)> [# comment]
# Several codes may be joined with '+'; the Unicode column may also
# hold meta-codes in angular brackets and may be empty.
# NOTE(review): the hex class in the Unicode column reads 'a-fA-Z'
# where 'a-fA-F' was presumably meant; it is kept unchanged here so
# that exactly the same lines match -- confirm before narrowing it.
mapRE = re.compile(
    r'((?:0x[0-9a-fA-F]+\+?)+)'                    # group 1: encoded code(s)
    r'\s+'
    r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'    # group 2: Unicode code(s)
    r'\s*'
    r'(#.+)?'                                      # group 3: comment
    )
35
def parsecodes(codes,

               split=None,atoi=None,len=len,
               filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined with '+',
        e.g. '0x0041' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>), empty
        fragments (e.g. after a trailing '+') and malformed numbers
        are ignored.

        Empty codes, or codes which contain no valid number at all,
        are returned as None.

        split and atoi may be passed in to replace the string
        splitting and number parsing functions; they are called as
        split(codes,'+') and atoi(code,16). By default, the string
        method and int() are used. The remaining keyword parameters
        are only kept for backward compatibility.

    """
    if not codes:
        return None
    if split is None:
        parts = codes.split('+')
    else:
        parts = split(codes,'+')
    if atoi is None:
        atoi = int
    values = []
    for part in parts:
        try:
            values.append(atoi(part,16))
        except ValueError:
            # Not a number: meta-code, empty fragment or garbage
            pass
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
65
def readmap(filename,

            strip=None):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded codes to (unicode, comment) tuples.

        Codes are integers or tuples of integers as returned by
        parsecodes(); unicode is None for undefined entries. Blank
        lines, comment lines and lines not matching mapRE are
        skipped. If a code is defined more than once, the last
        definition wins.

        If at least as many of the codes 0-255 are identity mapped
        as are left unmapped, the identity entries are left out, the
        unmapped codes are explicitly mapped to (None, "") and the
        special key 'IDENTITY' is set to 256. Otherwise the identity
        entries are stored like all other entries.

        strip may be passed in to replace the function used for
        stripping the lines; by default the string method is used.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Release the file even if reading fails
        f.close()
    enc2uni = {}
    # Identity mapped single byte codes -> comment
    identity = {}
    unmapped = list(range(256))
    for line in lines:
        if strip is None:
            line = line.strip()
        else:
            line = strip(line)
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Lines which don't look like mapping entries are
            # skipped silently
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:]
        if not isinstance(enc,tuple) and enc < 256:
            # A code may be listed more than once; it can only be
            # removed from the unmapped list the first time
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity[enc] = comment
                if enc in enc2uni:
                    del enc2uni[enc]
            else:
                enc2uni[enc] = (uni,comment)
                if enc in identity:
                    del identity[enc]
        else:
            enc2uni[enc] = (uni,comment)
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # entries have to be listed explicitly or they would be lost
        for enc,comment in identity.items():
            enc2uni[enc] = (enc,comment)

    return enc2uni
110
def hexrepr(t,

            join=None):

    """ Returns the hexadecimal source representation of t.

        None is returned as 'None', a single code as '0x....'
        literal (at least 4 hex digits) and a sequence of codes as
        parenthesized, comma separated list of such literals.

        join may be passed in to replace the function used for
        joining the literals; it is called as join(literals,', ').
        By default the string method is used.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code
        return '0x%04x' % t
    literals = ['0x%04x' % code for code in t]
    if join is None:
        return '(' + ', '.join(literals) + ')'
    return '(' + join(literals,', ') + ')'
122
def unicoderepr(t,

                join=None):

    """ Returns the source representation of the Unicode value t.

        None is returned as 'None'. If the module global numeric is
        true, the hexadecimal representation from hexrepr() is used.
        Otherwise the repr() of the Unicode character for a single
        code, or of the string of characters for a sequence of codes,
        is returned.

        join may be passed in to replace the function used for
        joining the characters; it is called as join(chars,'').
        By default the string method is used.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code
        return repr(unichr(t))
    chars = [unichr(code) for code in t]
    if join is None:
        return repr(''.join(chars))
    return repr(join(chars,''))
137
def keyrepr(t,

            join=None):

    """ Returns the source representation of the mapping key t.

        None is returned as 'None'. If the module global numeric is
        true, the hexadecimal representation from hexrepr() is used.
        Otherwise the repr() of the character(s) is returned: codes
        below 256 are turned into 8-bit characters, all others into
        Unicode characters. This applies to single codes as well as
        to every code of a sequence.

        join may be passed in to replace the function used for
        joining the characters; it is called as join(chars,'').
        By default the string method is used.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    chars = []
    for code in t:
        # chr() only accepts codes below 256
        if code < 256:
            chars.append(chr(code))
        else:
            chars.append(unichr(code))
    if join is None:
        return repr(''.join(chars))
    return repr(join(chars,''))
155
def codegen(name,map,comments=1):

    """ Builds and returns the Python source code of a character
        mapping codec module for the given map.

        name is inserted into the docstring of the generated module.
        map maps codes to (unicode, comment) tuples or to plain
        unicode values. An 'IDENTITY' entry gives the size of the
        identity dictionary the generated decoding map starts from;
        the entry is deleted from map, i.e. map is changed in place.

        Comments are written next to the entries, if comments is true
        (default).

    """
    header = '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
''' % name
    source = [header]
    emit = source.append

    # closing holds the text which terminates the dictionary literal
    # currently being written: '}' for the initial plain literal,
    # '})' for literals passed to decoding_map.update()
    if "IDENTITY" in map:
        emit("decoding_map = codecs.make_identity_dict(range(%d))"
             % map["IDENTITY"])
        emit("decoding_map.update({")
        closing = '})'
        del map["IDENTITY"]
    else:
        emit("decoding_map = {")
        closing = '}'

    count = 0
    for enc,value in sorted(map.items()):
        try:
            (uni,comment) = value
        except TypeError:
            # Plain value without comment
            uni = value
            comment = ''
        key = keyrepr(enc)
        if comment and comments:
            emit('\t%s: %s,\t# %s' % (key,unicoderepr(uni),comment))
        else:
            emit('\t%s: %s,' % (key,unicoderepr(uni)))
        count = count + 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            emit(closing)
            emit('decoding_map.update({')
            closing = '})'
            count = 0
    emit(closing)
    emit('''
### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)
''')
    return '\n'.join(source)
250
def pymap(name,map,pyfile,comments=1):

    """ Writes the codec source generated by codegen() for the given
        name, map and comments setting to the file pyfile.

        The file is closed again even if writing fails.

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        f.close()
257
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map must be (unicode, comment) pairs; they are
        stored as tuples. name is not used.

        The file is closed again even if marshalling fails.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        f.close()
266
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every file a codec module <prefix><mapname>.py and a
        marshalled mapping <prefix><mapname>.mapping are written,
        <mapname> being the part of the filename up to the first
        dot, with hyphens replaced by underscores, in lowercase.
        comments is passed on to pymap().

        Entries of dir which are not regular files are skipped.
        Files producing an empty mapping or a ValueError are
        reported and skipped as well.

    """
    import sys

    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir,mapname)
        if not os.path.isfile(mappathname):
            # Subdirectories and the like cannot be read as mapping
            # files
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(mappathname)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            # Report the reason, just like rewritepythondir() does
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
289
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in the directory dir.

        For every file <mapname>.mapping in dir, the codec module
        <prefix><mapname>.py is written using pymap(); comments is
        passed on to pymap(). All other files are ignored.

        Mapping files which cannot be unmarshalled, which are empty
        or for which pymap() raises a ValueError are reported and
        skipped.

    """
    import sys

    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            f = open(os.path.join(dir,mapname),'rb')
            try:
                mapping = marshal.load(f)
            finally:
                # Don't leak one file handle per mapping file
                f.close()
        except (EOFError,TypeError,ValueError):
            # marshal.load() raises any of these for corrupt data
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
            continue
        if not mapping:
            print('* map is empty; skipping')
            continue
        try:
            pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError:
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
308
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on positionally to the
    # selected function: dir [prefix [comments]]
    args = tuple(sys.argv[1:])

    # Set to false to regenerate the codec modules from existing
    # .mapping files instead of converting the mapping files
    convert = 1
    if convert:
        convertdir(*args)
    else:
        rewritepythondir(*args)
315 apply(rewritepythondir,tuple(sys.argv[1:]))