blob: 0c17dec924649513cac0abbdfe5f464ab3386704 [file] [log] [blame]
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp.unicode.org) and creates Python codec modules from them. The
codecs use the standard character mapping codec to actually apply the
mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
23
24import string,re,os,time,marshal
25
# Create numeric tables or character based ones ?
numeric = 1

# Matches one entry line of a mapping file:
#   group 1: one or more '0x...' hex values chained with '+'
#   group 2: zero or more '0x...' values or <meta> codes chained with '+'
#   group 3: an optional trailing comment, including its leading '#'
# NOTE(review): group 2 accepts 'A-Z' after '0x' where 'A-F' was
# probably intended; the pattern is left as is -- confirm before
# tightening it.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
34
def parsecodes(codes,

               split=str.split,atoi=int,len=len,
               filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal numbers (e.g. '0x0041')
        and/or meta-codes, joined by '+'.

        Meta-codes (in angular brackets, e.g. <LR> and <RL>) and
        any other parts which are not valid hexadecimal numbers are
        ignored.

        Returns None if codes is empty or contains no valid number,
        the integer itself if exactly one number remains and a tuple
        of integers otherwise.

        The remaining parameters merely bind helpers to local names.
        split and atoi default to str.split and int, the replacements
        for the deprecated string.split and string.atoi functions.
        filter and range are not used anymore; they are only kept so
        that existing calls continue to work.

    """
    if not codes:
        return None
    values = []
    for part in split(codes,'+'):
        try:
            values.append(atoi(part,16))
        except ValueError:
            # Meta-code, empty part or malformed number: drop it.
            # This also applies to a lone illegal code, which is
            # thus reported as None rather than raising.
            pass
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
64
def readmap(filename,

            strip=str.strip):

    """ Reads the mapping file filename and returns its contents as
        dictionary.

        The dictionary maps each parsed encoding value to a tuple
        (unicode value, comment); both values are in the format
        returned by parsecodes(), the comment is given without its
        leading '#' ('' if the line has none).

        Empty lines, comment lines and lines not matching mapRE are
        skipped. Entries which map a value to itself are left out.

        strip merely binds the whitespace stripping helper to a local
        name.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Release the file even if reading fails
        f.close()
    enc2uni = {}
    for line in lines:
        line = strip(line)
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping entry; silently ignored
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:]
        if enc != uni:
            enc2uni[enc] = (uni,comment)
    return enc2uni
91
def hexrepr(t,

            join=None):

    """ Returns t formatted as source code using hexadecimal
        notation.

        None gives 'None', a single integer gives '0x' followed by
        at least four hex digits and a sequence of integers gives
        the comma separated items enclosed in parentheses.

        join may be set to a function join(words, separator) which
        is then used to build the sequence format; the default is to
        use the string method of the separator.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return '0x%04x' % t
    items = ['0x%04x' % code for code in t]
    if join is None:
        return '(' + ', '.join(items) + ')'
    return '(' + join(items,', ') + ')'
103
def unicoderepr(t,

                join=None):

    """ Returns the source code representation of the Unicode side
        t of a mapping.

        None gives 'None'. If the module global numeric is true, the
        result is hexrepr(t). Otherwise, the repr() of the Unicode
        character for a single code or of the string built from all
        characters for a sequence of codes is returned.

        join may be set to a function join(words, separator) which
        is then used to concatenate the characters; the default is
        to use the string method of the separator.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return repr(unichr(t))
    chars = map(unichr, t)
    if join is None:
        return repr(''.join(chars))
    return repr(join(chars,''))
118
def keyrepr(t,

            join=None):

    """ Returns the source code representation of the encoded side
        t of a mapping, i.e. of a decoding map key.

        None gives 'None'. If the module global numeric is true, the
        result is hexrepr(t). Otherwise, the repr() of a character
        string is returned: codes below 256 are converted using
        chr(), larger ones using unichr().

        For a sequence of codes, unichr() is applied to all items as
        soon as one of them does not fit chr(), so that such
        sequences no longer fail.

        join may be set to a function join(words, separator) which
        is then used to concatenate the characters; the default is
        to use the string method of the separator.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        if t < 256:
            return repr(chr(t))
        else:
            return repr(unichr(t))
    # Pick one converter for the whole sequence; the items are not
    # converted individually so that 8-bit and Unicode characters
    # never get mixed in the joined string.
    tochar = chr
    for code in t:
        if code >= 256:
            tochar = unichr
            break
    chars = map(tochar, t)
    if join is None:
        return repr(''.join(chars))
    return repr(join(chars,''))
136
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map is a dictionary mapping encoded values to either
        (unicode value, comment) tuples or plain unicode values.

        Comments are included in the source, if comments is true
        (default).

        The generated decoding_map literal is split into chunks of
        4096 entries; the chunks following the first one are added
        by calling decoding_map.update().

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s'.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map

decoding_map = {
''' % name,
        ]
    # Emit the entries ordered by encoded value
    mappings = list(map.items())
    mappings.sort()
    append = l.append
    i = 0
    splits = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            # Plain value without comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i = i + 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            # The dictionary of the generated module is called
            # decoding_map; 'map' would refer to the builtin there.
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    # Close the dictionary literal or the last update() call
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = {}
for k,v in decoding_map.items():
    encoding_map[v] = k
''')
    return '\n'.join(l)
224
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python source generated by codegen() for the
        given name, map and comments setting to the file pyfile.

        An existing file is overwritten.

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Release the file even if writing fails
        f.close()
231
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        map must be a dictionary whose values are pairs
        (unicode value, comment); these are stored as tuples. name
        is not used. An existing file is overwritten.

    """
    table = {}
    for enc,(uni,comment) in map.items():
        table[enc] = (uni,comment)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(table,f)
    finally:
        # Release the file even if marshalling fails
        f.close()
240
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every file producing a non-empty mapping a codec module
        <prefix><name>.py and a marshalled table
        <prefix><name>.mapping are written, with <name> being the
        part of the filename up to the first dot, converted to
        lowercase and with hyphens replaced by underscores. prefix
        is simply prepended to the output filenames.

        comments is passed on to pymap(). Progress is reported on
        stdout. Directory entries which are not files are skipped;
        a ValueError raised during a conversion is reported and the
        next file is processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappath = os.path.join(dir,mapname)
        if not os.path.isfile(mappath):
            # Subdirectories and the like cannot be read as mapping
            # files and would abort the whole run.
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(mappath)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')
263
def rewritepythondir(dir,prefix='',comments=1):

    """ Recreates the codec modules from the marshalled mapping
        tables found in the directory dir.

        Every file <name>.mapping is loaded and, unless the table is
        empty, written as Python source to <prefix><name>.py. prefix
        is simply prepended to the output filename; comments is
        passed on to pymap().

        Progress is reported on stdout. A ValueError raised during a
        conversion is reported and the next file is processed.

        Note that the marshal module is not safe against malformed
        data; only process .mapping files written by this tool.

    """
    import sys

    suffix = '.mapping'
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith(suffix):
            continue
        codefile = mapname[:-len(suffix)] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            f = open(os.path.join(dir,mapname),'rb')
            try:
                mapping = marshal.load(f)
            finally:
                # Release the file even if loading fails
                f.close()
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError:
            # Fetch the exception this way, since the syntax for
            # binding it differs between Python versions.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
282
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on as positional
    # arguments (dir, prefix, comments). Change the constant to 0
    # in order to recreate the modules from existing .mapping files
    # instead of converting the original mapping files.
    if 1:
        convertdir(*tuple(sys.argv[1:]))
    else:
        rewritepythondir(*tuple(sys.argv[1:]))