blob: b8411ad48bed6ec10e65fca6a6ed0645a7ac1197 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
14
15import sys
16
# script name and version; both are interpolated into the "generated by"
# banner at the top of every output file
SCRIPT = sys.argv[0]
VERSION = "1.1"

# input file, relative to the current working directory
UNICODE_DATA = "UnicodeData-Latest.txt"

# general category abbreviations.  a database record stores the position
# of its category in this list.  note that "Cn" occurs twice (positions 0
# and 17); list.index() always resolves it to position 0.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
    "Co", "Cn", "Lm", "Lo", "Pc", "Pd", "Ps", "Pe",
    "Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So",
]

# bidirectional class abbreviations, stored by position like the
# categories; position 0 is the empty string
BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN",
    "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON",
]

# bit flags combined into the "flags" field of a type record
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
NUMERIC_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
39
def maketables():
    """Regenerate both C header files from the unicode database.

    Loads UNICODE_DATA and writes Modules/unicodedata_db.h (category,
    combining class, bidirectional class, mirrored flag and decomposition
    tables) and Objects/unicodetype_db.h (type flags and case mapping
    deltas).  Both paths are relative to the current working directory.
    """
    unicode = UnicodeData(UNICODE_DATA)
    _write_database_header(unicode, "Modules/unicodedata_db.h")
    _write_type_header(unicode, "Objects/unicodetype_db.h")


def _unique_records(unicode, extract):
    """Return (table, index) for the 4-tuples produced by extract.

    extract is called as extract(char, record) for every code point in
    unicode.chars that has a record.  table lists the distinct tuples,
    with the all-zero dummy record at position 0; index[char] is the
    position of that code point's tuple in table (0 if it has no record).
    """
    dummy = (0, 0, 0, 0)
    table = [dummy]
    # keyed by record, so a real record equal to the dummy reuses slot 0
    # instead of being appended a second time
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            item = extract(char, record)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    return table, index


def _database_record(char, record):
    """Return (category, combining, bidirectional, mirrored) as integers."""
    return (
        CATEGORY_NAMES.index(record[2]),
        int(record[3]),
        BIDIRECTIONAL_NAMES.index(record[4]),
        int(record[9] == "Y"),
    )


def _case_delta(field, char):
    """Return the offset from char to the hex code point in field.

    The offset is reduced modulo 2**16; an empty field gives 0.
    """
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _type_record(char, record):
    """Return (flags, upper, lower, title) for one database record."""
    category = record[2]
    flags = 0
    if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
        flags |= ALPHA_MASK
    if category == "Ll":
        flags |= LOWER_MASK
    if category == "Zs" or record[4] in ("WS", "B", "S"):
        flags |= SPACE_MASK
    if category in ("Lt", "Lu"):
        flags |= TITLE_MASK
    if category == "Lu":
        flags |= UPPER_MASK
    # use delta predictor for upper/lower/title
    return (
        flags,
        _case_delta(record[12], char),
        _case_delta(record[13], char),
        _case_delta(record[14], char),
    )


def _decomposition_index(unicode):
    """Return (decomp_data, decomp_index).

    decomp_data lists the distinct decomposition strings, with "" at
    position 0; decomp_index[char] is the position of that code point's
    decomposition (0 if it has none).
    """
    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...
    decomp_data = [""]
    positions = {"": 0}  # string -> position; avoids a linear list search
    decomp_index = [0] * len(unicode.chars)
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            i = positions.get(record[5])
            if i is None:
                positions[record[5]] = i = len(decomp_data)
                decomp_data.append(record[5])
            decomp_index[char] = i
    return decomp_data, decomp_index


def _write_records(fp, comment, declaration, table):
    """Write the "generated by" banner and the C array of 4-tuples."""
    fp.write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
    fp.write("\n")
    fp.write("/* %s */\n" % comment)
    fp.write(declaration + " = {\n")
    for item in table:
        fp.write(" {%d, %d, %d, %d},\n" % item)
    fp.write("};\n")
    fp.write("\n")


def _write_string_table(fp, declaration, names):
    """Write a NULL-terminated C array of string literals."""
    fp.write(declaration + " = {\n")
    for name in names:
        fp.write(" \"%s\",\n" % name)
    fp.write(" NULL\n")
    fp.write("};\n")


def _write_split_index(fp, comment, shift_name, name1, name2, index):
    """Split index with splitbins() and write the shift and both arrays."""
    index1, index2, shift = splitbins(index)
    fp.write("/* %s */\n" % comment)
    fp.write("#define %s %d\n" % (shift_name, shift))
    Array(name1, index1).dump(fp)
    Array(name2, index2).dump(fp)


def _write_database_header(unicode, filename):
    """Write the property and decomposition tables to filename."""
    # build everything before the output file is opened (and truncated)
    table, index = _unique_records(unicode, _database_record)
    decomp_data, decomp_index = _decomposition_index(unicode)

    fp = open(filename, "w")
    try:
        _write_records(
            fp, "a list of unique database records",
            "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[]",
            table)

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c
        fp.write("/* string literals */\n")
        _write_string_table(
            fp, "const char *_PyUnicode_CategoryNames[]", CATEGORY_NAMES)
        _write_string_table(
            fp, "const char *_PyUnicode_BidirectionalNames[]",
            BIDIRECTIONAL_NAMES)
        _write_string_table(
            fp, "static const char *decomp_data[]", decomp_data)

        _write_split_index(
            fp, "index tables for the database records",
            "SHIFT", "index1", "index2", index)
        _write_split_index(
            fp, "index tables for the decomposition data",
            "DECOMP_SHIFT", "decomp_index1", "decomp_index2", decomp_index)
    finally:
        fp.close()


def _write_type_header(unicode, filename):
    """Write the character type records and their index to filename."""
    table, index = _unique_records(unicode, _type_record)

    fp = open(filename, "w")
    try:
        _write_records(
            fp, "a list of unique character type descriptors",
            "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[]",
            table)
        _write_split_index(
            fp, "type indexes", "SHIFT", "index1", "index2", index)
    finally:
        fp.close()
214
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
220
221import string, sys
222
class UnicodeData:
    """In-memory copy of a ';'-separated unicode data file.

    Public attributes:

    filename -- the path the data was loaded from
    table -- list of 65536 entries indexed by code point; each entry is
        the list of fields from that code point's line, or None if the
        file has no line for it
    chars -- the code points to process (0..65535 unless uselatin1()
        has been called)
    """

    def __init__(self, filename):
        """Load filename; raises ValueError if a line does not start
        with a hexadecimal code point, IndexError if it is > 0xFFFF."""
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
244
# stuff to deal with arrays of unsigned integers
246
class Array:
    """A named sequence of unsigned integers that can be written out
    as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a C array definition.

        The element type is the narrowest unsigned type that getsize()
        reports for the data; values are wrapped so that no emitted
        line is longer than 78 characters.
        """
        width = getsize(self.data)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                cell = str(value) + ", "
                if len(line) + len(cell) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = " " + cell
                else:
                    line = line + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
277
def getsize(data):
    """Return the smallest integer size, in bytes, that can hold every
    value in data: 1, 2 or 4.

    data is a sequence of non-negative integers.  An empty sequence
    needs no storage at all, so the smallest size (1) is returned.
    """
    if not data:
        # max() would raise ValueError on an empty sequence
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
287
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t gives ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys
    t = tuple(t)    # so slices can be dict keys
    if not t:
        # nothing to split; also keeps getsize() away from empty input
        return [], [], 0
    report = "%d+%d bins at shift %d; %d bytes\n"
    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t)*getsize(t)))
    n = len(t)-1    # last valid index (>= 0, since t is not empty)
    maxshift = 0    # the most we can shift n and still have something left
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n
    best = None     # (t1, t2, shift) with the smallest total size so far
    best_size = 0   # its size in bytes; only meaningful once best is set
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            start = bincache.get(chunk)
            if start is None:
                # first time this chunk is seen: store it at the end of t2
                start = len(t2)
                bincache[chunk] = start
                t2.extend(chunk)
            t1.append(start >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            sys.stderr.write(report % (len(t1), len(t2), shift, b))
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write(
            "Best: " + report % (len(t1), len(t2), shift, best_size))
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000348
# regenerate both header files when run as a script
if __name__ == "__main__":
    maketables()