#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbins fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
15
16import sys
17
# name and version of this script, quoted in the generated file headers
SCRIPT = sys.argv[0]
VERSION = "1.1"

# unicode database file read by maketables()
UNICODE_DATA = "UnicodeData-Latest.txt"

# general category names; maketables() stores a character's category as
# its position in this list ("Cn" appears twice, lookups hit the first)
CATEGORY_NAMES = (
    "Cn Lu Ll Lt Mn Mc Me Nd "
    "Nl No Zs Zl Zp Cc Cf Cs Co Cn Lm "
    "Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk "
    "So"
).split()

# bidirectional category names, also stored by position; slot 0 is the
# empty string
BIDIRECTIONAL_NAMES = [""] + (
    "L LRE LRO R AL RLE RLO "
    "PDF EN ES ET AN CS NSM BN B S WS "
    "ON"
).split()

# character type flags
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
41
def maketables():
    """Regenerate both C header files from the unicode database file.

    Reads UNICODE_DATA and writes, in this order:

    - Modules/unicodedata_db.h: the unique property records, the category
      and bidirectional name tables, the decomposition strings, and the
      split index tables for records and decompositions.
    - Objects/unicodetype_db.h: the unique character type records and
      their split index tables.

    The number of unique type records is reported on sys.stdout.  The
    output files are written through their own file objects and are
    always closed; sys.stdout is never rebound.
    """
    db = UnicodeData(UNICODE_DATA)

    # 1) database properties
    table, index = _unique_records(db, _database_item, (0, 0, 0, 0))

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...
    decomp_data, decomp_index = _decomposition_tables(db)

    _write_unicodedata_db("Modules/unicodedata_db.h",
                          table, index, decomp_data, decomp_index)

    # 3) unicode type data
    table, index = _unique_records(db, _type_item, (0, 0, 0, 0, 0, 0))

    sys.stdout.write("%d ctype entries\n" % len(table))

    _write_unicodetype_db("Objects/unicodetype_db.h", table, index)


def _unique_records(db, make_item, dummy):
    # Return (table, index): the distinct items produced by make_item and,
    # for each character in db.chars, the position of its item in table.
    table = [dummy]
    # map item -> position; seeding it with the dummy lets a real record
    # that equals the dummy share slot 0 instead of being stored twice
    cache = {dummy: 0}
    # characters without a database record keep index 0 (the dummy)
    index = [0] * len(db.chars)
    for char in db.chars:
        record = db.table[char]
        if record:
            item = make_item(char, record)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    return table, index


def _database_item(char, record):
    # Return (category, combining, bidirectional, mirrored) for a record.
    category = CATEGORY_NAMES.index(record[2])
    combining = int(record[3])
    bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
    mirrored = record[9] == "Y"
    return (category, combining, bidirectional, mirrored)


def _case_delta(field, char):
    # Return a hex case-mapping field as a 16-bit offset from char (0 if
    # the field is empty).
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _type_item(char, record):
    # Return (flags, upper, lower, title, decimal, digit) for a record.
    category = record[2]
    bidirectional = record[4]
    flags = 0
    if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
        flags |= ALPHA_MASK
    if category == "Ll":
        flags |= LOWER_MASK
    if category == "Zl" or bidirectional == "B":
        flags |= LINEBREAK_MASK
    if category == "Zs" or bidirectional in ("WS", "B", "S"):
        flags |= SPACE_MASK
    if category == "Lt":
        flags |= TITLE_MASK
    if category == "Lu":
        flags |= UPPER_MASK
    # use delta predictor for upper/lower/title
    upper = _case_delta(record[12], char)
    lower = _case_delta(record[13], char)
    title = _case_delta(record[14], char)
    # decimal digit, integer digit
    decimal = 0
    if record[6]:
        flags |= DECIMAL_MASK
        decimal = int(record[6])
    digit = 0
    if record[7]:
        flags |= DIGIT_MASK
        digit = int(record[7])
    return (flags, upper, lower, title, decimal, digit)


def _decomposition_tables(db):
    # Return (decomp_data, decomp_index): the distinct decomposition
    # strings in first-seen order and, per character, the position of its
    # string.  Slot 0 is the empty string (no decomposition).
    decomp_data = [""]
    # dictionary lookup instead of a list search for every character
    positions = {"": 0}
    decomp_index = [0] * len(db.chars)
    for char in db.chars:
        record = db.table[char]
        if record:
            decomp = record[5]
            i = positions.get(decomp)
            if i is None:
                positions[decomp] = i = len(decomp_data)
                decomp_data.append(decomp)
            decomp_index[char] = i
    return decomp_data, decomp_index


def _write_string_table(fp, declaration, names):
    # Write names as a NULL-terminated C array of string literals.
    fp.write(declaration + "\n")
    for name in names:
        fp.write(" \"%s\",\n" % name)
    fp.write(" NULL\n")
    fp.write("};\n")


def _write_unicodedata_db(filename, table, index, decomp_data, decomp_index):
    # Write the property database header file.
    # split both index tables before the output file is touched
    index1, index2, shift = splitbins(index)
    decomp_index1, decomp_index2, decomp_shift = splitbins(decomp_index)

    fp = open(filename, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique database records */\n")
        fp.write("const _PyUnicode_DatabaseRecord "
                 "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        _write_string_table(
            fp, "const char *_PyUnicode_CategoryNames[] = {",
            CATEGORY_NAMES)
        _write_string_table(
            fp, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _write_string_table(
            fp, "static const char *decomp_data[] = {", decomp_data)

        fp.write("/* index tables for the database records */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)

        fp.write("/* index tables for the decomposition data */\n")
        fp.write("#define DECOMP_SHIFT %d\n" % decomp_shift)
        Array("decomp_index1", decomp_index1).dump(fp)
        Array("decomp_index2", decomp_index2).dump(fp)
    finally:
        fp.close()


def _write_unicodetype_db(filename, table, index):
    # Write the character type header file.
    # split the type index table before the output file is touched
    index1, index2, shift = splitbins(index)

    fp = open(filename, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique character type descriptors */\n")
        fp.write("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        fp.write("/* type indexes */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()
229
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000230# --------------------------------------------------------------------
231# the following support code is taken from the unidb utilities
232# Copyright (c) 1999-2000 by Secret Labs AB
233
234# load a unicode-data file from disk
235
236import string, sys
237
class UnicodeData:
    """In-memory copy of a semicolon-separated unicode database file.

    Public attributes:

    filename -- the path passed to the constructor
    table -- list of 65536 entries indexed by code point; an entry is the
        list of fields from that character's line, or None if the file
        has no line for it
    chars -- the code points to process (all 65536 unless uselatin1()
        has been called)
    """

    def __init__(self, filename):
        """Load filename into the table.

        Blank lines are skipped.  A first field that is not a hex number
        raises ValueError; a code point of 65536 or above raises
        IndexError.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines instead of failing on int("")
                    continue
                fields = line.split(";")
                # the first field is the code point, in hex
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
259
260# stuff to deal with arrays of unsigned integers
261
class Array:
    """A named sequence of unsigned integers that can be written out as a
    static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a C array definition.

        The element type is the narrowest of unsigned char, unsigned
        short and unsigned int that getsize() reports for the data.
        Values are written comma-separated, and a new row is started
        whenever adding a value would make the row longer than 78
        characters.
        """
        ctypes = {1: "unsigned char", 2: "unsigned short"}
        ctype = ctypes.get(getsize(self.data), "unsigned int")
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            row = " "
            for value in self.data:
                cell = str(value) + ", "
                if len(row) + len(cell) <= 78:
                    row = row + cell
                else:
                    # row is full: flush it and start the next one
                    file.write(row + "\n")
                    row = " " + cell
            # flush the last, partially filled row
            if row.strip():
                file.write(row + "\n")
        file.write("};\n\n")
292
def getsize(data):
    """Return the size in bytes (1, 2 or 4) of the smallest integer type
    that can hold the largest value in data.

    data must be non-empty; max() raises ValueError otherwise.
    """
    largest = max(data)
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4
302
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys
    report = "%d+%d bins at shift %d; %d bytes\n"
    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t)*getsize(t)))

    # largest shift that still leaves something of the last valid index
    maxshift = 0
    last = len(t) - 1
    while last > 1:
        last >>= 1
        maxshift += 1

    t = tuple(t)        # so slices can be dict keys
    smallest = None     # smallest total size so far
    for shift in range(maxshift + 1):
        size = 1 << shift
        t1 = []
        t2 = []
        seen = {}       # bin contents -> offset of that bin in t2
        for start in range(0, len(t), size):
            chunk = t[start:start+size]
            try:
                offset = seen[chunk]
            except KeyError:
                offset = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            sys.stderr.write(report % (len(t1), len(t2), shift, total))
        if smallest is None or total < smallest:
            best = t1, t2, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: " + report
                         % (len(t1), len(t2), shift, smallest))
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000363
# regenerate both header files when this file is run as a script
if __name__ == "__main__":
    maketables()